From 6d3e0d8cc63221dec670d0ee92ac57961581e975 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:01 +0206
Subject: kdb: Do not assume write() callback available

It is allowed for consoles to not provide a write() callback. For
example ttynull does this.

Check if a write() callback is available before using it.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-2-john.ogness@linutronix.de
---
 kernel/debug/kdb/kdb_io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 5c7e9ba7cd6b..e9139dfc1f0a 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -576,6 +576,8 @@ static void kdb_msg_write(const char *msg, int msg_len)
 			continue;
 		if (c == dbg_io_ops->cons)
 			continue;
+		if (!c->write)
+			continue;
 		/*
 		 * Set oops_in_progress to encourage the console drivers to
 		 * disregard their internal spin locks: in the current calling
-- 
cgit v1.2.3


From 7b23a66db55ed0a55b020e913f0d6f6d52a1ad2c Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:02 +0206
Subject: printk: Reduce console_unblank() usage in unsafe scenarios

A semaphore is not NMI-safe, even when using down_trylock(). Both
down_trylock() and up() are using internal spinlocks and up()
might even call wake_up_process().

In the panic() code path it gets even worse because the internal
spinlocks of the semaphore may have been taken by a CPU that has
been stopped.

To reduce the risk of deadlocks caused by the console semaphore in
the panic path, make the following changes:

- First check if any consoles have implemented the unblank()
  callback. If not, then there is no reason to take the console
  semaphore anyway. (This check is also useful for the non-panic
  path since the locking/unlocking of the console lock can be
  quite expensive due to console printing.)

- If the panic path is in NMI context, bail out without attempting
  to take the console semaphore or calling any unblank() callbacks.
  Bailing out is acceptable because console_unblank() would already
  bail out if the console semaphore is contended. The alternative of
  ignoring the console semaphore and calling the unblank() callbacks
  anyway is a bad idea because these callbacks are also not NMI-safe.

If consoles with unblank() callbacks exist and console_unblank() is
called from a non-NMI panic context, it will still attempt a
down_trylock(). This could still result in a deadlock if one of the
stopped CPUs is holding the semaphore internal spinlock. But this
is a risk that the kernel has been (and continues to be) willing
to take.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-3-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9644f6e5bf15..7aa9dbee12e8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3043,9 +3043,27 @@ EXPORT_SYMBOL(console_conditional_schedule);
 
 void console_unblank(void)
 {
+	bool found_unblank = false;
 	struct console *c;
 	int cookie;
 
+	/*
+	 * First check if there are any consoles implementing the unblank()
+	 * callback. If not, there is no reason to continue and take the
+	 * console lock, which in particular can be dangerous if
+	 * @oops_in_progress is set.
+	 */
+	cookie = console_srcu_read_lock();
+	for_each_console_srcu(c) {
+		if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+			found_unblank = true;
+			break;
+		}
+	}
+	console_srcu_read_unlock(cookie);
+	if (!found_unblank)
+		return;
+
 	/*
 	 * Stop console printing because the unblank() callback may
 	 * assume the console is not within its write() callback.
@@ -3054,6 +3072,16 @@ void console_unblank(void)
 	 * In that case, attempt a trylock as best-effort.
 	 */
 	if (oops_in_progress) {
+		/* Semaphores are not NMI-safe. */
+		if (in_nmi())
+			return;
+
+		/*
+		 * Attempting to trylock the console lock can deadlock
+		 * if another CPU was stopped while modifying the
+		 * semaphore. "Hope and pray" that this is not the
+		 * current situation.
+		 */
 		if (down_trylock_console_sem() != 0)
 			return;
 	} else
-- 
cgit v1.2.3


From 51a1d258e50e03a0216bf42b6af9ff34ec402ac1 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:03 +0206
Subject: printk: Keep non-panic-CPUs out of console lock

When in a panic situation, non-panic CPUs should avoid holding the
console lock so as not to contend with the panic CPU. This is already
implemented with abandon_console_lock_in_panic(), which is checked
after each printed line. However, non-panic CPUs should also avoid
trying to acquire the console lock during a panic.

Modify console_trylock() to fail and console_lock() to block() when
called from a non-panic CPU during a panic.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-4-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7aa9dbee12e8..7219991885e6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2583,6 +2583,25 @@ static int console_cpu_notify(unsigned int cpu)
 	return 0;
 }
 
+/*
+ * Return true when this CPU should unlock console_sem without pushing all
+ * messages to the console. This reduces the chance that the console is
+ * locked when the panic CPU tries to use it.
+ */
+static bool abandon_console_lock_in_panic(void)
+{
+	if (!panic_in_progress())
+		return false;
+
+	/*
+	 * We can use raw_smp_processor_id() here because it is impossible for
+	 * the task to be migrated to the panic_cpu, or away from it. If
+	 * panic_cpu has already been set, and we're not currently executing on
+	 * that CPU, then we never will be.
+	 */
+	return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
 /**
  * console_lock - block the console subsystem from printing
  *
@@ -2595,6 +2614,10 @@ void console_lock(void)
 {
 	might_sleep();
 
+	/* On panic, the console_lock must be left to the panic cpu. */
+	while (abandon_console_lock_in_panic())
+		msleep(1000);
+
 	down_console_sem();
 	if (console_suspended)
 		return;
@@ -2613,6 +2636,9 @@ EXPORT_SYMBOL(console_lock);
  */
 int console_trylock(void)
 {
+	/* On panic, the console_lock must be left to the panic cpu. */
+	if (abandon_console_lock_in_panic())
+		return 0;
 	if (down_trylock_console_sem())
 		return 0;
 	if (console_suspended) {
@@ -2631,25 +2657,6 @@ int is_console_locked(void)
 }
 EXPORT_SYMBOL(is_console_locked);
 
-/*
- * Return true when this CPU should unlock console_sem without pushing all
- * messages to the console. This reduces the chance that the console is
- * locked when the panic CPU tries to use it.
- */
-static bool abandon_console_lock_in_panic(void)
-{
-	if (!panic_in_progress())
-		return false;
-
-	/*
-	 * We can use raw_smp_processor_id() here because it is impossible for
-	 * the task to be migrated to the panic_cpu, or away from it. If
-	 * panic_cpu has already been set, and we're not currently executing on
-	 * that CPU, then we never will be.
-	 */
-	return atomic_read(&panic_cpu) != raw_smp_processor_id();
-}
-
 /*
  * Check if the given console is currently capable and allowed to print
  * records.
-- 
cgit v1.2.3


From eacb04ff3c5b8662a65f380ae450250698448cff Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:04 +0206
Subject: printk: Do not take console lock for console_flush_on_panic()

Currently console_flush_on_panic() will attempt to acquire the
console lock when flushing the buffer on panic. If it fails to
acquire the lock, it continues anyway because this is the last
chance to get any pending records printed.

The reason why the console lock was attempted at all was to
prevent any other CPUs from acquiring the console lock for
printing while the panic CPU was printing. But as of the
previous commit, non-panic CPUs will no longer attempt to
acquire the console lock in a panic situation. Therefore it is
no longer strictly necessary for a panic CPU to acquire the
console lock.

Avoiding taking the console lock when flushing in panic has
the additional benefit of avoiding possible deadlocks due to
semaphore usage in NMI context (semaphores are not NMI-safe)
and avoiding possible deadlocks if another CPU accesses the
semaphore and is stopped while holding one of the semaphore's
internal spinlocks.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-5-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7219991885e6..51445e8ea730 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3118,14 +3118,24 @@ void console_unblank(void)
  */
 void console_flush_on_panic(enum con_flush_mode mode)
 {
+	bool handover;
+	u64 next_seq;
+
 	/*
-	 * If someone else is holding the console lock, trylock will fail
-	 * and may_schedule may be set.  Ignore and proceed to unlock so
-	 * that messages are flushed out.  As this can be called from any
-	 * context and we don't want to get preempted while flushing,
-	 * ensure may_schedule is cleared.
+	 * Ignore the console lock and flush out the messages. Attempting a
+	 * trylock would not be useful because:
+	 *
+	 *   - if it is contended, it must be ignored anyway
+	 *   - console_lock() and console_trylock() block and fail
+	 *     respectively in panic for non-panic CPUs
+	 *   - semaphores are not NMI-safe
+	 */
+
+	/*
+	 * If another context is holding the console lock,
+	 * @console_may_schedule might be set. Clear it so that
+	 * this context does not call cond_resched() while flushing.
 	 */
-	console_trylock();
 	console_may_schedule = 0;
 
 	if (mode == CONSOLE_REPLAY_ALL) {
@@ -3138,15 +3148,15 @@ void console_flush_on_panic(enum con_flush_mode mode)
 		cookie = console_srcu_read_lock();
 		for_each_console_srcu(c) {
 			/*
-			 * If the above console_trylock() failed, this is an
-			 * unsynchronized assignment. But in that case, the
+			 * This is an unsynchronized assignment, but the
 			 * kernel is in "hope and pray" mode anyway.
 			 */
 			c->seq = seq;
 		}
 		console_srcu_read_unlock(cookie);
 	}
-	console_unlock();
+
+	console_flush_all(false, &next_seq, &handover);
 }
 
 /*
-- 
cgit v1.2.3


From 696ffaf50e1f8dbc66223ff614473f945f5fb8d8 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:05 +0206
Subject: printk: Consolidate console deferred printing

Printing to consoles can be deferred for several reasons:

- explicitly with printk_deferred()
- printk() in NMI context
- recursive printk() calls

The current implementation is not consistent. For printk_deferred(),
irq work is scheduled twice. For NMI und recursive, panic CPU
suppression and caller delays are not properly enforced.

Correct these inconsistencies by consolidating the deferred printing
code so that vprintk_deferred() is the top-level function for
deferred printing and vprintk_emit() will perform whichever irq_work
queueing is appropriate.

Also add kerneldoc for wake_up_klogd() and defer_console_output() to
clarify their differences and appropriate usage.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-6-john.ogness@linutronix.de
---
 kernel/printk/printk.c      | 35 ++++++++++++++++++++++++++++-------
 kernel/printk/printk_safe.c |  9 ++-------
 2 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 51445e8ea730..6e853a1441a7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2306,7 +2306,11 @@ asmlinkage int vprintk_emit(int facility, int level,
 		preempt_enable();
 	}
 
-	wake_up_klogd();
+	if (in_sched)
+		defer_console_output();
+	else
+		wake_up_klogd();
+
 	return printed_len;
 }
 EXPORT_SYMBOL(vprintk_emit);
@@ -3841,11 +3845,33 @@ static void __wake_up_klogd(int val)
 	preempt_enable();
 }
 
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
 void wake_up_klogd(void)
 {
 	__wake_up_klogd(PRINTK_PENDING_WAKEUP);
 }
 
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ *	console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer,
+ * this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
 void defer_console_output(void)
 {
 	/*
@@ -3862,12 +3888,7 @@ void printk_trigger_flush(void)
 
 int vprintk_deferred(const char *fmt, va_list args)
 {
-	int r;
-
-	r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
-	defer_console_output();
-
-	return r;
+	return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
 }
 
 int _printk_deferred(const char *fmt, ...)
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index ef0f9a2044da..6d10927a07d8 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	 * Use the main logbuf even in NMI. But avoid calling console
 	 * drivers that might have their own locks.
 	 */
-	if (this_cpu_read(printk_context) || in_nmi()) {
-		int len;
-
-		len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
-		defer_console_output();
-		return len;
-	}
+	if (this_cpu_read(printk_context) || in_nmi())
+		return vprintk_deferred(fmt, args);
 
 	/* No obstacles. */
 	return vprintk_default(fmt, args);
-- 
cgit v1.2.3


From 9e70a5e109a4a23367810de09be826c52d27ee2f Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:06 +0206
Subject: printk: Add per-console suspended state

Currently the global @console_suspended is used to determine if
consoles are in a suspended state. Its primary purpose is to allow
usage of the console_lock when suspended without causing console
printing. It is synchronized by the console_lock.

Rather than relying on the console_lock to determine suspended
state, make it an official per-console state that is set within
console->flags. This allows the state to be queried via SRCU.

Remove @console_suspended. Console printing will still be avoided
when suspended because console_is_usable() returns false when
the new suspended flag is set for that console.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-7-john.ogness@linutronix.de
---
 include/linux/console.h |  3 ++
 kernel/printk/printk.c  | 74 +++++++++++++++++++++++++++++--------------------
 2 files changed, 47 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index d3195664baa5..7de11c763eb3 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -154,6 +154,8 @@ static inline int con_debug_leave(void)
  *			receiving the printk spam for obvious reasons.
  * @CON_EXTENDED:	The console supports the extended output format of
  *			/dev/kmesg which requires a larger output buffer.
+ * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
+ *			printing callbacks must not be called.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -163,6 +165,7 @@ enum cons_flags {
 	CON_ANYTIME		= BIT(4),
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
+	CON_SUSPENDED		= BIT(7),
 };
 
 /**
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 6e853a1441a7..efe577477913 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(oops_in_progress);
 static DEFINE_MUTEX(console_mutex);
 
 /*
- * console_sem protects updates to console->seq and console_suspended,
+ * console_sem protects updates to console->seq
  * and also provides serialization for console printing.
  */
 static DEFINE_SEMAPHORE(console_sem);
@@ -359,7 +359,7 @@ static bool panic_in_progress(void)
  * paths in the console code where we end up in places I want
  * locked without the console semaphore held).
  */
-static int console_locked, console_suspended;
+static int console_locked;
 
 /*
  *	Array of consoles built from command line options (console=)
@@ -2549,22 +2549,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig
  */
 void suspend_console(void)
 {
+	struct console *con;
+
 	if (!console_suspend_enabled)
 		return;
 	pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
 	pr_flush(1000, true);
-	console_lock();
-	console_suspended = 1;
-	up_console_sem();
+
+	console_list_lock();
+	for_each_console(con)
+		console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. All printing
+	 * contexts must be able to see that they are suspended so that it
+	 * is guaranteed that all printing has stopped when this function
+	 * completes.
+	 */
+	synchronize_srcu(&console_srcu);
 }
 
 void resume_console(void)
 {
+	struct console *con;
+
 	if (!console_suspend_enabled)
 		return;
-	down_console_sem();
-	console_suspended = 0;
-	console_unlock();
+
+	console_list_lock();
+	for_each_console(con)
+		console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+	console_list_unlock();
+
+	/*
+	 * Ensure that all SRCU list walks have completed. All printing
+	 * contexts must be able to see they are no longer suspended so
+	 * that they are guaranteed to wake up and resume printing.
+	 */
+	synchronize_srcu(&console_srcu);
+
 	pr_flush(1000, true);
 }
 
@@ -2623,8 +2647,6 @@ void console_lock(void)
 		msleep(1000);
 
 	down_console_sem();
-	if (console_suspended)
-		return;
 	console_locked = 1;
 	console_may_schedule = 1;
 }
@@ -2645,10 +2667,6 @@ int console_trylock(void)
 		return 0;
 	if (down_trylock_console_sem())
 		return 0;
-	if (console_suspended) {
-		up_console_sem();
-		return 0;
-	}
 	console_locked = 1;
 	console_may_schedule = 0;
 	return 1;
@@ -2674,6 +2692,9 @@ static inline bool console_is_usable(struct console *con)
 	if (!(flags & CON_ENABLED))
 		return false;
 
+	if ((flags & CON_SUSPENDED))
+		return false;
+
 	if (!con->write)
 		return false;
 
@@ -2992,11 +3013,6 @@ void console_unlock(void)
 	bool flushed;
 	u64 next_seq;
 
-	if (console_suspended) {
-		up_console_sem();
-		return;
-	}
-
 	/*
 	 * Console drivers are called with interrupts disabled, so
 	 * @console_may_schedule should be cleared before; however, we may
@@ -3726,8 +3742,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 
 		/*
 		 * Hold the console_lock to guarantee safe access to
-		 * console->seq and to prevent changes to @console_suspended
-		 * until all consoles have been processed.
+		 * console->seq.
 		 */
 		console_lock();
 
@@ -3735,6 +3750,11 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		for_each_console_srcu(c) {
 			if (con && con != c)
 				continue;
+			/*
+			 * If consoles are not usable, it cannot be expected
+			 * that they make forward progress, so only increment
+			 * @diff for usable consoles.
+			 */
 			if (!console_is_usable(c))
 				continue;
 			printk_seq = c->seq;
@@ -3743,18 +3763,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		}
 		console_srcu_read_unlock(cookie);
 
-		/*
-		 * If consoles are suspended, it cannot be expected that they
-		 * make forward progress, so timeout immediately. @diff is
-		 * still used to return a valid flush status.
-		 */
-		if (console_suspended)
-			remaining = 0;
-		else if (diff != last_diff && reset_on_progress)
+		if (diff != last_diff && reset_on_progress)
 			remaining = timeout_ms;
 
 		console_unlock();
 
+		/* Note: @diff is 0 if there are no usable consoles. */
 		if (diff == 0 || remaining == 0)
 			break;
 
@@ -3788,7 +3802,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
  * printer has been seen to make some forward progress.
  *
  * Context: Process context. May sleep while acquiring console lock.
- * Return: true if all enabled printers are caught up.
+ * Return: true if all usable printers are caught up.
  */
 static bool pr_flush(int timeout_ms, bool reset_on_progress)
 {
-- 
cgit v1.2.3


From 132a90d1527fedba2d95085c951ccf00dbbebe41 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 17 Jul 2023 21:52:07 +0206
Subject: printk: Rename abandon_console_lock_in_panic() to
 other_cpu_in_panic()

Currently abandon_console_lock_in_panic() is only used to determine if
the current CPU should immediately release the console lock because
another CPU is in panic. However, later this function will be used by
the CPU to immediately release other resources in this situation.

Rename the function to other_cpu_in_panic(), which is a better
description and does not assume it is related to the console lock.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230717194607.145135-8-john.ogness@linutronix.de
---
 kernel/printk/internal.h |  2 ++
 kernel/printk/printk.c   | 15 ++++++++-------
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2a17704136f1..7d4979d5c3ce 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -103,3 +103,5 @@ struct printk_message {
 	u64			seq;
 	unsigned long		dropped;
 };
+
+bool other_cpu_in_panic(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index efe577477913..8787d3a72114 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2612,11 +2612,12 @@ static int console_cpu_notify(unsigned int cpu)
 }
 
 /*
- * Return true when this CPU should unlock console_sem without pushing all
- * messages to the console. This reduces the chance that the console is
- * locked when the panic CPU tries to use it.
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
  */
-static bool abandon_console_lock_in_panic(void)
+bool other_cpu_in_panic(void)
 {
 	if (!panic_in_progress())
 		return false;
@@ -2643,7 +2644,7 @@ void console_lock(void)
 	might_sleep();
 
 	/* On panic, the console_lock must be left to the panic cpu. */
-	while (abandon_console_lock_in_panic())
+	while (other_cpu_in_panic())
 		msleep(1000);
 
 	down_console_sem();
@@ -2663,7 +2664,7 @@ EXPORT_SYMBOL(console_lock);
 int console_trylock(void)
 {
 	/* On panic, the console_lock must be left to the panic cpu. */
-	if (abandon_console_lock_in_panic())
+	if (other_cpu_in_panic())
 		return 0;
 	if (down_trylock_console_sem())
 		return 0;
@@ -2978,7 +2979,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 			any_progress = true;
 
 			/* Allow panic_cpu to take over the consoles safely. */
-			if (abandon_console_lock_in_panic())
+			if (other_cpu_in_panic())
 				goto abandon;
 
 			if (do_cond_resched)
-- 
cgit v1.2.3


From 60466c067927abbcaff299845abd4b7069963139 Mon Sep 17 00:00:00 2001
From: Benjamin Bara <benjamin.bara@skidata.com>
Date: Sat, 15 Jul 2023 09:53:23 +0200
Subject: kernel/reboot: emergency_restart: Set correct system_state

As the emergency restart does not call kernel_restart_prepare(), the
system_state stays in SYSTEM_RUNNING.

Since bae1d3a05a8b, this hinders i2c_in_atomic_xfer_mode() from becoming
active, and therefore might lead to avoidable warnings in the restart
handlers, e.g.:

[   12.667612] WARNING: CPU: 1 PID: 1 at kernel/rcu/tree_plugin.h:318 rcu_note_context_switch+0x33c/0x6b0
[   12.676926] Voluntary context switch within RCU read-side critical section!
...
[   12.742376]  schedule_timeout from wait_for_completion_timeout+0x90/0x114
[   12.749179]  wait_for_completion_timeout from tegra_i2c_wait_completion+0x40/0x70
...
[   12.994527]  atomic_notifier_call_chain from machine_restart+0x34/0x58
[   13.001050]  machine_restart from panic+0x2a8/0x32c

Avoid these by setting the correct system_state.

Fixes: bae1d3a05a8b ("i2c: core: remove use of in_atomic()")
Cc: stable@vger.kernel.org # v5.2+
Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Benjamin Bara <benjamin.bara@skidata.com>
Link: https://lore.kernel.org/r/20230327-tegra-pmic-reboot-v7-1-18699d5dcd76@skidata.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 kernel/reboot.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3bba88c7ffc6..6ebef11c8876 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -74,6 +74,7 @@ void __weak (*pm_power_off)(void);
 void emergency_restart(void)
 {
 	kmsg_dump(KMSG_DUMP_EMERG);
+	system_state = SYSTEM_RESTART;
 	machine_emergency_restart();
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
-- 
cgit v1.2.3


From db2d6038c5e795cab4f0a8d3e86b4f7e33338629 Mon Sep 17 00:00:00 2001
From: Benjamin Bara <benjamin.bara@skidata.com>
Date: Sat, 15 Jul 2023 09:53:25 +0200
Subject: kernel/reboot: Add device to sys_off_handler

If the dev is known (e.g. a devm-based sys_off_handler is used), it can
be passed to the handler's callback to have it available there.
Otherwise, cb_data might be set to the dev in most of the cases.

Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Benjamin Bara <benjamin.bara@skidata.com>
Link: https://lore.kernel.org/r/20230327-tegra-pmic-reboot-v7-3-18699d5dcd76@skidata.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/reboot.h | 3 +++
 kernel/reboot.c        | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 2b6bb593be5b..c4cc3b89ced1 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -129,11 +129,14 @@ enum sys_off_mode {
  * @cb_data: User's callback data.
  * @cmd: Command string. Currently used only by the sys-off restart mode,
  *       NULL otherwise.
+ * @dev: Device of the sys-off handler. Only if known (devm_register_*),
+ *       NULL otherwise.
  */
 struct sys_off_data {
 	int mode;
 	void *cb_data;
 	const char *cmd;
+	struct device *dev;
 };
 
 struct sys_off_handler *
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 6ebef11c8876..395a0ea3c7a8 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -55,6 +55,7 @@ struct sys_off_handler {
 	enum sys_off_mode mode;
 	bool blocking;
 	void *list;
+	struct device *dev;
 };
 
 /*
@@ -324,6 +325,7 @@ static int sys_off_notify(struct notifier_block *nb,
 	data.cb_data = handler->cb_data;
 	data.mode = mode;
 	data.cmd = cmd;
+	data.dev = handler->dev;
 
 	return handler->sys_off_cb(&data);
 }
@@ -511,6 +513,7 @@ int devm_register_sys_off_handler(struct device *dev,
 	handler = register_sys_off_handler(mode, priority, callback, cb_data);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
+	handler->dev = dev;
 
 	return devm_add_action_or_reset(dev, devm_unregister_sys_off_handler,
 					handler);
-- 
cgit v1.2.3


From 53e9e33ede37a247d926db5e4a9e56b55204e66c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 10 Aug 2023 22:45:32 -0700
Subject: printk: ringbuffer: Fix truncating buffer size min_t cast

If an output buffer size exceeded U16_MAX, the min_t(u16, ...) cast in
copy_data() was causing writes to truncate. This manifested as output
bytes being skipped, seen as %NUL bytes in pstore dumps when the available
record size was larger than 65536. Fix the cast to no longer truncate
the calculation.

Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: John Ogness <john.ogness@linutronix.de>
Reported-by: Vijay Balakrishna <vijayb@linux.microsoft.com>
Link: https://lore.kernel.org/lkml/d8bb1ec7-a4c5-43a2-9de0-9643a70b899f@linux.microsoft.com/
Fixes: b6cf8b3f3312 ("printk: add lockless ringbuffer")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Vijay Balakrishna <vijayb@linux.microsoft.com>
Tested-by: Guilherme G. Piccoli <gpiccoli@igalia.com> # Steam Deck
Reviewed-by: Tyler Hicks (Microsoft) <code@tyhicks.com>
Tested-by: Tyler Hicks (Microsoft) <code@tyhicks.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230811054528.never.165-kees@kernel.org
---
 kernel/printk/printk_ringbuffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 2dc4d5a1f1ff..fde338606ce8 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring,
 	if (!buf || !buf_size)
 		return true;
 
-	data_size = min_t(u16, buf_size, len);
+	data_size = min_t(unsigned int, buf_size, len);
 
 	memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
 	return true;
-- 
cgit v1.2.3


From 3e00123a13d824d63072b1824c9da59cd78356d9 Mon Sep 17 00:00:00 2001
From: Enlin Mu <enlin.mu@unisoc.com>
Date: Tue, 15 Aug 2023 10:07:11 +0800
Subject: printk: export symbols for debug modules

the module is out-of-tree, it saves kernel logs when panic

Signed-off-by: Enlin Mu <enlin.mu@unisoc.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230815020711.2604939-1-yunlong.xing@unisoc.com
---
 kernel/printk/printk.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9644f6e5bf15..accfd326e7a1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -536,12 +536,14 @@ char *log_buf_addr_get(void)
 {
 	return log_buf;
 }
+EXPORT_SYMBOL_GPL(log_buf_addr_get);
 
 /* Return log buffer size */
 u32 log_buf_len_get(void)
 {
 	return log_buf_len;
 }
+EXPORT_SYMBOL_GPL(log_buf_len_get);
 
 /*
  * Define how much of the log buffer we could take at maximum. The value
-- 
cgit v1.2.3


From fb5a4315591dae307a65fc246ca80b5159d296e1 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <senozhatsky@chromium.org>
Date: Wed, 16 Aug 2023 11:32:21 +0900
Subject: dma-debug: don't call __dma_entry_alloc_check_leak() under
 free_entries_lock

__dma_entry_alloc_check_leak() calls into printk -> serial console
output (qcom geni) and grabs port->lock under free_entries_lock
spin lock, which is a reverse locking dependency chain as qcom_geni
IRQ handler can call into dma-debug code and grab free_entries_lock
under port->lock.

Move __dma_entry_alloc_check_leak() call out of free_entries_lock
scope so that we don't acquire serial console's port->lock under it.

Trimmed-down lockdep splat:

 The existing dependency chain (in reverse order) is:

               -> #2 (free_entries_lock){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x80
        dma_entry_alloc+0x38/0x110
        debug_dma_map_page+0x60/0xf8
        dma_map_page_attrs+0x1e0/0x230
        dma_map_single_attrs.constprop.0+0x6c/0xc8
        geni_se_rx_dma_prep+0x40/0xcc
        qcom_geni_serial_isr+0x310/0x510
        __handle_irq_event_percpu+0x110/0x244
        handle_irq_event_percpu+0x20/0x54
        handle_irq_event+0x50/0x88
        handle_fasteoi_irq+0xa4/0xcc
        handle_irq_desc+0x28/0x40
        generic_handle_domain_irq+0x24/0x30
        gic_handle_irq+0xc4/0x148
        do_interrupt_handler+0xa4/0xb0
        el1_interrupt+0x34/0x64
        el1h_64_irq_handler+0x18/0x24
        el1h_64_irq+0x64/0x68
        arch_local_irq_enable+0x4/0x8
        ____do_softirq+0x18/0x24
        ...

               -> #1 (&port_lock_key){-.-.}-{2:2}:
        _raw_spin_lock_irqsave+0x60/0x80
        qcom_geni_serial_console_write+0x184/0x1dc
        console_flush_all+0x344/0x454
        console_unlock+0x94/0xf0
        vprintk_emit+0x238/0x24c
        vprintk_default+0x3c/0x48
        vprintk+0xb4/0xbc
        _printk+0x68/0x90
        register_console+0x230/0x38c
        uart_add_one_port+0x338/0x494
        qcom_geni_serial_probe+0x390/0x424
        platform_probe+0x70/0xc0
        really_probe+0x148/0x280
        __driver_probe_device+0xfc/0x114
        driver_probe_device+0x44/0x100
        __device_attach_driver+0x64/0xdc
        bus_for_each_drv+0xb0/0xd8
        __device_attach+0xe4/0x140
        device_initial_probe+0x1c/0x28
        bus_probe_device+0x44/0xb0
        device_add+0x538/0x668
        of_device_add+0x44/0x50
        of_platform_device_create_pdata+0x94/0xc8
        of_platform_bus_create+0x270/0x304
        of_platform_populate+0xac/0xc4
        devm_of_platform_populate+0x60/0xac
        geni_se_probe+0x154/0x160
        platform_probe+0x70/0xc0
        ...

               -> #0 (console_owner){-...}-{0:0}:
        __lock_acquire+0xdf8/0x109c
        lock_acquire+0x234/0x284
        console_flush_all+0x330/0x454
        console_unlock+0x94/0xf0
        vprintk_emit+0x238/0x24c
        vprintk_default+0x3c/0x48
        vprintk+0xb4/0xbc
        _printk+0x68/0x90
        dma_entry_alloc+0xb4/0x110
        debug_dma_map_sg+0xdc/0x2f8
        __dma_map_sg_attrs+0xac/0xe4
        dma_map_sgtable+0x30/0x4c
        get_pages+0x1d4/0x1e4 [msm]
        msm_gem_pin_pages_locked+0x38/0xac [msm]
        msm_gem_pin_vma_locked+0x58/0x88 [msm]
        msm_ioctl_gem_submit+0xde4/0x13ac [msm]
        drm_ioctl_kernel+0xe0/0x15c
        drm_ioctl+0x2e8/0x3f4
        vfs_ioctl+0x30/0x50
        ...

 Chain exists of:
   console_owner --> &port_lock_key --> free_entries_lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(free_entries_lock);
                                lock(&port_lock_key);
                                lock(free_entries_lock);
   lock(console_owner);

                *** DEADLOCK ***

 Call trace:
  dump_backtrace+0xb4/0xf0
  show_stack+0x20/0x30
  dump_stack_lvl+0x60/0x84
  dump_stack+0x18/0x24
  print_circular_bug+0x1cc/0x234
  check_noncircular+0x78/0xac
  __lock_acquire+0xdf8/0x109c
  lock_acquire+0x234/0x284
  console_flush_all+0x330/0x454
  console_unlock+0x94/0xf0
  vprintk_emit+0x238/0x24c
  vprintk_default+0x3c/0x48
  vprintk+0xb4/0xbc
  _printk+0x68/0x90
  dma_entry_alloc+0xb4/0x110
  debug_dma_map_sg+0xdc/0x2f8
  __dma_map_sg_attrs+0xac/0xe4
  dma_map_sgtable+0x30/0x4c
  get_pages+0x1d4/0x1e4 [msm]
  msm_gem_pin_pages_locked+0x38/0xac [msm]
  msm_gem_pin_vma_locked+0x58/0x88 [msm]
  msm_ioctl_gem_submit+0xde4/0x13ac [msm]
  drm_ioctl_kernel+0xe0/0x15c
  drm_ioctl+0x2e8/0x3f4
  vfs_ioctl+0x30/0x50
  ...

Reported-by: Rob Clark <robdclark@chromium.org>
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Acked-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f190651bcadd..06366acd27b0 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -637,15 +637,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void)
 	return entry;
 }
 
-static void __dma_entry_alloc_check_leak(void)
+/*
+ * This should be called outside of free_entries_lock scope to avoid potential
+ * deadlocks with serial consoles that use DMA.
+ */
+static void __dma_entry_alloc_check_leak(u32 nr_entries)
 {
-	u32 tmp = nr_total_entries % nr_prealloc_entries;
+	u32 tmp = nr_entries % nr_prealloc_entries;
 
 	/* Shout each time we tick over some multiple of the initial pool */
 	if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) {
 		pr_info("dma_debug_entry pool grown to %u (%u00%%)\n",
-			nr_total_entries,
-			(nr_total_entries / nr_prealloc_entries));
+			nr_entries,
+			(nr_entries / nr_prealloc_entries));
 	}
 }
 
@@ -656,8 +660,10 @@ static void __dma_entry_alloc_check_leak(void)
  */
 static struct dma_debug_entry *dma_entry_alloc(void)
 {
+	bool alloc_check_leak = false;
 	struct dma_debug_entry *entry;
 	unsigned long flags;
+	u32 nr_entries;
 
 	spin_lock_irqsave(&free_entries_lock, flags);
 	if (num_free_entries == 0) {
@@ -667,13 +673,17 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 			pr_err("debugging out of memory - disabling\n");
 			return NULL;
 		}
-		__dma_entry_alloc_check_leak();
+		alloc_check_leak = true;
+		nr_entries = nr_total_entries;
 	}
 
 	entry = __dma_entry_alloc();
 
 	spin_unlock_irqrestore(&free_entries_lock, flags);
 
+	if (alloc_check_leak)
+		__dma_entry_alloc_check_leak(nr_entries);
+
 #ifdef CONFIG_STACKTRACE
 	entry->stack_len = stack_trace_save(entry->stack_entries,
 					    ARRAY_SIZE(entry->stack_entries),
-- 
cgit v1.2.3


From 2dcdf8c18d5c1835571bfa40f40ac134c8a1f0f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 30 Aug 2023 11:25:25 +0200
Subject: dma-contiguous: fix the Kconfig entry for CONFIG_DMA_NUMA_CMA

It makes no sense to expose CONFIG_DMA_NUMA_CMA if CONFIG_NUMA is not
enabled, and random config options shouldn't be default unless there
is a good reason.  Replace the default NUMA with a depends on to fix both
issues.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <roin.murphy@arm.com>
---
 kernel/dma/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 4c1e9a3c0ab6..f488997b0717 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -160,7 +160,7 @@ if  DMA_CMA
 
 config DMA_NUMA_CMA
 	bool "Enable separate DMA Contiguous Memory Area for NUMA Node"
-	default NUMA
+	depends on NUMA
 	help
 	  Enable this option to get numa CMA areas so that NUMA devices
 	  can get local memory by DMA coherent APIs.
-- 
cgit v1.2.3


From 7e9be1124dbe7888907e82cab20164578e3f9ab7 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 29 Aug 2023 19:51:57 +0200
Subject: netfilter: nf_tables: Audit log setelem reset

Since set element reset is not integrated into nf_tables' transaction
logic, an explicit log call is needed, similar to NFT_MSG_GETOBJ_RESET
handling.

For the sake of simplicity, catchall element reset will always generate
a dedicated log entry. This relieves nf_tables_dump_set() from having to
adjust the logged element count depending on whether a catchall element
was found or not.

Fixes: 079cd633219d7 ("netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/audit.h         |  1 +
 kernel/auditsc.c              |  1 +
 net/netfilter/nf_tables_api.c | 31 ++++++++++++++++++++++++++++---
 3 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6a3a9e122bb5..192bf03aacc5 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -117,6 +117,7 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_OBJ_RESET,
 	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+	AUDIT_NFT_OP_SETELEM_RESET,
 	AUDIT_NFT_OP_INVALID,
 };
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index addeed3df15d..38481e318197 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -143,6 +143,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
 	{ AUDIT_NFT_OP_OBJ_RESET,		"nft_reset_obj"		   },
 	{ AUDIT_NFT_OP_FLOWTABLE_REGISTER,	"nft_register_flowtable"   },
 	{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,	"nft_unregister_flowtable" },
+	{ AUDIT_NFT_OP_SETELEM_RESET,		"nft_reset_setelem"        },
 	{ AUDIT_NFT_OP_INVALID,			"nft_invalid"		   },
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 41b826dff6f5..361e98e71692 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -102,6 +102,7 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
 	[NFT_MSG_NEWFLOWTABLE]	= AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	[NFT_MSG_GETFLOWTABLE]	= AUDIT_NFT_OP_INVALID,
 	[NFT_MSG_DELFLOWTABLE]	= AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+	[NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
 };
 
 static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
@@ -5624,13 +5625,25 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
 	return nf_tables_fill_setelem(args->skb, set, elem, args->reset);
 }
 
+static void audit_log_nft_set_reset(const struct nft_table *table,
+				    unsigned int base_seq,
+				    unsigned int nentries)
+{
+	char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+	audit_log_nfcfg(buf, table->family, nentries,
+			AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
+	kfree(buf);
+}
+
 struct nft_set_dump_ctx {
 	const struct nft_set	*set;
 	struct nft_ctx		ctx;
 };
 
 static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
-				 const struct nft_set *set, bool reset)
+				 const struct nft_set *set, bool reset,
+				 unsigned int base_seq)
 {
 	struct nft_set_elem_catchall *catchall;
 	u8 genmask = nft_genmask_cur(net);
@@ -5646,6 +5659,8 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
 
 		elem.priv = catchall->elem;
 		ret = nf_tables_fill_setelem(skb, set, &elem, reset);
+		if (reset && !ret)
+			audit_log_nft_set_reset(set->table, base_seq, 1);
 		break;
 	}
 
@@ -5725,12 +5740,17 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	set->ops->walk(&dump_ctx->ctx, set, &args.iter);
 
 	if (!args.iter.err && args.iter.count == cb->args[0])
-		args.iter.err = nft_set_catchall_dump(net, skb, set, reset);
+		args.iter.err = nft_set_catchall_dump(net, skb, set,
+						      reset, cb->seq);
 	rcu_read_unlock();
 
 	nla_nest_end(skb, nest);
 	nlmsg_end(skb, nlh);
 
+	if (reset && args.iter.count > args.iter.skip)
+		audit_log_nft_set_reset(table, cb->seq,
+					args.iter.count - args.iter.skip);
+
 	if (args.iter.err && args.iter.err != -EMSGSIZE)
 		return args.iter.err;
 	if (args.iter.count == cb->args[0])
@@ -5955,13 +5975,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
 	u8 family = info->nfmsg->nfgen_family;
+	int rem, err = 0, nelems = 0;
 	struct net *net = info->net;
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nlattr *attr;
 	struct nft_ctx ctx;
 	bool reset = false;
-	int rem, err = 0;
 
 	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
 				 genmask, 0);
@@ -6004,8 +6024,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 			NL_SET_BAD_ATTR(extack, attr);
 			break;
 		}
+		nelems++;
 	}
 
+	if (reset)
+		audit_log_nft_set_reset(table, nft_pernet(net)->base_seq,
+					nelems);
+
 	return err;
 }
 
-- 
cgit v1.2.3


From ea078ae9108e25fc881c84369f7c03931d22e555 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 29 Aug 2023 19:51:58 +0200
Subject: netfilter: nf_tables: Audit log rule reset

Resetting rules' stateful data happens outside of the transaction logic,
so 'get' and 'dump' handlers have to emit audit log entries themselves.

Fixes: 8daa8fde3fc3f ("netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/audit.h         |  1 +
 kernel/auditsc.c              |  1 +
 net/netfilter/nf_tables_api.c | 18 ++++++++++++++++++
 3 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 192bf03aacc5..51b1b7054a23 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -118,6 +118,7 @@ enum audit_nfcfgop {
 	AUDIT_NFT_OP_FLOWTABLE_REGISTER,
 	AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
 	AUDIT_NFT_OP_SETELEM_RESET,
+	AUDIT_NFT_OP_RULE_RESET,
 	AUDIT_NFT_OP_INVALID,
 };
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 38481e318197..fc0c7c03eeab 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -144,6 +144,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
 	{ AUDIT_NFT_OP_FLOWTABLE_REGISTER,	"nft_register_flowtable"   },
 	{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,	"nft_unregister_flowtable" },
 	{ AUDIT_NFT_OP_SETELEM_RESET,		"nft_reset_setelem"        },
+	{ AUDIT_NFT_OP_RULE_RESET,		"nft_reset_rule"           },
 	{ AUDIT_NFT_OP_INVALID,			"nft_invalid"		   },
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 361e98e71692..2c81cee858d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3422,6 +3422,18 @@ err:
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
+static void audit_log_rule_reset(const struct nft_table *table,
+				 unsigned int base_seq,
+				 unsigned int nentries)
+{
+	char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
+			      table->name, base_seq);
+
+	audit_log_nfcfg(buf, table->family, nentries,
+			AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+	kfree(buf);
+}
+
 struct nft_rule_dump_ctx {
 	char *table;
 	char *chain;
@@ -3528,6 +3540,9 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 done:
 	rcu_read_unlock();
 
+	if (reset && idx > cb->args[0])
+		audit_log_rule_reset(table, cb->seq, idx - cb->args[0]);
+
 	cb->args[0] = idx;
 	return skb->len;
 }
@@ -3635,6 +3650,9 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 	if (err < 0)
 		goto err_fill_rule_info;
 
+	if (reset)
+		audit_log_rule_reset(table, nft_pernet(net)->base_seq, 1);
+
 	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
 
 err_fill_rule_info:
-- 
cgit v1.2.3


From 765aa6b3a462ed69ddcb1c41a6f7f93b8abb7d43 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 30 Aug 2023 11:22:59 +0200
Subject: dma-pool: remove a __maybe_unused label in atomic_pool_expand

Move the #endif a line so that free_page label is only seen by the
compile pass when actually used.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chunhui He <hchunhui@mail.ustc.edu.cn>
Reviewed-by: Robin Murphy <roin.murphy@arm.com>
---
 kernel/dma/pool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 1acec2e22827..b481c48a31a6 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -135,9 +135,9 @@ encrypt_mapping:
 remove_mapping:
 #ifdef CONFIG_DMA_DIRECT_REMAP
 	dma_common_free_remap(addr, pool_size);
-#endif
-free_page: __maybe_unused
+free_page:
 	__free_pages(page, order);
+#endif
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From f8858d96061f5942216c6abb0194c3ea7b78e1e8 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Sat, 2 Sep 2023 13:42:04 +0530
Subject: sched/fair: Optimize should_we_balance() for large SMT systems
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

should_we_balance() is called in load_balance() to find out if the CPU that
is trying to do the load balance is the right one or not.

With commit:

  b1bfeab9b002("sched/fair: Consider the idle state of the whole core for load balance")

the code tries to find an idle core to do the load balancing
and falls back on an idle sibling CPU if there is no idle core.

However, on larger SMT systems, it could be needlessly iterating to find a
idle by scanning all the CPUs in an non-idle core. If the core is not idle,
and first SMT sibling which is idle has been found, then its not needed to
check other SMT siblings for idleness

Lets say in SMT4, Core0 has 0,2,4,6 and CPU0 is BUSY and rest are IDLE.
balancing domain is MC/DIE. CPU2 will be set as the first idle_smt and
same process would be repeated for CPU4 and CPU6 but this is unnecessary.
Since calling is_core_idle loops through all CPU's in the SMT mask, effect
is multiplied by weight of smt_mask. For example,when say 1 CPU is busy,
we would skip loop for 2 CPU's and skip iterating over 8CPU's. That
effect would be more in DIE/NUMA domain where there are more cores.

Testing and performance evaluation
==================================

The test has been done on this system which has 12 cores, i.e 24 small
cores with SMT=4:

  lscpu
  Architecture:            ppc64le
    Byte Order:            Little Endian
  CPU(s):                  96
    On-line CPU(s) list:   0-95
  Model name:              POWER10 (architected), altivec supported
    Thread(s) per core:    8

Used funclatency bcc tool to evaluate the time taken by should_we_balance(). For
base tip/sched/core the time taken is collected by making the
should_we_balance() noinline. time is in nanoseconds. The values are
collected by running the funclatency tracer for 60 seconds. values are
average of 3 such runs. This represents the expected reduced time with
patch.

tip/sched/core was at commit:

  2f88c8e802c8 ("sched/eevdf/doc: Modify the documented knob to base_slice_ns as well")

Results:

	------------------------------------------------------------------------------
	workload			   tip/sched/core	with_patch(%gain)
	------------------------------------------------------------------------------
	idle system				 809.3		 695.0(16.45)
	stress ng – 12 threads -l 100		1013.5		 893.1(13.49)
	stress ng – 24 threads -l 100		1073.5		 980.0(9.54)
	stress ng – 48 threads -l 100		 683.0		 641.0(6.55)
	stress ng – 96 threads -l 100		2421.0		2300(5.26)
	stress ng – 96 threads -l 15		 375.5		 377.5(-0.53)
	stress ng – 96 threads -l 25		 635.5		 637.5(-0.31)
	stress ng – 96 threads -l 35		 934.0		 891.0(4.83)

Ran schbench(old), hackbench and stress_ng  to evaluate the workload
performance between tip/sched/core and with patch.
No modification to tip/sched/core

TL;DR:

Good improvement is seen with schbench. when hackbench and stress_ng
runs for longer good improvement is seen.

	------------------------------------------------------------------------------
	schbench(old)		            tip		+patch(%gain)
	10 iterations			sched/core
	------------------------------------------------------------------------------
	1 Threads
	50.0th:		      		    8.00       9.00(-12.50)
	75.0th:   			    9.60       9.00(6.25)
	90.0th:   			   11.80      10.20(13.56)
	95.0th:   			   12.60      10.40(17.46)
	99.0th:   			   13.60      11.90(12.50)
	99.5th:   			   14.10      12.60(10.64)
	99.9th:   			   15.90      14.60(8.18)
	2 Threads
	50.0th:   			    9.90       9.20(7.07)
	75.0th:   			   12.60      10.10(19.84)
	90.0th:   			   15.50      12.00(22.58)
	95.0th:   			   17.70      14.00(20.90)
	99.0th:   			   21.20      16.90(20.28)
	99.5th:   			   22.60      17.50(22.57)
	99.9th:   			   30.40      19.40(36.18)
	4 Threads
	50.0th:   			   12.50      10.60(15.20)
	75.0th:   			   15.30      12.00(21.57)
	90.0th:   			   18.60      14.10(24.19)
	95.0th:   			   21.30      16.20(23.94)
	99.0th:   			   26.00      20.70(20.38)
	99.5th:   			   27.60      22.50(18.48)
	99.9th:   			   33.90      31.40(7.37)
	8 Threads
	50.0th:   			   16.30      14.30(12.27)
	75.0th:   			   20.20      17.40(13.86)
	90.0th:   			   24.50      21.90(10.61)
	95.0th:   			   27.30      24.70(9.52)
	99.0th:   			   35.00      31.20(10.86)
	99.5th:   			   46.40      33.30(28.23)
	99.9th:   			   89.30      57.50(35.61)
	16 Threads
	50.0th:   			   22.70      20.70(8.81)
	75.0th:   			   30.10      27.40(8.97)
	90.0th:   			   36.00      32.80(8.89)
	95.0th:   			   39.60      36.40(8.08)
	99.0th:   			   49.20      44.10(10.37)
	99.5th:   			   64.90      50.50(22.19)
	99.9th:   			  143.50     100.60(29.90)
	32 Threads
	50.0th:   			   34.60      35.50(-2.60)
	75.0th:   			   48.20      50.50(-4.77)
	90.0th:   			   59.20      62.40(-5.41)
	95.0th:   			   65.20      69.00(-5.83)
	99.0th:   			   80.40      83.80(-4.23)
	99.5th:   			  102.10      98.90(3.13)
	99.9th:   			  727.10     506.80(30.30)

schbench does improve in general. There is some run to run variation with
schbench. Did a validation run to confirm that trend is similar.

	------------------------------------------------------------------------------
	hackbench				tip	   +patch(%gain)
	20 iterations, 50000 loops	     sched/core
	------------------------------------------------------------------------------
	Process 10 groups                :      11.74      11.70(0.34)
	Process 20 groups                :      22.73      22.69(0.18)
	Process 30 groups                :      33.39      33.40(-0.03)
	Process 40 groups                :      43.73      43.61(0.27)
	Process 50 groups                :      53.82      54.35(-0.98)
	Process 60 groups                :      64.16      65.29(-1.76)
	thread 10 Time                   :      12.81      12.79(0.16)
	thread 20 Time                   :      24.63      24.47(0.65)
	Process(Pipe) 10 Time            :       6.40       6.34(0.94)
	Process(Pipe) 20 Time            :      10.62      10.63(-0.09)
	Process(Pipe) 30 Time            :      15.09      14.84(1.66)
	Process(Pipe) 40 Time            :      19.42      19.01(2.11)
	Process(Pipe) 50 Time            :      24.04      23.34(2.91)
	Process(Pipe) 60 Time            :      28.94      27.51(4.94)
	thread(Pipe) 10 Time             :       6.96       6.87(1.29)
	thread(Pipe) 20 Time             :      11.74      11.73(0.09)

hackbench shows slight improvement with pipe. Slight degradation in process.

	------------------------------------------------------------------------------
	stress_ng				tip        +patch(%gain)
	10 iterations 100000 cpu_ops	     sched/core
	------------------------------------------------------------------------------

	--cpu=96 -util=100 Time taken  	 :       5.30,       5.01(5.47)
	--cpu=48 -util=100 Time taken    :       7.94,       6.73(15.24)
	--cpu=24 -util=100 Time taken    :      11.67,       8.75(25.02)
	--cpu=12 -util=100 Time taken    :      15.71,      15.02(4.39)
	--cpu=96 -util=10 Time taken     :      22.71,      22.19(2.29)
	--cpu=96 -util=20 Time taken     :      12.14,      12.37(-1.89)
	--cpu=96 -util=30 Time taken     :       8.76,       8.86(-1.14)
	--cpu=96 -util=40 Time taken     :       7.13,       7.14(-0.14)
	--cpu=96 -util=50 Time taken     :       6.10,       6.13(-0.49)
	--cpu=96 -util=60 Time taken     :       5.42,       5.41(0.18)
	--cpu=96 -util=70 Time taken     :       4.94,       4.94(0.00)
	--cpu=96 -util=80 Time taken     :       4.56,       4.53(0.66)
	--cpu=96 -util=90 Time taken     :       4.27,       4.26(0.23)

Good improvement seen with 24 CPUs. In this case only one CPU is busy,
and no core is idle. Decent improvement with 100% utilization case. no
difference in other utilization.

Fixes: b1bfeab9b002 ("sched/fair: Consider the idle state of the whole core for load balance")
Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230902081204.232218-1-sshegde@linux.vnet.ibm.com
---
 kernel/sched/fair.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..33a2b6bba676 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6619,6 +6619,7 @@ dequeue_throttle:
 /* Working cpumask for: load_balance, load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
 
 #ifdef CONFIG_NO_HZ_COMMON
 
@@ -10917,6 +10918,7 @@ static int active_load_balance_cpu_stop(void *data);
 
 static int should_we_balance(struct lb_env *env)
 {
+	struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
 	struct sched_group *sg = env->sd->groups;
 	int cpu, idle_smt = -1;
 
@@ -10940,8 +10942,9 @@ static int should_we_balance(struct lb_env *env)
 		return 1;
 	}
 
+	cpumask_copy(swb_cpus, group_balance_mask(sg));
 	/* Try to find first idle CPU */
-	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+	for_each_cpu_and(cpu, swb_cpus, env->cpus) {
 		if (!idle_cpu(cpu))
 			continue;
 
@@ -10953,6 +10956,14 @@ static int should_we_balance(struct lb_env *env)
 		if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
 			if (idle_smt == -1)
 				idle_smt = cpu;
+			/*
+			 * If the core is not idle, and first SMT sibling which is
+			 * idle has been found, then its not needed to check other
+			 * SMT siblings for idleness:
+			 */
+#ifdef CONFIG_SCHED_SMT
+			cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
 			continue;
 		}
 
@@ -12918,6 +12929,8 @@ __init void init_sched_fair_class(void)
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
 		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+		zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+					GFP_KERNEL, cpu_to_node(i));
 
 #ifdef CONFIG_CFS_BANDWIDTH
 		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
-- 
cgit v1.2.3


From feec5e1f74f5b735c0c5c02ec70673db1334173f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 31 Aug 2023 12:13:39 -0700
Subject: kbuild: Show marked Kconfig fragments in "help"

Currently the Kconfig fragments in kernel/configs and arch/*/configs
that aren't used internally aren't discoverable through "make help",
which consists of hard-coded lists of config fragments. Instead, list
all the fragment targets that have a "# Help: " comment prefix so the
targets can be generated dynamically.

Add logic to the Makefile to search for and display the fragment and
comment. Add comments to fragments that are intended to be direct targets.

Signed-off-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Reviewed-by: Nicolas Schier <nicolas@fjasle.eu>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile                                   |  1 -
 arch/arm/configs/dram_0x00000000.config    |  1 +
 arch/arm/configs/dram_0xc0000000.config    |  1 +
 arch/arm/configs/dram_0xd0000000.config    |  1 +
 arch/arm/configs/lpae.config               |  1 +
 arch/arm64/configs/virt.config             |  1 +
 arch/powerpc/configs/disable-werror.config |  1 +
 arch/powerpc/configs/security.config       |  4 +++-
 arch/riscv/configs/32-bit.config           |  1 +
 arch/riscv/configs/64-bit.config           |  1 +
 arch/s390/configs/btf.config               |  1 +
 arch/s390/configs/kasan.config             |  1 +
 arch/x86/Makefile                          |  4 ----
 kernel/configs/debug.config                |  2 ++
 kernel/configs/kvm_guest.config            |  1 +
 kernel/configs/nopm.config                 |  2 ++
 kernel/configs/rust.config                 |  1 +
 kernel/configs/x86_debug.config            |  1 +
 kernel/configs/xen.config                  |  2 ++
 scripts/kconfig/Makefile                   | 15 ++++++++++++---
 20 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index e21bf66af6fd..23cd62a5ff05 100644
--- a/Makefile
+++ b/Makefile
@@ -1552,7 +1552,6 @@ help:
 	@echo  '  mrproper	  - Remove all generated files + config + various backup files'
 	@echo  '  distclean	  - mrproper + remove editor backup and patch files'
 	@echo  ''
-	@echo  'Configuration targets:'
 	@$(MAKE) -f $(srctree)/scripts/kconfig/Makefile help
 	@echo  ''
 	@echo  'Other generic targets:'
diff --git a/arch/arm/configs/dram_0x00000000.config b/arch/arm/configs/dram_0x00000000.config
index db96dcb420ce..8803a0f58343 100644
--- a/arch/arm/configs/dram_0x00000000.config
+++ b/arch/arm/configs/dram_0x00000000.config
@@ -1 +1,2 @@
+# Help: DRAM base at 0x00000000
 CONFIG_DRAM_BASE=0x00000000
diff --git a/arch/arm/configs/dram_0xc0000000.config b/arch/arm/configs/dram_0xc0000000.config
index 343d5333d973..aab8f864686b 100644
--- a/arch/arm/configs/dram_0xc0000000.config
+++ b/arch/arm/configs/dram_0xc0000000.config
@@ -1 +1,2 @@
+# Help: DRAM base at 0xc0000000
 CONFIG_DRAM_BASE=0xc0000000
diff --git a/arch/arm/configs/dram_0xd0000000.config b/arch/arm/configs/dram_0xd0000000.config
index 61ba7045f8a1..4aabce4ea3d4 100644
--- a/arch/arm/configs/dram_0xd0000000.config
+++ b/arch/arm/configs/dram_0xd0000000.config
@@ -1 +1,2 @@
+# Help: DRAM base at 0xd0000000
 CONFIG_DRAM_BASE=0xd0000000
diff --git a/arch/arm/configs/lpae.config b/arch/arm/configs/lpae.config
index a6d6f7ab3c01..1ab94da8345d 100644
--- a/arch/arm/configs/lpae.config
+++ b/arch/arm/configs/lpae.config
@@ -1,2 +1,3 @@
+# Help: Enable Large Physical Address Extension mode
 CONFIG_ARM_LPAE=y
 CONFIG_VMSPLIT_2G=y
diff --git a/arch/arm64/configs/virt.config b/arch/arm64/configs/virt.config
index 6865d54e68f8..c47c36f8f67b 100644
--- a/arch/arm64/configs/virt.config
+++ b/arch/arm64/configs/virt.config
@@ -1,3 +1,4 @@
+# Help: Virtualization guest
 #
 # Base options for platforms
 #
diff --git a/arch/powerpc/configs/disable-werror.config b/arch/powerpc/configs/disable-werror.config
index 6ea12a12432c..7776b91da37f 100644
--- a/arch/powerpc/configs/disable-werror.config
+++ b/arch/powerpc/configs/disable-werror.config
@@ -1 +1,2 @@
+# Help: Disable -Werror
 CONFIG_PPC_DISABLE_WERROR=y
diff --git a/arch/powerpc/configs/security.config b/arch/powerpc/configs/security.config
index 1c91a35c6a73..0d54e29e2cdf 100644
--- a/arch/powerpc/configs/security.config
+++ b/arch/powerpc/configs/security.config
@@ -1,3 +1,5 @@
+# Help: Common security options for PowerPC builds
+
 # This is the equivalent of booting with lockdown=integrity
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
@@ -12,4 +14,4 @@ CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
 
 # UBSAN bounds checking is very cheap and good for hardening
 CONFIG_UBSAN=y
-# CONFIG_UBSAN_MISC is not set
\ No newline at end of file
+# CONFIG_UBSAN_MISC is not set
diff --git a/arch/riscv/configs/32-bit.config b/arch/riscv/configs/32-bit.config
index f6af0f708df4..16ee163847b4 100644
--- a/arch/riscv/configs/32-bit.config
+++ b/arch/riscv/configs/32-bit.config
@@ -1,3 +1,4 @@
+# Help: Build a 32-bit image
 CONFIG_ARCH_RV32I=y
 CONFIG_32BIT=y
 # CONFIG_PORTABLE is not set
diff --git a/arch/riscv/configs/64-bit.config b/arch/riscv/configs/64-bit.config
index 313edc554d84..d872a2d533f2 100644
--- a/arch/riscv/configs/64-bit.config
+++ b/arch/riscv/configs/64-bit.config
@@ -1,2 +1,3 @@
+# Help: Build a 64-bit image
 CONFIG_ARCH_RV64I=y
 CONFIG_64BIT=y
diff --git a/arch/s390/configs/btf.config b/arch/s390/configs/btf.config
index 39227b4511af..eb7f84f5925c 100644
--- a/arch/s390/configs/btf.config
+++ b/arch/s390/configs/btf.config
@@ -1 +1,2 @@
+# Help: Enable BTF debug info
 CONFIG_DEBUG_INFO_BTF=y
diff --git a/arch/s390/configs/kasan.config b/arch/s390/configs/kasan.config
index 700a8b25c3ff..84c2b551e992 100644
--- a/arch/s390/configs/kasan.config
+++ b/arch/s390/configs/kasan.config
@@ -1,3 +1,4 @@
+# Help: Enable KASan for debugging
 CONFIG_KASAN=y
 CONFIG_KASAN_INLINE=y
 CONFIG_KASAN_VMALLOC=y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index fdc2e3abd615..c4b2a8a19fc8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -335,9 +335,5 @@ define archhelp
   echo  '			  bzdisk/fdimage*/hdimage/isoimage also accept:'
   echo  '			  FDARGS="..."  arguments for the booted kernel'
   echo  '			  FDINITRD=file initrd for the booted kernel'
-  echo  ''
-  echo  '  kvm_guest.config	- Enable Kconfig items for running this kernel as a KVM guest'
-  echo  '  xen.config		- Enable Kconfig items for running this kernel as a Xen guest'
-  echo  '  x86_debug.config	- Enable tip tree debugging options for testing'
 
 endef
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e8db8d938661..4722b998a324 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -1,3 +1,5 @@
+# Help: Debugging for CI systems and finding regressions
+#
 # The config is based on running daily CI for enterprise Linux distros to
 # seek regressions on linux-next builds on different bare-metal and virtual
 # platforms. It can be used for example,
diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 208481d91090..d0877063d925 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -1,3 +1,4 @@
+# Help: Bootable as a KVM guest
 CONFIG_NET=y
 CONFIG_NET_CORE=y
 CONFIG_NETDEVICES=y
diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config
index 81ff07863576..ebfdc3d8aa9a 100644
--- a/kernel/configs/nopm.config
+++ b/kernel/configs/nopm.config
@@ -1,3 +1,5 @@
+# Help: Disable Power Management
+
 CONFIG_PM=n
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
index 38a7c5362c9c..2c6e001a7284 100644
--- a/kernel/configs/rust.config
+++ b/kernel/configs/rust.config
@@ -1 +1,2 @@
+# Help: Enable Rust
 CONFIG_RUST=y
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index 6fac5b405334..35f48671b8d5 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -1,3 +1,4 @@
+# Help: Debugging options for tip tree testing
 CONFIG_X86_DEBUG_FPU=y
 CONFIG_LOCK_STAT=y
 CONFIG_DEBUG_VM=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index 436f806aa1ed..6878b9a49be8 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -1,3 +1,5 @@
+# Help: Bootable as a Xen guest
+#
 # global stuff - these enable us to allow some
 # of the not so generic stuff below for xen
 CONFIG_PARAVIRT=y
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
index af1c96198f49..4eee155121a8 100644
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -93,11 +93,13 @@ endif
 %_defconfig: $(obj)/conf
 	$(Q)$< $(silent) --defconfig=arch/$(SRCARCH)/configs/$@ $(Kconfig)
 
-configfiles=$(wildcard $(srctree)/kernel/configs/$@ $(srctree)/arch/$(SRCARCH)/configs/$@)
+configfiles = $(wildcard $(srctree)/kernel/configs/$(1) $(srctree)/arch/$(SRCARCH)/configs/$(1))
+all-config-fragments = $(call configfiles,*.config)
+config-fragments = $(call configfiles,$@)
 
 %.config: $(obj)/conf
-	$(if $(call configfiles),, $(error No configuration exists for this target on this architecture))
-	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m .config $(configfiles)
+	$(if $(config-fragments),, $(error $@ fragment does not exists on this architecture))
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m .config $(config-fragments)
 	$(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig
 
 PHONY += tinyconfig
@@ -115,6 +117,7 @@ clean-files += tests/.cache
 
 # Help text used by make help
 help:
+	@echo  'Configuration targets:'
 	@echo  '  config	  - Update current config utilising a line-oriented program'
 	@echo  '  nconfig         - Update current config utilising a ncurses menu based program'
 	@echo  '  menuconfig	  - Update current config utilising a menu based program'
@@ -141,6 +144,12 @@ help:
 	@echo  '                    default value without prompting'
 	@echo  '  tinyconfig	  - Configure the tiniest possible kernel'
 	@echo  '  testconfig	  - Run Kconfig unit tests (requires python3 and pytest)'
+	@echo  ''
+	@echo  'Configuration topic targets:'
+	@$(foreach f, $(all-config-fragments), \
+		if help=$$(grep -m1 '^# Help: ' $(f)); then \
+			printf '  %-25s - %s\n' '$(notdir $(f))' "$${help#*: }"; \
+		fi;)
 
 # ===========================================================================
 # object files used by all kconfig flavours
-- 
cgit v1.2.3


From 7645629f7dc88cd777f98970134bf1a54c8d77e3 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 30 Aug 2023 10:04:04 +0200
Subject: bpf: Invoke __bpf_prog_exit_sleepable_recur() on recursion in
 kern_sys_bpf().

If __bpf_prog_enter_sleepable_recur() detects recursion then it returns
0 without undoing rcu_read_lock_trace(), migrate_disable() or
decrementing the recursion counter. This is fine in the JIT case because
the JIT code will jump in the 0 case to the end and invoke the matching
exit trampoline (__bpf_prog_exit_sleepable_recur()).

This is not the case in kern_sys_bpf() which returns directly to the
caller with an error code.

Add __bpf_prog_exit_sleepable_recur() as clean up in the recursion case.

Fixes: b1d18a7574d0d ("bpf: Extend sys_bpf commands for bpf_syscall programs.")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230830080405.251926-2-bigeasy@linutronix.de
---
 kernel/bpf/syscall.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ebeb0695305a..53a0b62464e9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5505,6 +5505,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 		run_ctx.saved_run_ctx = NULL;
 		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
 			/* recursion detected */
+			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
 			bpf_prog_put(prog);
 			return -EBUSY;
 		}
-- 
cgit v1.2.3


From 6764e767f4af1e35f87f3497e1182d945de37f93 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 30 Aug 2023 10:04:05 +0200
Subject: bpf: Assign bpf_tramp_run_ctx::saved_run_ctx before recursion check.

__bpf_prog_enter_recur() assigns bpf_tramp_run_ctx::saved_run_ctx before
performing the recursion check which means in case of a recursion
__bpf_prog_exit_recur() uses the previously set bpf_tramp_run_ctx::saved_run_ctx
value.

__bpf_prog_enter_sleepable_recur() assigns bpf_tramp_run_ctx::saved_run_ctx
after the recursion check which means in case of a recursion
__bpf_prog_exit_sleepable_recur() uses an uninitialized value. This does not
look right. If I read the entry trampoline code right, then bpf_tramp_run_ctx
isn't initialized upfront.

Align __bpf_prog_enter_sleepable_recur() with __bpf_prog_enter_recur() and
set bpf_tramp_run_ctx::saved_run_ctx before the recursion check is made.
Remove the assignment of saved_run_ctx in kern_sys_bpf() since it happens
a few cycles later.

Fixes: e384c7b7b46d0 ("bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230830080405.251926-3-bigeasy@linutronix.de
---
 kernel/bpf/syscall.c    | 1 -
 kernel/bpf/trampoline.c | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 53a0b62464e9..eb01c31ed591 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5502,7 +5502,6 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 		}
 
 		run_ctx.bpf_cookie = 0;
-		run_ctx.saved_run_ctx = NULL;
 		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
 			/* recursion detected */
 			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 78acf28d4873..53ff50cac61e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -926,13 +926,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 	migrate_disable();
 	might_fault();
 
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
 		return 0;
 	}
-
-	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
-
 	return bpf_prog_start_time();
 }
 
-- 
cgit v1.2.3


From a96a44aba556c42b432929d37d60158aca21ad4c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 1 Sep 2023 16:11:27 -0700
Subject: bpf: bpf_sk_storage: Fix invalid wait context lockdep report

'./test_progs -t test_local_storage' reported a splat:

[   27.137569] =============================
[   27.138122] [ BUG: Invalid wait context ]
[   27.138650] 6.5.0-03980-gd11ae1b16b0a #247 Tainted: G           O
[   27.139542] -----------------------------
[   27.140106] test_progs/1729 is trying to lock:
[   27.140713] ffff8883ef047b88 (stock_lock){-.-.}-{3:3}, at: local_lock_acquire+0x9/0x130
[   27.141834] other info that might help us debug this:
[   27.142437] context-{5:5}
[   27.142856] 2 locks held by test_progs/1729:
[   27.143352]  #0: ffffffff84bcd9c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x4/0x40
[   27.144492]  #1: ffff888107deb2c0 (&storage->lock){..-.}-{2:2}, at: bpf_local_storage_update+0x39e/0x8e0
[   27.145855] stack backtrace:
[   27.146274] CPU: 0 PID: 1729 Comm: test_progs Tainted: G           O       6.5.0-03980-gd11ae1b16b0a #247
[   27.147550] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[   27.149127] Call Trace:
[   27.149490]  <TASK>
[   27.149867]  dump_stack_lvl+0x130/0x1d0
[   27.152609]  dump_stack+0x14/0x20
[   27.153131]  __lock_acquire+0x1657/0x2220
[   27.153677]  lock_acquire+0x1b8/0x510
[   27.157908]  local_lock_acquire+0x29/0x130
[   27.159048]  obj_cgroup_charge+0xf4/0x3c0
[   27.160794]  slab_pre_alloc_hook+0x28e/0x2b0
[   27.161931]  __kmem_cache_alloc_node+0x51/0x210
[   27.163557]  __kmalloc+0xaa/0x210
[   27.164593]  bpf_map_kzalloc+0xbc/0x170
[   27.165147]  bpf_selem_alloc+0x130/0x510
[   27.166295]  bpf_local_storage_update+0x5aa/0x8e0
[   27.167042]  bpf_fd_sk_storage_update_elem+0xdb/0x1a0
[   27.169199]  bpf_map_update_value+0x415/0x4f0
[   27.169871]  map_update_elem+0x413/0x550
[   27.170330]  __sys_bpf+0x5e9/0x640
[   27.174065]  __x64_sys_bpf+0x80/0x90
[   27.174568]  do_syscall_64+0x48/0xa0
[   27.175201]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[   27.175932] RIP: 0033:0x7effb40e41ad
[   27.176357] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d8
[   27.179028] RSP: 002b:00007ffe64c21fc8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   27.180088] RAX: ffffffffffffffda RBX: 00007ffe64c22768 RCX: 00007effb40e41ad
[   27.181082] RDX: 0000000000000020 RSI: 00007ffe64c22008 RDI: 0000000000000002
[   27.182030] RBP: 00007ffe64c21ff0 R08: 0000000000000000 R09: 00007ffe64c22788
[   27.183038] R10: 0000000000000064 R11: 0000000000000202 R12: 0000000000000000
[   27.184006] R13: 00007ffe64c22788 R14: 00007effb42a1000 R15: 0000000000000000
[   27.184958]  </TASK>

It complains about acquiring a local_lock while holding a raw_spin_lock.
It means it should not allocate memory while holding a raw_spin_lock
since it is not safe for RT.

raw_spin_lock is needed because bpf_local_storage supports tracing
context. In particular for task local storage, it is easy to
get a "current" task PTR_TO_BTF_ID in tracing bpf prog.
However, task (and cgroup) local storage has already been moved to
bpf mem allocator which can be used after raw_spin_lock.

The splat is for the sk storage. For sk (and inode) storage,
it has not been moved to bpf mem allocator. Using raw_spin_lock or not,
kzalloc(GFP_ATOMIC) could theoretically be unsafe in tracing context.
However, the local storage helper requires a verifier accepted
sk pointer (PTR_TO_BTF_ID), it is hypothetical if that (mean running
a bpf prog in a kzalloc unsafe context and also able to hold a verifier
accepted sk pointer) could happen.

This patch avoids kzalloc after raw_spin_lock to silent the splat.
There is an existing kzalloc before the raw_spin_lock. At that point,
a kzalloc is very likely required because a lookup has just been done
before. Thus, this patch always does the kzalloc before acquiring
the raw_spin_lock and remove the later kzalloc usage after the
raw_spin_lock. After this change, it will have a charge and then
uncharge during the syscall bpf_map_update_elem() code path.
This patch opts for simplicity and not continue the old
optimization to save one charge and uncharge.

This issue is dated back to the very first commit of bpf_sk_storage
which had been refactored multiple times to create task, inode, and
cgroup storage. This patch uses a Fixes tag with a more recent
commit that should be easier to do backport.

Fixes: b00fa38a9c1c ("bpf: Enable non-atomic allocations in local storage")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901231129.578493-2-martin.lau@linux.dev
---
 kernel/bpf/bpf_local_storage.c | 47 +++++++++++++-----------------------------
 1 file changed, 14 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b5149cfce7d4..37ad47d52dc5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
-	struct bpf_local_storage_elem *selem = NULL;
+	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
 	struct bpf_local_storage *local_storage;
 	unsigned long flags;
 	int err;
@@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
-	if (gfp_flags == GFP_KERNEL) {
-		selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
-		if (!selem)
-			return ERR_PTR(-ENOMEM);
-	}
+	/* A lookup has just been done before and concluded a new selem is
+	 * needed. The chance of an unnecessary alloc is unlikely.
+	 */
+	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+	if (!alloc_selem)
+		return ERR_PTR(-ENOMEM);
 
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
@@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		 * simple.
 		 */
 		err = -EAGAIN;
-		goto unlock_err;
+		goto unlock;
 	}
 
 	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
 	err = check_flags(old_sdata, map_flags);
 	if (err)
-		goto unlock_err;
+		goto unlock;
 
 	if (old_sdata && (map_flags & BPF_F_LOCK)) {
 		copy_map_value_locked(&smap->map, old_sdata->data, value,
@@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		goto unlock;
 	}
 
-	if (gfp_flags != GFP_KERNEL) {
-		/* local_storage->lock is held.  Hence, we are sure
-		 * we can unlink and uncharge the old_sdata successfully
-		 * later.  Hence, instead of charging the new selem now
-		 * and then uncharge the old selem later (which may cause
-		 * a potential but unnecessary charge failure),  avoid taking
-		 * a charge at all here (the "!old_sdata" check) and the
-		 * old_sdata will not be uncharged later during
-		 * bpf_selem_unlink_storage_nolock().
-		 */
-		selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
-		if (!selem) {
-			err = -ENOMEM;
-			goto unlock_err;
-		}
-	}
-
+	alloc_selem = NULL;
 	/* First, link the new selem to the map */
 	bpf_selem_link_map(smap, selem);
 
@@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false, false);
+						true, false);
 	}
 
 unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-	return SDATA(selem);
-
-unlock_err:
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-	if (selem) {
+	if (alloc_selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
-		bpf_selem_free(selem, smap, true);
+		bpf_selem_free(alloc_selem, smap, true);
 	}
-	return ERR_PTR(err);
+	return err ? ERR_PTR(err) : SDATA(selem);
 }
 
 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
-- 
cgit v1.2.3


From 55d49f750b1cb1f177fb1b00ae02cba4613bcfb7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 1 Sep 2023 16:11:28 -0700
Subject: bpf: bpf_sk_storage: Fix the missing uncharge in sk_omem_alloc

The commit c83597fa5dc6 ("bpf: Refactor some inode/task/sk storage functions
for reuse"), refactored the bpf_{sk,task,inode}_storage_free() into
bpf_local_storage_unlink_nolock() which then later renamed to
bpf_local_storage_destroy(). The commit accidentally passed the
"bool uncharge_mem = false" argument to bpf_selem_unlink_storage_nolock()
which then stopped the uncharge from happening to the sk->sk_omem_alloc.

This missing uncharge only happens when the sk is going away (during
__sk_destruct).

This patch fixes it by always passing "uncharge_mem = true". It is a
noop to the task/inode/cgroup storage because they do not have the
map_local_storage_(un)charge enabled in the map_ops. A followup patch
will be done in bpf-next to remove the uncharge_mem argument.

A selftest is added in the next patch.

Fixes: c83597fa5dc6 ("bpf: Refactor some inode/task/sk storage functions for reuse")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901231129.578493-3-martin.lau@linux.dev
---
 kernel/bpf/bpf_local_storage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 37ad47d52dc5..146824cc9689 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -760,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false, true);
+			local_storage, selem, true, true);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-- 
cgit v1.2.3


From 20e490adea279d49d57b800475938f5b67926d98 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Thu, 31 Aug 2023 13:12:26 +0000
Subject: bpf: make bpf_prog_pack allocator portable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bpf_prog_pack allocator currently uses module_alloc() and
module_memfree() to allocate and free memory. This is not portable
because different architectures use different methods for allocating
memory for BPF programs. Like ARM64 and riscv use vmalloc()/vfree().

Use bpf_jit_alloc_exec() and bpf_jit_free_exec() for memory management
in bpf_prog_pack allocator. Other architectures can override these with
their implementation and will be able to use bpf_prog_pack directly.

On architectures that don't override bpf_jit_alloc/free_exec() this is
basically a NOP.

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Björn Töpel <bjorn@kernel.org>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20230831131229.497941-2-puranjay12@gmail.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 kernel/bpf/core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0f8f036d8bd1..4e3ce0542e31 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -870,7 +870,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
 		       GFP_KERNEL);
 	if (!pack)
 		return NULL;
-	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+	pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
 	if (!pack->ptr) {
 		kfree(pack);
 		return NULL;
@@ -894,7 +894,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 	mutex_lock(&pack_mutex);
 	if (size > BPF_PROG_PACK_SIZE) {
 		size = round_up(size, PAGE_SIZE);
-		ptr = module_alloc(size);
+		ptr = bpf_jit_alloc_exec(size);
 		if (ptr) {
 			bpf_fill_ill_insns(ptr, size);
 			set_vm_flush_reset_perms(ptr);
@@ -932,7 +932,7 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 
 	mutex_lock(&pack_mutex);
 	if (hdr->size > BPF_PROG_PACK_SIZE) {
-		module_memfree(hdr);
+		bpf_jit_free_exec(hdr);
 		goto out;
 	}
 
@@ -956,7 +956,7 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
 				       BPF_PROG_CHUNK_COUNT, 0) == 0) {
 		list_del(&pack->list);
-		module_memfree(pack->ptr);
+		bpf_jit_free_exec(pack->ptr);
 		kfree(pack);
 	}
 out:
-- 
cgit v1.2.3


From 4952801fc6adb5b50b8ec2bcc5aeef92fcce8730 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 Sep 2023 10:19:02 +0200
Subject: Revert "printk: export symbols for debug modules"

This reverts commit 3e00123a13d824d63072b1824c9da59cd78356d9.

No, we never export random symbols for out of tree modules.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230905081902.321778-1-hch@lst.de
---
 kernel/printk/printk.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 96fc38cb2e84..7e0b4dd02398 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -538,14 +538,12 @@ char *log_buf_addr_get(void)
 {
 	return log_buf;
 }
-EXPORT_SYMBOL_GPL(log_buf_addr_get);
 
 /* Return log buffer size */
 u32 log_buf_len_get(void)
 {
 	return log_buf_len;
 }
-EXPORT_SYMBOL_GPL(log_buf_len_get);
 
 /*
  * Define how much of the log buffer we could take at maximum. The value
-- 
cgit v1.2.3


From f5ca233e2e66dc1c249bf07eefa37e34a6c9346a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2023 22:47:12 -0400
Subject: tracing: Increase trace array ref count on enable and filter files

When the trace event enable and filter files are opened, increment the
trace array ref counter, otherwise they can be accessed when the trace
array is being deleted. The ref counter keeps the trace array from being
deleted while those files are opened.

Link: https://lkml.kernel.org/r/20230907024803.456187066@goodmis.org
Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()")
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Reported-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 27 +++++++++++++++++++++++++++
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c |  6 ++++--
 3 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 35783a7baf15..0827037ee3b8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4973,6 +4973,33 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/*
+ * The private pointer of the inode is the trace_event_file.
+ * Update the tr ref count associated to it.
+ */
+int tracing_open_file_tr(struct inode *inode, struct file *filp)
+{
+	struct trace_event_file *file = inode->i_private;
+	int ret;
+
+	ret = tracing_check_open_get_tr(file->tr);
+	if (ret)
+		return ret;
+
+	filp->private_data = inode->i_private;
+
+	return 0;
+}
+
+int tracing_release_file_tr(struct inode *inode, struct file *filp)
+{
+	struct trace_event_file *file = inode->i_private;
+
+	trace_array_put(file->tr);
+
+	return 0;
+}
+
 static int tracing_mark_open(struct inode *inode, struct file *filp)
 {
 	stream_open(inode, filp);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5669dd1f90d9..77debe53f07c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -610,6 +610,8 @@ void tracing_reset_all_online_cpus(void);
 void tracing_reset_all_online_cpus_unlocked(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_open_file_tr(struct inode *inode, struct file *filp);
+int tracing_release_file_tr(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
 bool tracer_tracing_is_on(struct trace_array *tr);
 void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ed367d713be0..2af92177b765 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2103,9 +2103,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = {
 };
 
 static const struct file_operations ftrace_enable_fops = {
-	.open = tracing_open_generic,
+	.open = tracing_open_file_tr,
 	.read = event_enable_read,
 	.write = event_enable_write,
+	.release = tracing_release_file_tr,
 	.llseek = default_llseek,
 };
 
@@ -2122,9 +2123,10 @@ static const struct file_operations ftrace_event_id_fops = {
 };
 
 static const struct file_operations ftrace_event_filter_fops = {
-	.open = tracing_open_generic,
+	.open = tracing_open_file_tr,
 	.read = event_filter_read,
 	.write = event_filter_write,
+	.release = tracing_release_file_tr,
 	.llseek = default_llseek,
 };
 
-- 
cgit v1.2.3


From 7d660c9b2bc95107f90a9f4c4759be85309a6550 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2023 22:47:13 -0400
Subject: tracing: Have tracing_max_latency inc the trace array ref count

The tracing_max_latency file points to the trace_array max_latency field.
For an instance, if the file is opened and the instance is deleted,
reading or writing to the file will cause a use after free.

Up the ref count of the trace_array when tracing_max_latency is opened.

Link: https://lkml.kernel.org/r/20230907024803.666889383@goodmis.org
Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1@huawei.com>
Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()")
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0827037ee3b8..c8b8b4c6feaf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1772,7 +1772,7 @@ static void trace_create_maxlat_file(struct trace_array *tr,
 	init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);
 	tr->d_max_latency = trace_create_file("tracing_max_latency",
 					      TRACE_MODE_WRITE,
-					      d_tracer, &tr->max_latency,
+					      d_tracer, tr,
 					      &tracing_max_lat_fops);
 }
 
@@ -1805,7 +1805,7 @@ void latency_fsnotify(struct trace_array *tr)
 
 #define trace_create_maxlat_file(tr, d_tracer)				\
 	trace_create_file("tracing_max_latency", TRACE_MODE_WRITE,	\
-			  d_tracer, &tr->max_latency, &tracing_max_lat_fops)
+			  d_tracer, tr, &tracing_max_lat_fops)
 
 #endif
 
@@ -6717,14 +6717,18 @@ static ssize_t
 tracing_max_lat_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
-	return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+	struct trace_array *tr = filp->private_data;
+
+	return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos);
 }
 
 static ssize_t
 tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 		      size_t cnt, loff_t *ppos)
 {
-	return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+	struct trace_array *tr = filp->private_data;
+
+	return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos);
 }
 
 #endif
@@ -7778,10 +7782,11 @@ static const struct file_operations tracing_thresh_fops = {
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 static const struct file_operations tracing_max_lat_fops = {
-	.open		= tracing_open_generic,
+	.open		= tracing_open_generic_tr,
 	.read		= tracing_max_lat_read,
 	.write		= tracing_max_lat_write,
 	.llseek		= generic_file_llseek,
+	.release	= tracing_release_generic_tr,
 };
 #endif
 
-- 
cgit v1.2.3


From 9b37febc578b2e1ad76a105aab11d00af5ec3d27 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2023 22:47:14 -0400
Subject: tracing: Have current_trace inc the trace array ref count

The current_trace updates the trace array tracer. For an instance, if the
file is opened and the instance is deleted, reading or writing to the file
will cause a use after free.

Up the ref count of the trace array when current_trace is opened.

Link: https://lkml.kernel.org/r/20230907024803.877687227@goodmis.org
Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1@huawei.com>
Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()")
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c8b8b4c6feaf..b82df33d20ff 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7791,10 +7791,11 @@ static const struct file_operations tracing_max_lat_fops = {
 #endif
 
 static const struct file_operations set_tracer_fops = {
-	.open		= tracing_open_generic,
+	.open		= tracing_open_generic_tr,
 	.read		= tracing_set_trace_read,
 	.write		= tracing_set_trace_write,
 	.llseek		= generic_file_llseek,
+	.release	= tracing_release_generic_tr,
 };
 
 static const struct file_operations tracing_pipe_fops = {
-- 
cgit v1.2.3


From 7e2cfbd2d3c86afcd5c26b5c4b1dd251f63c5838 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2023 22:47:15 -0400
Subject: tracing: Have option files inc the trace array ref count

The option files update the options for a given trace array. For an
instance, if the file is opened and the instance is deleted, reading or
writing to the file will cause a use after free.

Up the ref count of the trace_array when an option file is opened.

Link: https://lkml.kernel.org/r/20230907024804.086679464@goodmis.org
Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1@huawei.com>
Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()")
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b82df33d20ff..0608ad20cf30 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8988,12 +8988,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static int tracing_open_options(struct inode *inode, struct file *filp)
+{
+	struct trace_option_dentry *topt = inode->i_private;
+	int ret;
+
+	ret = tracing_check_open_get_tr(topt->tr);
+	if (ret)
+		return ret;
+
+	filp->private_data = inode->i_private;
+	return 0;
+}
+
+static int tracing_release_options(struct inode *inode, struct file *file)
+{
+	struct trace_option_dentry *topt = file->private_data;
+
+	trace_array_put(topt->tr);
+	return 0;
+}
 
 static const struct file_operations trace_options_fops = {
-	.open = tracing_open_generic,
+	.open = tracing_open_options,
 	.read = trace_options_read,
 	.write = trace_options_write,
 	.llseek	= generic_file_llseek,
+	.release = tracing_release_options,
 };
 
 /*
-- 
cgit v1.2.3


From e5c624f027ac74f97e97c8f36c69228ac9f1102d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2023 22:47:16 -0400
Subject: tracing: Have event inject files inc the trace array ref count

The event inject files add events for a specific trace array. For an
instance, if the file is opened and the instance is deleted, reading or
writing to the file will cause a use after free.

Up the ref count of the trace_array when a event inject file is opened.

Link: https://lkml.kernel.org/r/20230907024804.292337868@goodmis.org
Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Zheng Yejian <zhengyejian1@huawei.com>
Fixes: 6c3edaf9fd6a ("tracing: Introduce trace event injection")
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Tested-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_inject.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index abe805d471eb..8650562bdaa9 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -328,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size,
 }
 
 const struct file_operations event_inject_fops = {
-	.open = tracing_open_generic,
+	.open = tracing_open_file_tr,
 	.read = event_inject_read,
 	.write = event_inject_write,
+	.release = tracing_release_file_tr,
 };
-- 
cgit v1.2.3


From f6bd2c92488c30ef53b5bd80c52f0a7eee9d545a Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Wed, 6 Sep 2023 16:19:30 +0800
Subject: ring-buffer: Avoid softlockup in ring_buffer_resize()

When user resize all trace ring buffer through file 'buffer_size_kb',
then in ring_buffer_resize(), kernel allocates buffer pages for each
cpu in a loop.

If the kernel preemption model is PREEMPT_NONE and there are many cpus
and there are many buffer pages to be allocated, it may not give up cpu
for a long time and finally cause a softlockup.

To avoid it, call cond_resched() after each cpu buffer allocation.

Link: https://lore.kernel.org/linux-trace-kernel/20230906081930.3939106-1-zhengyejian1@huawei.com

Cc: <mhiramat@kernel.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 78502d4c7214..72ccf75defd0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2198,6 +2198,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 				err = -ENOMEM;
 				goto out_err;
 			}
+
+			cond_resched();
 		}
 
 		cpus_read_lock();
-- 
cgit v1.2.3


From f875db4f20f4ec2e4fa3b3be0e5081976e0b5dad Mon Sep 17 00:00:00 2001
From: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Date: Thu, 7 Sep 2023 16:03:56 +0800
Subject: Revert "dma-contiguous: check for memory region overlap"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 3fa6456ebe13adab3ba1817c8e515a5b88f95dce.

The Commit broke the CMA region creation through DT on arm64,
as showed below logs with "memblock=debug":
[    0.000000] memblock_phys_alloc_range: 41943040 bytes align=0x200000
from=0x0000000000000000 max_addr=0x00000000ffffffff
early_init_dt_alloc_reserved_memory_arch+0x34/0xa0
[    0.000000] memblock_reserve: [0x00000000fd600000-0x00000000ffdfffff]
memblock_alloc_range_nid+0xc0/0x19c
[    0.000000] Reserved memory: overlap with other memblock reserved region

>From call flow, region we defined in DT was always reserved before entering
into rmem_cma_setup. Also, rmem_cma_setup has one routine cma_init_reserved_mem
to ensure the region was reserved. Checking the region not reserved here seems
not correct.

early_init_fdt_scan_reserved_mem:
    fdt_scan_reserved_mem
        __reserved_mem_reserve_reg
		early_init_dt_reserve_memory
			memblock_reserve(using “reg” prop case)
        fdt_init_reserved_mem
		__reserved_mem_alloc_size
			*early_init_dt_alloc_reserved_memory_arch*
				memblock_reserve(dynamic alloc case)
        __reserved_mem_init_node
		rmem_cma_setup(region overlap check here should always fail)

Example DT can be used to reproduce issue:

    dump_mem: mem_dump_region {
            compatible = "shared-dma-pool";
            alloc-ranges = <0x0 0x00000000 0x0 0xffffffff>;
            reusable;
            size = <0 0x2800000>;
    };

Signed-off-by: Zhenhua Huang <quic_zhenhuah@quicinc.com>
---
 kernel/dma/contiguous.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 88c595e49e34..f005c66f378c 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -473,11 +473,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
 		return -EBUSY;
 	}
 
-	if (memblock_is_region_reserved(rmem->base, rmem->size)) {
-		pr_info("Reserved memory: overlap with other memblock reserved region\n");
-		return -EBUSY;
-	}
-
 	if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
 	    of_get_flat_dt_prop(node, "no-map", NULL))
 		return -EINVAL;
-- 
cgit v1.2.3


From 41a5db8d8161457b121a03fde999ff6e00090ee2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:34 -0700
Subject: bpf: Add support for non-fix-size percpu mem allocation

This is needed for later percpu mem allocation when the
allocation is done by bpf program. For such cases, a global
bpf_global_percpu_ma is added where a flexible allocation
size is needed.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152734.1995725-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  4 ++--
 kernel/bpf/core.c     |  8 +++++---
 kernel/bpf/memalloc.c | 14 ++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 024e8b28c34b..440dd1f59a1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -55,8 +55,8 @@ struct cgroup;
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
-extern struct bpf_mem_alloc bpf_global_ma;
-extern bool bpf_global_ma_set;
+extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0f8f036d8bd1..95599df82ee4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@
 #define OFF	insn->off
 #define IMM	insn->imm
 
-struct bpf_mem_alloc bpf_global_ma;
-bool bpf_global_ma_set;
+struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 /* No hurry in this branch
  *
@@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void)
 
 	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
 	bpf_global_ma_set = !ret;
-	return ret;
+	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+	bpf_global_percpu_ma_set = !ret;
+	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
 }
 late_initcall(bpf_global_ma_init);
 #endif
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9c49ae53deaf..cb60445de98a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	struct obj_cgroup *objcg = NULL;
 	int cpu, i, unit_size, percpu_size = 0;
 
+	/* room for llist_node and per-cpu pointer */
+	if (percpu)
+		percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
 		if (!pc)
 			return -ENOMEM;
 
-		if (percpu)
-			/* room for llist_node and per-cpu pointer */
-			percpu_size = LLIST_NODE_SZ + sizeof(void *);
-		else
+		if (!percpu)
 			size += LLIST_NODE_SZ; /* room for llist_node */
 		unit_size = size;
 
@@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 		return 0;
 	}
 
-	/* size == 0 && percpu is an invalid combination */
-	if (WARN_ON_ONCE(percpu))
-		return -EINVAL;
-
 	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
 	if (!pcc)
 		return -ENOMEM;
@@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
+			c->percpu_size = percpu_size;
 			c->tgt = c;
 			prefill_mem_cache(c, cpu);
 		}
-- 
cgit v1.2.3


From 55db92f42fe4a4ef7b4c2b4960c6212c8512dd53 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:39 -0700
Subject: bpf: Add BPF_KPTR_PERCPU as a field type

BPF_KPTR_PERCPU represents a percpu field type like below

  struct val_t {
    ... fields ...
  };
  struct t {
    ...
    struct val_t __percpu_kptr *percpu_data_ptr;
    ...
  };

where
  #define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr")))

While BPF_KPTR_REF points to a trusted kernel object or a trusted
local object, BPF_KPTR_PERCPU points to a trusted local
percpu object.

This patch added basic support for BPF_KPTR_PERCPU
related to percpu_kptr field parsing, recording and free operations.
BPF_KPTR_PERCPU also supports the same map types
as BPF_KPTR_REF does.

Note that unlike a local kptr, it is possible that
a BPF_KTPR_PERCPU struct may not contain any
special fields like other kptr, bpf_spin_lock, bpf_list_head, etc.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152739.1996391-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 18 ++++++++++++------
 kernel/bpf/btf.c     |  5 +++++
 kernel/bpf/syscall.c |  4 ++++
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 440dd1f59a1c..87eeb3a46a1d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -180,14 +180,15 @@ enum btf_field_type {
 	BPF_TIMER      = (1 << 1),
 	BPF_KPTR_UNREF = (1 << 2),
 	BPF_KPTR_REF   = (1 << 3),
-	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF,
-	BPF_LIST_HEAD  = (1 << 4),
-	BPF_LIST_NODE  = (1 << 5),
-	BPF_RB_ROOT    = (1 << 6),
-	BPF_RB_NODE    = (1 << 7),
+	BPF_KPTR_PERCPU = (1 << 4),
+	BPF_KPTR       = BPF_KPTR_UNREF | BPF_KPTR_REF | BPF_KPTR_PERCPU,
+	BPF_LIST_HEAD  = (1 << 5),
+	BPF_LIST_NODE  = (1 << 6),
+	BPF_RB_ROOT    = (1 << 7),
+	BPF_RB_NODE    = (1 << 8),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
-	BPF_REFCOUNT   = (1 << 8),
+	BPF_REFCOUNT   = (1 << 9),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -300,6 +301,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 		return "kptr";
+	case BPF_KPTR_PERCPU:
+		return "percpu_kptr";
 	case BPF_LIST_HEAD:
 		return "bpf_list_head";
 	case BPF_LIST_NODE:
@@ -325,6 +328,7 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return sizeof(u64);
 	case BPF_LIST_HEAD:
 		return sizeof(struct bpf_list_head);
@@ -351,6 +355,7 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_timer);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		return __alignof__(u64);
 	case BPF_LIST_HEAD:
 		return __alignof__(struct bpf_list_head);
@@ -389,6 +394,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_TIMER:
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
+	case BPF_KPTR_PERCPU:
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1095bbe29859..187b57276fec 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3293,6 +3293,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 		type = BPF_KPTR_UNREF;
 	else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
 		type = BPF_KPTR_REF;
+	else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+		type = BPF_KPTR_PERCPU;
 	else
 		return -EINVAL;
 
@@ -3457,6 +3459,7 @@ static int btf_find_struct_field(const struct btf *btf,
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_find_kptr(btf, member_type, off, sz,
 					    idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3523,6 +3526,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_find_kptr(btf, var_type, off, sz,
 					    idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3783,6 +3787,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
 			if (ret < 0)
 				goto end;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb01c31ed591..6a692f3bea15 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -514,6 +514,7 @@ void btf_record_free(struct btf_record *rec)
 		switch (rec->fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			if (rec->fields[i].kptr.module)
 				module_put(rec->fields[i].kptr.module);
 			btf_put(rec->fields[i].kptr.btf);
@@ -560,6 +561,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		switch (fields[i].type) {
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			btf_get(fields[i].kptr.btf);
 			if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
 				ret = -ENXIO;
@@ -650,6 +652,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
+		case BPF_KPTR_PERCPU:
 			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
 			if (!xchgd_field)
 				break;
@@ -1045,6 +1048,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				break;
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_PERCPU:
 			case BPF_REFCOUNT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
-- 
cgit v1.2.3


From 36d8bdf75a93190e5669b9d1d95994e13e15ba1d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:44 -0700
Subject: bpf: Add alloc/xchg/direct_access support for local percpu kptr

Add two new kfunc's, bpf_percpu_obj_new_impl() and
bpf_percpu_obj_drop_impl(), to allocate a percpu obj.
Two functions are very similar to bpf_obj_new_impl()
and bpf_obj_drop_impl(). The major difference is related
to percpu handling.

    bpf_rcu_read_lock()
    struct val_t __percpu_kptr *v = map_val->percpu_data;
    ...
    bpf_rcu_read_unlock()

For a percpu data map_val like above 'v', the reg->type
is set as
	PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU
if inside rcu critical section.

MEM_RCU marking here is similar to NON_OWN_REF as 'v'
is not a owning reference. But NON_OWN_REF is
trusted and typically inside the spinlock while
MEM_RCU is under rcu read lock. RCU is preferred here
since percpu data structures mean potential concurrent
access into its contents.

Also, bpf_percpu_obj_new_impl() is restricted such that
no pointers or special fields are allowed. Therefore,
the bpf_list_head and bpf_rb_root will not be supported
in this patch set to avoid potential memory leak issue
due to racing between bpf_obj_free_fields() and another
bpf_kptr_xchg() moving an allocated object to
bpf_list_head and bpf_rb_root.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152744.1996739-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c  |  16 ++++++++
 kernel/bpf/verifier.c | 112 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 106 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8bd3812fb8df..b0a9834f1051 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1902,6 +1902,14 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+	u64 size = local_type_id__k;
+
+	/* The verifier has ensured that meta__ign must be NULL */
+	return bpf_mem_alloc(&bpf_global_percpu_ma, size);
+}
+
 /* Must be called under migrate_disable(), as required by bpf_mem_free */
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 {
@@ -1930,6 +1938,12 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
 }
 
+__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+	/* The verifier has ensured that meta__ign must be NULL */
+	bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc);
+}
+
 __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
@@ -2442,7 +2456,9 @@ BTF_SET8_START(generic_btf_ids)
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..6c886ead18f6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta {
 	/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
 	 * generally to pass info about user-defined local kptr types to later
 	 * verification logic
-	 *   bpf_obj_drop
+	 *   bpf_obj_drop/bpf_percpu_obj_drop
 	 *     Record the local kptr type to be drop'd
 	 *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
 	 *     Record the local kptr type to be refcount_incr'd and use
@@ -5001,6 +5001,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 			perm_flags |= PTR_UNTRUSTED;
 	} else {
 		perm_flags = PTR_MAYBE_NULL | MEM_ALLOC;
+		if (kptr_field->type == BPF_KPTR_PERCPU)
+			perm_flags |= MEM_PERCPU;
 	}
 
 	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
@@ -5044,7 +5046,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 	 */
 	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
 				  kptr_field->kptr.btf, kptr_field->kptr.btf_id,
-				  kptr_field->type == BPF_KPTR_REF))
+				  kptr_field->type != BPF_KPTR_UNREF))
 		goto bad_type;
 	return 0;
 bad_type:
@@ -5088,7 +5090,18 @@ static bool rcu_safe_kptr(const struct btf_field *field)
 {
 	const struct btf_field_kptr *kptr = &field->kptr;
 
-	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+	return field->type == BPF_KPTR_PERCPU ||
+	       (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
+}
+
+static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
+{
+	if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
+		if (kptr_field->type != BPF_KPTR_PERCPU)
+			return PTR_MAYBE_NULL | MEM_RCU;
+		return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
+	}
+	return PTR_MAYBE_NULL | PTR_UNTRUSTED;
 }
 
 static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
@@ -5114,7 +5127,8 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 	/* We only allow loading referenced kptr, since it will be marked as
 	 * untrusted, similar to unreferenced kptr.
 	 */
-	if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
+	if (class != BPF_LDX &&
+	    (kptr_field->type == BPF_KPTR_REF || kptr_field->type == BPF_KPTR_PERCPU)) {
 		verbose(env, "store to referenced kptr disallowed\n");
 		return -EACCES;
 	}
@@ -5125,10 +5139,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id,
-				rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
-				PTR_MAYBE_NULL | MEM_RCU :
-				PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -5182,6 +5193,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			switch (field->type) {
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_KPTR_PERCPU:
 				if (src != ACCESS_DIRECT) {
 					verbose(env, "kptr cannot be accessed indirectly by helper\n");
 					return -EACCES;
@@ -7320,7 +7332,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
 		verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
 		return -EACCES;
 	}
-	if (kptr_field->type != BPF_KPTR_REF) {
+	if (kptr_field->type != BPF_KPTR_REF && kptr_field->type != BPF_KPTR_PERCPU) {
 		verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
 		return -EACCES;
 	}
@@ -7831,8 +7843,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
 	if (base_type(arg_type) == ARG_PTR_TO_MEM)
 		type &= ~DYNPTR_TYPE_FLAG_MASK;
 
-	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type))
+	if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
 		type &= ~MEM_ALLOC;
+		type &= ~MEM_PERCPU;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
 		expected = compatible->types[i];
@@ -7915,6 +7929,7 @@ found:
 		break;
 	}
 	case PTR_TO_BTF_ID | MEM_ALLOC:
+	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
 		if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
 		    meta->func_id != BPF_FUNC_kptr_xchg) {
 			verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
@@ -9882,8 +9897,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		if (func_id == BPF_FUNC_kptr_xchg) {
 			ret_btf = meta.kptr_field->kptr.btf;
 			ret_btf_id = meta.kptr_field->kptr.btf_id;
-			if (!btf_is_kernel(ret_btf))
+			if (!btf_is_kernel(ret_btf)) {
 				regs[BPF_REG_0].type |= MEM_ALLOC;
+				if (meta.kptr_field->type == BPF_KPTR_PERCPU)
+					regs[BPF_REG_0].type |= MEM_PERCPU;
+			}
 		} else {
 			if (fn->ret_btf_id == BPF_PTR_POISON) {
 				verbose(env, "verifier internal error:");
@@ -10268,6 +10286,8 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_slice,
 	KF_bpf_dynptr_slice_rdwr,
 	KF_bpf_dynptr_clone,
+	KF_bpf_percpu_obj_new_impl,
+	KF_bpf_percpu_obj_drop_impl,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10288,6 +10308,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10310,6 +10332,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_percpu_obj_new_impl)
+BTF_ID(func, bpf_percpu_obj_drop_impl)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11004,7 +11028,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			}
 			break;
 		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
-			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) {
+				if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) {
+					verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i);
+					return -EINVAL;
+				}
+			} else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) {
+				if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
+					verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i);
+					return -EINVAL;
+				}
+			} else {
 				verbose(env, "arg#%d expected pointer to allocated object\n", i);
 				return -EINVAL;
 			}
@@ -11012,8 +11046,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				verbose(env, "allocated object must be referenced\n");
 				return -EINVAL;
 			}
-			if (meta->btf == btf_vmlinux &&
-			    meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+			if (meta->btf == btf_vmlinux) {
 				meta->arg_btf = reg->btf;
 				meta->arg_btf_id = reg->btf_id;
 			}
@@ -11413,6 +11446,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		/* Only exception is bpf_obj_new_impl */
 		if (meta.btf != btf_vmlinux ||
 		    (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+		     meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
 		     meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
 			verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
 			return -EINVAL;
@@ -11426,11 +11460,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
 
 		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
-			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+			if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+			    meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+				struct btf_struct_meta *struct_meta;
 				struct btf *ret_btf;
 				u32 ret_btf_id;
 
-				if (unlikely(!bpf_global_ma_set))
+				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+					return -ENOMEM;
+
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
 					return -ENOMEM;
 
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
@@ -11443,24 +11482,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 				/* This may be NULL due to user not supplying a BTF */
 				if (!ret_btf) {
-					verbose(env, "bpf_obj_new requires prog BTF\n");
+					verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
 					return -EINVAL;
 				}
 
 				ret_t = btf_type_by_id(ret_btf, ret_btf_id);
 				if (!ret_t || !__btf_type_is_struct(ret_t)) {
-					verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+					verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
 					return -EINVAL;
 				}
 
+				struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+						verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+						return -EINVAL;
+					}
+
+					if (struct_meta) {
+						verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+						return -EINVAL;
+					}
+				}
+
 				mark_reg_known_zero(env, regs, BPF_REG_0);
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
 				regs[BPF_REG_0].btf = ret_btf;
 				regs[BPF_REG_0].btf_id = ret_btf_id;
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+					regs[BPF_REG_0].type |= MEM_PERCPU;
 
 				insn_aux->obj_new_size = ret_t->size;
-				insn_aux->kptr_struct_meta =
-					btf_find_struct_meta(ret_btf, ret_btf_id);
+				insn_aux->kptr_struct_meta = struct_meta;
 			} else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
 				mark_reg_known_zero(env, regs, BPF_REG_0);
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
@@ -11597,7 +11650,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			regs[BPF_REG_0].id = ++env->id_gen;
 	} else if (btf_type_is_void(t)) {
 		if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
-			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+			if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+			    meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
 				insn_aux->kptr_struct_meta =
 					btf_find_struct_meta(meta.arg_btf,
 							     meta.arg_btf_id);
@@ -18266,21 +18320,35 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn->imm = BPF_CALL_IMM(desc->addr);
 	if (insn->off)
 		return 0;
-	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+	    desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
 		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
 		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
 		u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
 
+		if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
+			verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+				insn_idx);
+			return -EFAULT;
+		}
+
 		insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
 		insn_buf[1] = addr[0];
 		insn_buf[2] = addr[1];
 		insn_buf[3] = *insn;
 		*cnt = 4;
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] ||
 		   desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
 		struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
 		struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
 
+		if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
+			verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
+				insn_idx);
+			return -EFAULT;
+		}
+
 		if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
 		    !kptr_struct_meta) {
 			verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
-- 
cgit v1.2.3


From 01cc55af93884f1ff5a883426e1924378dfcc62a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:27:49 -0700
Subject: bpf: Add bpf_this_cpu_ptr/bpf_per_cpu_ptr support for allocated
 percpu obj

The bpf helpers bpf_this_cpu_ptr() and bpf_per_cpu_ptr() are re-purposed
for allocated percpu objects. For an allocated percpu obj,
the reg type is 'PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU'.

The return type for these two re-purposed helpera is
'PTR_TO_MEM | MEM_RCU | MEM_ALLOC'.
The MEM_ALLOC allows that the per-cpu data can be read and written.

Since the memory allocator bpf_mem_alloc() returns
a ptr to a percpu ptr for percpu data, the first argument
of bpf_this_cpu_ptr() and bpf_per_cpu_ptr() is patched
with a dereference before passing to the helper func.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152749.1997202-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 59 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b6e58dab8e27..a3236651ec64 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -480,6 +480,7 @@ struct bpf_insn_aux_data {
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
 	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
+	bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c886ead18f6..6b7e7ca611f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6221,7 +6221,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		}
 
 		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
-		    !reg->ref_obj_id) {
+		    !(reg->type & MEM_RCU) && !reg->ref_obj_id) {
 			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
 			return -EFAULT;
 		}
@@ -7765,6 +7765,7 @@ static const struct bpf_reg_types btf_ptr_types = {
 static const struct bpf_reg_types percpu_btf_ptr_types = {
 	.types = {
 		PTR_TO_BTF_ID | MEM_PERCPU,
+		PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU,
 		PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
 	}
 };
@@ -7941,6 +7942,7 @@ found:
 		}
 		break;
 	case PTR_TO_BTF_ID | MEM_PERCPU:
+	case PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU:
 	case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
 		/* Handled by helper specific checks */
 		break;
@@ -9547,6 +9549,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			     int *insn_idx_p)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	bool returns_cpu_specific_alloc_ptr = false;
 	const struct bpf_func_proto *fn = NULL;
 	enum bpf_return_type ret_type;
 	enum bpf_type_flag ret_flag;
@@ -9785,6 +9788,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 		break;
 	}
+	case BPF_FUNC_per_cpu_ptr:
+	case BPF_FUNC_this_cpu_ptr:
+	{
+		struct bpf_reg_state *reg = &regs[BPF_REG_1];
+		const struct btf_type *type;
+
+		if (reg->type & MEM_RCU) {
+			type = btf_type_by_id(reg->btf, reg->btf_id);
+			if (!type || !btf_type_is_struct(type)) {
+				verbose(env, "Helper has invalid btf/btf_id in R1\n");
+				return -EFAULT;
+			}
+			returns_cpu_specific_alloc_ptr = true;
+			env->insn_aux_data[insn_idx].call_with_percpu_alloc_ptr = true;
+		}
+		break;
+	}
 	case BPF_FUNC_user_ringbuf_drain:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_user_ringbuf_callback_state);
@@ -9874,14 +9894,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
 			regs[BPF_REG_0].mem_size = tsize;
 		} else {
-			/* MEM_RDONLY may be carried from ret_flag, but it
-			 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
-			 * it will confuse the check of PTR_TO_BTF_ID in
-			 * check_mem_access().
-			 */
-			ret_flag &= ~MEM_RDONLY;
+			if (returns_cpu_specific_alloc_ptr) {
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC | MEM_RCU;
+			} else {
+				/* MEM_RDONLY may be carried from ret_flag, but it
+				 * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+				 * it will confuse the check of PTR_TO_BTF_ID in
+				 * check_mem_access().
+				 */
+				ret_flag &= ~MEM_RDONLY;
+				regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+			}
 
-			regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
 			regs[BPF_REG_0].btf = meta.ret_btf;
 			regs[BPF_REG_0].btf_id = meta.ret_btf_id;
 		}
@@ -18676,6 +18700,25 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
+		/* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
+		if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
+			/* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
+			 * bpf_mem_alloc() returns a ptr to the percpu data ptr.
+			 */
+			insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
+			insn_buf[1] = *insn;
+			cnt = 2;
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto patch_call_imm;
+		}
+
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 		 * and other inlining handlers are currently limited to 64 bit
 		 * only.
-- 
cgit v1.2.3


From 5b221ecb3a9e48013d7b4ad7960af3adba23d1d1 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Sun, 27 Aug 2023 08:28:16 -0700
Subject: bpf: Mark OBJ_RELEASE argument as MEM_RCU when possible

In previous selftests/bpf patch, we have
  p = bpf_percpu_obj_new(struct val_t);
  if (!p)
          goto out;

  p1 = bpf_kptr_xchg(&e->pc, p);
  if (p1) {
          /* race condition */
          bpf_percpu_obj_drop(p1);
  }

  p = e->pc;
  if (!p)
          goto out;

After bpf_kptr_xchg(), we need to re-read e->pc into 'p'.
This is due to that the second argument of bpf_kptr_xchg() is marked
OBJ_RELEASE and it will be marked as invalid after the call.
So after bpf_kptr_xchg(), 'p' is an unknown scalar,
and the bpf program needs to reread from the map value.

This patch checks if the 'p' has type MEM_ALLOC and MEM_PERCPU,
and if 'p' is RCU protected. If this is the case, 'p' can be marked
as MEM_RCU. MEM_ALLOC needs to be removed since 'p' is not
an owning reference any more. Such a change makes re-read
from the map value unnecessary.

Note that re-reading 'e->pc' after bpf_kptr_xchg() might get
a different value from 'p' if immediately before 'p = e->pc',
another cpu may do another bpf_kptr_xchg() and swap in another value
into 'e->pc'. If this is the case, then 'p = e->pc' may
get either 'p' or another value, and race condition already exists.
So removing direct re-reading seems fine too.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230827152816.2000760-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6b7e7ca611f3..dbba2b806017 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9660,6 +9660,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 				return -EFAULT;
 			}
 			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+		} else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) {
+			u32 ref_obj_id = meta.ref_obj_id;
+			bool in_rcu = in_rcu_cs(env);
+			struct bpf_func_state *state;
+			struct bpf_reg_state *reg;
+
+			err = release_reference_state(cur_func(env), ref_obj_id);
+			if (!err) {
+				bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+					if (reg->ref_obj_id == ref_obj_id) {
+						if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) {
+							reg->ref_obj_id = 0;
+							reg->type &= ~MEM_ALLOC;
+							reg->type |= MEM_RCU;
+						} else {
+							mark_reg_invalid(env, reg);
+						}
+					}
+				}));
+			}
 		} else if (meta.ref_obj_id) {
 			err = release_reference(env, meta.ref_obj_id);
 		} else if (register_is_null(&regs[meta.release_regno])) {
-- 
cgit v1.2.3


From 566f6de3cea3482d75d836a2398792a8be32ec26 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 1 Sep 2023 19:19:52 +0800
Subject: bpf: Enable IRQ after irq_work_raise() completes in unit_alloc()

When doing stress test for qp-trie, bpf_mem_alloc() returned NULL
unexpectedly because all qp-trie operations were initiated from
bpf syscalls and there was still available free memory. bpf_obj_new()
has the same problem as shown by the following selftest.

The failure is due to the preemption. irq_work_raise() will invoke
irq_work_claim() first to mark the irq work as pending and then inovke
__irq_work_queue_local() to raise an IPI. So when the current task
which is invoking irq_work_raise() is preempted by other task,
unit_alloc() may return NULL for preemption task as shown below:

task A         task B

unit_alloc()
  // low_watermark = 32
  // free_cnt = 31 after alloc
  irq_work_raise()
    // mark irq work as IRQ_WORK_PENDING
    irq_work_claim()

	       // task B preempts task A
	       unit_alloc()
	         // free_cnt = 30 after alloc
	         // irq work is already PENDING,
	         // so just return
	         irq_work_raise()
	       // does unit_alloc() 30-times
	       ......
	       unit_alloc()
	         // free_cnt = 0 before alloc
	         return NULL

Fix it by enabling IRQ after irq_work_raise() completes. An alternative
fix is using preempt_{disable|enable}_notrace() pair, but it may have
extra overhead. Another feasible fix is to only disable preemption or
IRQ before invoking irq_work_queue() and enable preemption or IRQ after
the invocation completes, but it can't handle the case when
c->low_watermark is 1.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230901111954.1804721-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index cb60445de98a..c5d822d7cfaa 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
 		}
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	WARN_ON(cnt < 0);
 
 	if (cnt < c->low_watermark)
 		irq_work_raise(c);
+	/* Enable IRQ after the enqueue of irq work completes, so irq work
+	 * will run after IRQ is enabled and free_llist may be refilled by
+	 * irq work before other task preempts current task.
+	 */
+	local_irq_restore(flags);
+
 	return llnode;
 }
 
-- 
cgit v1.2.3


From 62cf51cb0ebe997a9903208e546755b63eb7ff9d Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 1 Sep 2023 19:19:53 +0800
Subject: bpf: Enable IRQ after irq_work_raise() completes in unit_free{_rcu}()

Both unit_free() and unit_free_rcu() invoke irq_work_raise() to free
freed objects back to slab and the invocation may also be preempted by
unit_alloc() and unit_alloc() may return NULL unexpectedly as shown in
the following case:

task A         task B

unit_free()
  // high_watermark = 48
  // free_cnt = 49 after free
  irq_work_raise()
    // mark irq work as IRQ_WORK_PENDING
    irq_work_claim()

               // task B preempts task A
               unit_alloc()
                 // free_cnt = 48 after alloc

               // does unit_alloc() 32-times
	       ......
	       // free_cnt = 16

	       unit_alloc()
	         // free_cnt = 15 after alloc
                 // irq work is already PENDING,
                 // so just return
                 irq_work_raise()

	       // does unit_alloc() 15-times
               ......
	       // free_cnt = 0

               unit_alloc()
                 // free_cnt = 0 before alloc
                 return NULL

Fix it by enabling IRQ after irq_work_raise() completes.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230901111954.1804721-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index c5d822d7cfaa..961df89d45f1 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -778,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		llist_add(llnode, &c->free_llist_extra);
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	if (cnt > c->high_watermark)
 		/* free few objects from current cpu into global kmalloc pool */
 		irq_work_raise(c);
+	/* Enable IRQ after irq_work_raise() completes, otherwise when current
+	 * task is preempted by task which does unit_alloc(), unit_alloc() may
+	 * return NULL unexpectedly because irq work is already pending but can
+	 * not been triggered and free_llist can not be refilled timely.
+	 */
+	local_irq_restore(flags);
 }
 
 static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
@@ -800,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
 		llist_add(llnode, &c->free_llist_extra_rcu);
 	}
 	local_dec(&c->active);
-	local_irq_restore(flags);
 
 	if (!atomic_read(&c->call_rcu_in_progress))
 		irq_work_raise(c);
+	local_irq_restore(flags);
 }
 
 /* Called from BPF program or from sys_bpf syscall.
-- 
cgit v1.2.3


From 1a00ef57d9f120b711b6b1193d12ba3789d47ec2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 5 Sep 2023 17:46:46 +0200
Subject: bpf: task_group_seq_get_next: cleanup the usage of next_thread()

1. find_pid_ns() + get_pid_task() under rcu_read_lock() guarantees that we
   can safely iterate the task->thread_group list. Even if this task exits
   right after get_pid_task() (or goto retry) and pid_alive() returns 0.

   Kill the unnecessary pid_alive() check.

2. next_thread() simply can't return NULL, kill the bogus "if (!next_task)"
   check.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230905154646.GA24928@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..4d1125108014 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -75,15 +75,8 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 		return NULL;
 
 retry:
-	if (!pid_alive(task)) {
-		put_task_struct(task);
-		return NULL;
-	}
-
 	next_task = next_thread(task);
 	put_task_struct(task);
-	if (!next_task)
-		return NULL;
 
 	saved_tid = *tid;
 	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
-- 
cgit v1.2.3


From 4981921350452a7639fac9ac8f19be4d25febdca Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 5 Sep 2023 17:46:49 +0200
Subject: bpf: task_group_seq_get_next: cleanup the usage of
 get/put_task_struct

get_pid_task() makes no sense, the code does put_task_struct() soon after.
Use find_task_by_pid_ns() instead of find_pid_ns + get_pid_task and kill
put_task_struct(), this allows to do get_task_struct() only once before
return.

While at it, kill the unnecessary "if (!pid)" check in the "if (!*tid)"
block, this matches the next usage of find_pid_ns() + get_pid_task() in
this function.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230905154649.GA24935@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4d1125108014..1589ec3faded 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -42,9 +42,6 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 	if (!*tid) {
 		/* The first time, the iterator calls this function. */
 		pid = find_pid_ns(common->pid, common->ns);
-		if (!pid)
-			return NULL;
-
 		task = get_pid_task(pid, PIDTYPE_TGID);
 		if (!task)
 			return NULL;
@@ -66,17 +63,12 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 		return task;
 	}
 
-	pid = find_pid_ns(common->pid_visiting, common->ns);
-	if (!pid)
-		return NULL;
-
-	task = get_pid_task(pid, PIDTYPE_PID);
+	task = find_task_by_pid_ns(common->pid_visiting, common->ns);
 	if (!task)
 		return NULL;
 
 retry:
 	next_task = next_thread(task);
-	put_task_struct(task);
 
 	saved_tid = *tid;
 	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
@@ -88,7 +80,6 @@ retry:
 		return NULL;
 	}
 
-	get_task_struct(next_task);
 	common->pid_visiting = *tid;
 
 	if (skip_if_dup_files && task->files == task->group_leader->files) {
@@ -96,6 +87,7 @@ retry:
 		goto retry;
 	}
 
+	get_task_struct(next_task);
 	return next_task;
 }
 
-- 
cgit v1.2.3


From 87abbf7a54f6c9c51374b0701cd7ab47534516ae Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 5 Sep 2023 17:46:51 +0200
Subject: bpf: task_group_seq_get_next: fix the skip_if_dup_files check

Unless I am notally confused it is wrong. We are going to return or
skip next_task so we need to check next_task-files, not task->files.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230905154651.GA24940@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 1589ec3faded..2264870ae3fc 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -82,7 +82,7 @@ retry:
 
 	common->pid_visiting = *tid;
 
-	if (skip_if_dup_files && task->files == task->group_leader->files) {
+	if (skip_if_dup_files && next_task->files == next_task->group_leader->files) {
 		task = next_task;
 		goto retry;
 	}
-- 
cgit v1.2.3


From 0ee9808b0a211ba1e572073c6afe5897f8300b9c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 5 Sep 2023 17:46:54 +0200
Subject: bpf: task_group_seq_get_next: kill next_task

It only adds the unnecessary confusion and compicates the "retry" code.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230905154654.GA24945@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 2264870ae3fc..f51f476ec679 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -35,7 +35,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 						   u32 *tid,
 						   bool skip_if_dup_files)
 {
-	struct task_struct *task, *next_task;
+	struct task_struct *task;
 	struct pid *pid;
 	u32 saved_tid;
 
@@ -68,10 +68,10 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 		return NULL;
 
 retry:
-	next_task = next_thread(task);
+	task = next_thread(task);
 
 	saved_tid = *tid;
-	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+	*tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
 	if (!*tid || *tid == common->pid) {
 		/* Run out of tasks of a process.  The tasks of a
 		 * thread_group are linked as circular linked list.
@@ -82,13 +82,11 @@ retry:
 
 	common->pid_visiting = *tid;
 
-	if (skip_if_dup_files && next_task->files == next_task->group_leader->files) {
-		task = next_task;
+	if (skip_if_dup_files && task->files == task->group_leader->files)
 		goto retry;
-	}
 
-	get_task_struct(next_task);
-	return next_task;
+	get_task_struct(task);
+	return task;
 }
 
 static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
-- 
cgit v1.2.3


From 780aa8dfcb73f4703b1c4be11c21c8dca36502ad Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 5 Sep 2023 17:46:56 +0200
Subject: bpf: task_group_seq_get_next: simplify the "next tid" logic

Kill saved_tid. It looks ugly to update *tid and then restore the
previous value if __task_pid_nr_ns() returns 0. Change this code
to update *tid and common->pid_visiting once before return.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20230905154656.GA24950@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index f51f476ec679..7473068ed313 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -37,7 +37,7 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 {
 	struct task_struct *task;
 	struct pid *pid;
-	u32 saved_tid;
+	u32 next_tid;
 
 	if (!*tid) {
 		/* The first time, the iterator calls this function. */
@@ -70,21 +70,18 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 retry:
 	task = next_thread(task);
 
-	saved_tid = *tid;
-	*tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
-	if (!*tid || *tid == common->pid) {
+	next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
+	if (!next_tid || next_tid == common->pid) {
 		/* Run out of tasks of a process.  The tasks of a
 		 * thread_group are linked as circular linked list.
 		 */
-		*tid = saved_tid;
 		return NULL;
 	}
 
-	common->pid_visiting = *tid;
-
 	if (skip_if_dup_files && task->files == task->group_leader->files)
 		goto retry;
 
+	*tid = common->pid_visiting = next_tid;
 	get_task_struct(task);
 	return task;
 }
-- 
cgit v1.2.3


From 41bc46c12a8053a1b3279a379bd6b5e87b045b85 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 7 Sep 2023 22:06:51 +0200
Subject: bpf: Add override check to kprobe multi link attach

Currently the multi_kprobe link attach does not check error
injection list for programs with bpf_override_return helper
and allows them to attach anywhere. Adding the missing check.

Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/bpf/20230907200652.926951-1-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a7264b2c17ad..c1c1af63ced2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2853,6 +2853,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
 	return arr.mods_cnt;
 }
 
+static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
+{
+	u32 i;
+
+	for (i = 0; i < cnt; i++) {
+		if (!within_error_injection_list(addrs[i]))
+			return -EINVAL;
+	}
+	return 0;
+}
+
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_kprobe_multi_link *link = NULL;
@@ -2930,6 +2941,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 			goto error;
 	}
 
+	if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
+		err = -EINVAL;
+		goto error;
+	}
+
 	link = kzalloc(sizeof(*link), GFP_KERNEL);
 	if (!link) {
 		err = -ENOMEM;
-- 
cgit v1.2.3


From 95a404bd60af6c4d9d8db01ad14fe8957ece31ca Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 7 Sep 2023 12:28:20 -0400
Subject: ring-buffer: Do not attempt to read past "commit"

When iterating over the ring buffer while the ring buffer is active, the
writer can corrupt the reader. There's barriers to help detect this and
handle it, but that code missed the case where the last event was at the
very end of the page and has only 4 bytes left.

The checks to detect the corruption by the writer to reads needs to see the
length of the event. If the length in the first 4 bytes is zero then the
length is stored in the second 4 bytes. But if the writer is in the process
of updating that code, there's a small window where the length in the first
4 bytes could be zero even though the length is only 4 bytes. That will
cause rb_event_length() to read the next 4 bytes which could happen to be off the
allocated page.

To protect against this, fail immediately if the next event pointer is
less than 8 bytes from the end of the commit (last byte of data), as all
events must be a minimum of 8 bytes anyway.

Link: https://lore.kernel.org/all/20230905141245.26470-1-Tze-nan.Wu@mediatek.com/
Link: https://lore.kernel.org/linux-trace-kernel/20230907122820.0899019c@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Reported-by: Tze-nan Wu <Tze-nan.Wu@mediatek.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 72ccf75defd0..a1651edc48d5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2390,6 +2390,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 */
 	commit = rb_page_commit(iter_head_page);
 	smp_rmb();
+
+	/* An event needs to be at least 8 bytes in size */
+	if (iter->head > commit - 8)
+		goto reset;
+
 	event = __rb_page_index(iter_head_page, iter->head);
 	length = rb_event_length(event);
 
-- 
cgit v1.2.3


From 1ef26d8b2ca5d8715563c951083cf6c385c77d1f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 7 Sep 2023 22:19:11 -0400
Subject: tracing: Use the new eventfs descriptor for print trigger

The check to create the print event "trigger" was using the obsolete "dir"
value of the trace_event_file to determine if it should create the trigger
or not. But that value will now be NULL because it uses the event file
descriptor.

Change it to test the "ef" field of the trace_event_file structure so that
the trace_marker "trigger" file appears again.

Link: https://lkml.kernel.org/r/20230908022001.371815239@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Fixes: 27152bceea1df ("eventfs: Move tracing/events to eventfs")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0608ad20cf30..122c23c9eb28 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9792,8 +9792,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 			  tr, &tracing_mark_fops);
 
 	file = __find_event_file(tr, "ftrace", "print");
-	if (file && file->dir)
-		trace_create_file("trigger", TRACE_MODE_WRITE, file->dir,
+	if (file && file->ef)
+		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
 				  file, &event_trigger_fops);
 	tr->trace_marker_file = file;
 
-- 
cgit v1.2.3


From 6fdac58c560e4d164eb8161987bee045147cabe4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 7 Sep 2023 22:19:12 -0400
Subject: tracing: Remove unused trace_event_file dir field

Now that eventfs structure is used to create the events directory via the
eventfs dynamically allocate code, the "dir" field of the trace_event_file
structure is no longer used. Remove it.

Link: https://lkml.kernel.org/r/20230908022001.580400115@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  1 -
 kernel/trace/trace_events.c  | 13 -------------
 2 files changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index eb5c3add939b..12f875e9e69a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -650,7 +650,6 @@ struct trace_event_file {
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
 	struct eventfs_file             *ef;
-	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2af92177b765..065c63991858 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -992,19 +992,6 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 
 static void remove_event_file_dir(struct trace_event_file *file)
 {
-	struct dentry *dir = file->dir;
-	struct dentry *child;
-
-	if (dir) {
-		spin_lock(&dir->d_lock);	/* probably unneeded */
-		list_for_each_entry(child, &dir->d_subdirs, d_child) {
-			if (d_really_is_positive(child))	/* probably unneeded */
-				d_inode(child)->i_private = NULL;
-		}
-		spin_unlock(&dir->d_lock);
-
-		tracefs_remove(dir);
-	}
 	eventfs_remove(file->ef);
 	list_del(&file->list);
 	remove_subsystem(file->system);
-- 
cgit v1.2.3


From cf8e8658100d4eae80ce9b21f7a81cb024dd5057 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Thu, 20 Oct 2022 15:54:33 +0200
Subject: arch: Remove Itanium (IA-64) architecture

The Itanium architecture is obsolete, and an informal survey [0] reveals
that any residual use of Itanium hardware in production is mostly HP-UX
or OpenVMS based. The use of Linux on Itanium appears to be limited to
enthusiasts that occasionally boot a fresh Linux kernel to see whether
things are still working as intended, and perhaps to churn out some
distro packages that are rarely used in practice.

None of the original companies behind Itanium still produce or support
any hardware or software for the architecture, and it is listed as
'Orphaned' in the MAINTAINERS file, as apparently, none of the engineers
that contributed on behalf of those companies (nor anyone else, for that
matter) have been willing to support or maintain the architecture
upstream or even be responsible for applying the odd fix. The Intel
firmware team removed all IA-64 support from the Tianocore/EDK2
reference implementation of EFI in 2018. (Itanium is the original
architecture for which EFI was developed, and the way Linux supports it
deviates significantly from other architectures.) Some distros, such as
Debian and Gentoo, still maintain [unofficial] ia64 ports, but many have
dropped support years ago.

While the argument is being made [1] that there is a 'for the common
good' angle to being able to build and run existing projects such as the
Grid Community Toolkit [2] on Itanium for interoperability testing, the
fact remains that none of those projects are known to be deployed on
Linux/ia64, and very few people actually have access to such a system in
the first place. Even if there were ways imaginable in which Linux/ia64
could be put to good use today, what matters is whether anyone is
actually doing that, and this does not appear to be the case.

There are no emulators widely available, and so boot testing Itanium is
generally infeasible for ordinary contributors. GCC still supports IA-64
but its compile farm [3] no longer has any IA-64 machines. GLIBC would
like to get rid of IA-64 [4] too because it would permit some overdue
code cleanups. In summary, the benefits to the ecosystem of having IA-64
be part of it are mostly theoretical, whereas the maintenance overhead
of keeping it supported is real.

So let's rip off the band aid, and remove the IA-64 arch code entirely.
This follows the timeline proposed by the Debian/ia64 maintainer [5],
which removes support in a controlled manner, leaving IA-64 in a known
good state in the most recent LTS release. Other projects will follow
once the kernel support is removed.

[0] https://lore.kernel.org/all/CAMj1kXFCMh_578jniKpUtx_j8ByHnt=s7S+yQ+vGbKt9ud7+kQ@mail.gmail.com/
[1] https://lore.kernel.org/all/0075883c-7c51-00f5-2c2d-5119c1820410@web.de/
[2] https://gridcf.org/gct-docs/latest/index.html
[3] https://cfarm.tetaneutral.net/machines/list/
[4] https://lore.kernel.org/all/87bkiilpc4.fsf@mid.deneb.enyo.de/
[5] https://lore.kernel.org/all/ff58a3e76e5102c94bb5946d99187b358def688a.camel@physik.fu-berlin.de/

Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 Documentation/arch/ia64/aliasing.rst               |  246 ---
 Documentation/arch/ia64/efirtc.rst                 |  144 --
 Documentation/arch/ia64/err_inject.rst             | 1067 ---------
 Documentation/arch/ia64/features.rst               |    3 -
 Documentation/arch/ia64/fsys.rst                   |  303 ---
 Documentation/arch/ia64/ia64.rst                   |   49 -
 Documentation/arch/ia64/index.rst                  |   19 -
 Documentation/arch/ia64/irq-redir.rst              |   80 -
 Documentation/arch/ia64/mca.rst                    |  198 --
 Documentation/arch/ia64/serial.rst                 |  165 --
 Documentation/core-api/cpu_hotplug.rst             |    6 -
 MAINTAINERS                                        |   11 -
 arch/Kconfig                                       |    1 -
 arch/ia64/Kbuild                                   |    3 -
 arch/ia64/Kconfig                                  |  394 ----
 arch/ia64/Kconfig.debug                            |   55 -
 arch/ia64/Makefile                                 |   82 -
 arch/ia64/configs/bigsur_defconfig                 |  102 -
 arch/ia64/configs/generic_defconfig                |  206 --
 arch/ia64/configs/gensparse_defconfig              |  184 --
 arch/ia64/configs/tiger_defconfig                  |  169 --
 arch/ia64/configs/zx1_defconfig                    |  148 --
 arch/ia64/hp/common/Makefile                       |   10 -
 arch/ia64/hp/common/aml_nfw.c                      |  232 --
 arch/ia64/hp/common/sba_iommu.c                    | 2155 ------------------
 arch/ia64/include/asm/Kbuild                       |    6 -
 arch/ia64/include/asm/acenv.h                      |   49 -
 arch/ia64/include/asm/acpi-ext.h                   |   17 -
 arch/ia64/include/asm/acpi.h                       |  110 -
 arch/ia64/include/asm/asm-offsets.h                |    1 -
 arch/ia64/include/asm/asm-prototypes.h             |   30 -
 arch/ia64/include/asm/asmmacro.h                   |  136 --
 arch/ia64/include/asm/atomic.h                     |  216 --
 arch/ia64/include/asm/barrier.h                    |   79 -
 arch/ia64/include/asm/bitops.h                     |  453 ----
 arch/ia64/include/asm/bug.h                        |   19 -
 arch/ia64/include/asm/cache.h                      |   30 -
 arch/ia64/include/asm/cacheflush.h                 |   39 -
 arch/ia64/include/asm/checksum.h                   |   63 -
 arch/ia64/include/asm/clocksource.h                |   11 -
 arch/ia64/include/asm/cmpxchg.h                    |   33 -
 arch/ia64/include/asm/cpu.h                        |   23 -
 arch/ia64/include/asm/cputime.h                    |   21 -
 arch/ia64/include/asm/current.h                    |   18 -
 arch/ia64/include/asm/cyclone.h                    |   16 -
 arch/ia64/include/asm/delay.h                      |   89 -
 arch/ia64/include/asm/device.h                     |   14 -
 arch/ia64/include/asm/div64.h                      |    1 -
 arch/ia64/include/asm/dma-mapping.h                |   16 -
 arch/ia64/include/asm/dma.h                        |   17 -
 arch/ia64/include/asm/dmi.h                        |   15 -
 arch/ia64/include/asm/early_ioremap.h              |   11 -
 arch/ia64/include/asm/efi.h                        |   13 -
 arch/ia64/include/asm/elf.h                        |  233 --
 arch/ia64/include/asm/emergency-restart.h          |    6 -
 arch/ia64/include/asm/esi.h                        |   30 -
 arch/ia64/include/asm/exception.h                  |   23 -
 arch/ia64/include/asm/extable.h                    |   12 -
 arch/ia64/include/asm/fb.h                         |   43 -
 arch/ia64/include/asm/fpswa.h                      |   74 -
 arch/ia64/include/asm/ftrace.h                     |   28 -
 arch/ia64/include/asm/futex.h                      |  109 -
 arch/ia64/include/asm/gcc_intrin.h                 |   13 -
 arch/ia64/include/asm/hardirq.h                    |   27 -
 arch/ia64/include/asm/hugetlb.h                    |   34 -
 arch/ia64/include/asm/hw_irq.h                     |  167 --
 arch/ia64/include/asm/idle.h                       |    8 -
 arch/ia64/include/asm/intrinsics.h                 |   13 -
 arch/ia64/include/asm/io.h                         |  271 ---
 arch/ia64/include/asm/iommu.h                      |   22 -
 arch/ia64/include/asm/iosapic.h                    |  106 -
 arch/ia64/include/asm/irq.h                        |   37 -
 arch/ia64/include/asm/irq_regs.h                   |    1 -
 arch/ia64/include/asm/irq_remapping.h              |    5 -
 arch/ia64/include/asm/irqflags.h                   |   95 -
 arch/ia64/include/asm/kdebug.h                     |   45 -
 arch/ia64/include/asm/kexec.h                      |   46 -
 arch/ia64/include/asm/kprobes.h                    |  116 -
 arch/ia64/include/asm/kregs.h                      |  166 --
 arch/ia64/include/asm/libata-portmap.h             |    9 -
 arch/ia64/include/asm/linkage.h                    |   19 -
 arch/ia64/include/asm/local.h                      |    1 -
 arch/ia64/include/asm/mca.h                        |  185 --
 arch/ia64/include/asm/mca_asm.h                    |  245 ---
 arch/ia64/include/asm/meminit.h                    |   59 -
 arch/ia64/include/asm/mman.h                       |   18 -
 arch/ia64/include/asm/mmiowb.h                     |   17 -
 arch/ia64/include/asm/mmu.h                        |   14 -
 arch/ia64/include/asm/mmu_context.h                |  194 --
 arch/ia64/include/asm/mmzone.h                     |   35 -
 arch/ia64/include/asm/module.h                     |   35 -
 arch/ia64/include/asm/module.lds.h                 |   14 -
 arch/ia64/include/asm/msidef.h                     |   43 -
 arch/ia64/include/asm/native/inst.h                |  119 -
 arch/ia64/include/asm/native/irq.h                 |   20 -
 arch/ia64/include/asm/native/patchlist.h           |   24 -
 arch/ia64/include/asm/nodedata.h                   |   63 -
 arch/ia64/include/asm/numa.h                       |   83 -
 arch/ia64/include/asm/page.h                       |  208 --
 arch/ia64/include/asm/pal.h                        | 1827 ---------------
 arch/ia64/include/asm/param.h                      |   18 -
 arch/ia64/include/asm/parport.h                    |   20 -
 arch/ia64/include/asm/patch.h                      |   28 -
 arch/ia64/include/asm/pci.h                        |   66 -
 arch/ia64/include/asm/percpu.h                     |   53 -
 arch/ia64/include/asm/pgalloc.h                    |   64 -
 arch/ia64/include/asm/pgtable.h                    |  545 -----
 arch/ia64/include/asm/processor.h                  |  660 ------
 arch/ia64/include/asm/ptrace.h                     |  146 --
 arch/ia64/include/asm/sal.h                        |  919 --------
 arch/ia64/include/asm/sections.h                   |   33 -
 arch/ia64/include/asm/serial.h                     |   17 -
 arch/ia64/include/asm/shmparam.h                   |   13 -
 arch/ia64/include/asm/signal.h                     |   33 -
 arch/ia64/include/asm/smp.h                        |  103 -
 arch/ia64/include/asm/sn/intr.h                    |   15 -
 arch/ia64/include/asm/sn/sn_sal.h                  |  124 --
 arch/ia64/include/asm/sparsemem.h                  |   28 -
 arch/ia64/include/asm/spinlock.h                   |  265 ---
 arch/ia64/include/asm/spinlock_types.h             |   22 -
 arch/ia64/include/asm/string.h                     |   22 -
 arch/ia64/include/asm/switch_to.h                  |   71 -
 arch/ia64/include/asm/syscall.h                    |   65 -
 arch/ia64/include/asm/thread_info.h                |  131 --
 arch/ia64/include/asm/timex.h                      |   47 -
 arch/ia64/include/asm/tlb.h                        |   50 -
 arch/ia64/include/asm/tlbflush.h                   |  128 --
 arch/ia64/include/asm/topology.h                   |   56 -
 arch/ia64/include/asm/types.h                      |   32 -
 arch/ia64/include/asm/uaccess.h                    |  265 ---
 arch/ia64/include/asm/uncached.h                   |    9 -
 arch/ia64/include/asm/unistd.h                     |   38 -
 arch/ia64/include/asm/unwind.h                     |  234 --
 arch/ia64/include/asm/user.h                       |   53 -
 arch/ia64/include/asm/ustack.h                     |   12 -
 arch/ia64/include/asm/uv/uv.h                      |   30 -
 arch/ia64/include/asm/uv/uv_hub.h                  |  315 ---
 arch/ia64/include/asm/uv/uv_mmrs.h                 |  825 -------
 arch/ia64/include/asm/vermagic.h                   |   15 -
 arch/ia64/include/asm/vga.h                        |   26 -
 arch/ia64/include/asm/vmalloc.h                    |    4 -
 arch/ia64/include/asm/xor.h                        |   30 -
 arch/ia64/include/asm/xtp.h                        |   46 -
 arch/ia64/include/uapi/asm/Kbuild                  |    2 -
 arch/ia64/include/uapi/asm/auxvec.h                |   14 -
 arch/ia64/include/uapi/asm/bitsperlong.h           |    9 -
 arch/ia64/include/uapi/asm/break.h                 |   23 -
 arch/ia64/include/uapi/asm/byteorder.h             |    7 -
 arch/ia64/include/uapi/asm/cmpxchg.h               |  138 --
 arch/ia64/include/uapi/asm/fcntl.h                 |   15 -
 arch/ia64/include/uapi/asm/fpu.h                   |   67 -
 arch/ia64/include/uapi/asm/gcc_intrin.h            |  619 ------
 arch/ia64/include/uapi/asm/ia64regs.h              |  101 -
 arch/ia64/include/uapi/asm/intrinsics.h            |   82 -
 arch/ia64/include/uapi/asm/mman.h                  |   17 -
 arch/ia64/include/uapi/asm/param.h                 |   30 -
 arch/ia64/include/uapi/asm/posix_types.h           |    9 -
 arch/ia64/include/uapi/asm/ptrace.h                |  248 ---
 arch/ia64/include/uapi/asm/ptrace_offsets.h        |  269 ---
 arch/ia64/include/uapi/asm/resource.h              |    8 -
 arch/ia64/include/uapi/asm/rse.h                   |   67 -
 arch/ia64/include/uapi/asm/setup.h                 |   25 -
 arch/ia64/include/uapi/asm/sigcontext.h            |   71 -
 arch/ia64/include/uapi/asm/siginfo.h               |   28 -
 arch/ia64/include/uapi/asm/signal.h                |   98 -
 arch/ia64/include/uapi/asm/stat.h                  |   52 -
 arch/ia64/include/uapi/asm/statfs.h                |   21 -
 arch/ia64/include/uapi/asm/swab.h                  |   35 -
 arch/ia64/include/uapi/asm/types.h                 |   32 -
 arch/ia64/include/uapi/asm/ucontext.h              |   13 -
 arch/ia64/include/uapi/asm/unistd.h                |   22 -
 arch/ia64/include/uapi/asm/ustack.h                |   13 -
 arch/ia64/install.sh                               |   30 -
 arch/ia64/kernel/.gitignore                        |    3 -
 arch/ia64/kernel/Makefile                          |   46 -
 arch/ia64/kernel/Makefile.gate                     |   29 -
 arch/ia64/kernel/acpi-ext.c                        |  101 -
 arch/ia64/kernel/acpi.c                            |  913 --------
 arch/ia64/kernel/asm-offsets.c                     |  289 ---
 arch/ia64/kernel/audit.c                           |   63 -
 arch/ia64/kernel/brl_emu.c                         |  217 --
 arch/ia64/kernel/crash.c                           |  257 ---
 arch/ia64/kernel/crash_dump.c                      |   27 -
 arch/ia64/kernel/cyclone.c                         |  125 --
 arch/ia64/kernel/dma-mapping.c                     |    9 -
 arch/ia64/kernel/efi.c                             | 1360 ------------
 arch/ia64/kernel/efi_stub.S                        |   87 -
 arch/ia64/kernel/elfcore.c                         |   77 -
 arch/ia64/kernel/entry.S                           | 1427 ------------
 arch/ia64/kernel/entry.h                           |   83 -
 arch/ia64/kernel/err_inject.c                      |  273 ---
 arch/ia64/kernel/esi.c                             |  193 --
 arch/ia64/kernel/esi_stub.S                        |   99 -
 arch/ia64/kernel/fsys.S                            |  837 -------
 arch/ia64/kernel/fsyscall_gtod_data.h              |   30 -
 arch/ia64/kernel/ftrace.c                          |  196 --
 arch/ia64/kernel/gate-data.S                       |    3 -
 arch/ia64/kernel/gate.S                            |  380 ----
 arch/ia64/kernel/gate.lds.S                        |  108 -
 arch/ia64/kernel/head.S                            | 1167 ----------
 arch/ia64/kernel/iosapic.c                         | 1137 ----------
 arch/ia64/kernel/irq.c                             |  181 --
 arch/ia64/kernel/irq.h                             |    3 -
 arch/ia64/kernel/irq_ia64.c                        |  645 ------
 arch/ia64/kernel/irq_lsapic.c                      |   45 -
 arch/ia64/kernel/ivt.S                             | 1688 --------------
 arch/ia64/kernel/kprobes.c                         |  911 --------
 arch/ia64/kernel/machine_kexec.c                   |  163 --
 arch/ia64/kernel/mca.c                             | 2111 ------------------
 arch/ia64/kernel/mca_asm.S                         | 1123 ----------
 arch/ia64/kernel/mca_drv.c                         |  796 -------
 arch/ia64/kernel/mca_drv.h                         |  123 --
 arch/ia64/kernel/mca_drv_asm.S                     |   56 -
 arch/ia64/kernel/minstate.h                        |  251 ---
 arch/ia64/kernel/module.c                          |  959 --------
 arch/ia64/kernel/msi_ia64.c                        |  198 --
 arch/ia64/kernel/numa.c                            |   73 -
 arch/ia64/kernel/pal.S                             |  306 ---
 arch/ia64/kernel/palinfo.c                         |  942 --------
 arch/ia64/kernel/patch.c                           |  237 --
 arch/ia64/kernel/pci-dma.c                         |   33 -
 arch/ia64/kernel/perfmon_itanium.h                 |  116 -
 arch/ia64/kernel/process.c                         |  611 ------
 arch/ia64/kernel/ptrace.c                          | 2012 -----------------
 arch/ia64/kernel/relocate_kernel.S                 |  321 ---
 arch/ia64/kernel/sal.c                             |  400 ----
 arch/ia64/kernel/salinfo.c                         |  646 ------
 arch/ia64/kernel/setup.c                           | 1081 ---------
 arch/ia64/kernel/sigframe.h                        |   26 -
 arch/ia64/kernel/signal.c                          |  412 ----
 arch/ia64/kernel/smp.c                             |  335 ---
 arch/ia64/kernel/smpboot.c                         |  839 -------
 arch/ia64/kernel/stacktrace.c                      |   40 -
 arch/ia64/kernel/sys_ia64.c                        |  197 --
 arch/ia64/kernel/syscalls/Makefile                 |   32 -
 arch/ia64/kernel/syscalls/syscall.tbl              |  375 ----
 arch/ia64/kernel/time.c                            |  463 ----
 arch/ia64/kernel/topology.c                        |  410 ----
 arch/ia64/kernel/traps.c                           |  612 ------
 arch/ia64/kernel/unaligned.c                       | 1560 -------------
 arch/ia64/kernel/uncached.c                        |  273 ---
 arch/ia64/kernel/unwind.c                          | 2320 --------------------
 arch/ia64/kernel/unwind_decoder.c                  |  460 ----
 arch/ia64/kernel/unwind_i.h                        |  165 --
 arch/ia64/kernel/vmlinux.lds.S                     |  224 --
 arch/ia64/lib/Makefile                             |   48 -
 arch/ia64/lib/checksum.c                           |  102 -
 arch/ia64/lib/clear_page.S                         |   79 -
 arch/ia64/lib/clear_user.S                         |  212 --
 arch/ia64/lib/copy_page.S                          |  101 -
 arch/ia64/lib/copy_page_mck.S                      |  188 --
 arch/ia64/lib/copy_user.S                          |  613 ------
 arch/ia64/lib/csum_partial_copy.c                  |   98 -
 arch/ia64/lib/do_csum.S                            |  324 ---
 arch/ia64/lib/flush.S                              |  119 -
 arch/ia64/lib/idiv32.S                             |   86 -
 arch/ia64/lib/idiv64.S                             |   83 -
 arch/ia64/lib/io.c                                 |   51 -
 arch/ia64/lib/ip_fast_csum.S                       |  148 --
 arch/ia64/lib/memcpy.S                             |  304 ---
 arch/ia64/lib/memcpy_mck.S                         |  659 ------
 arch/ia64/lib/memset.S                             |  365 ---
 arch/ia64/lib/strlen.S                             |  195 --
 arch/ia64/lib/strncpy_from_user.S                  |   47 -
 arch/ia64/lib/strnlen_user.S                       |   48 -
 arch/ia64/lib/xor.S                                |  181 --
 arch/ia64/mm/Makefile                              |   11 -
 arch/ia64/mm/contig.c                              |  208 --
 arch/ia64/mm/discontig.c                           |  635 ------
 arch/ia64/mm/extable.c                             |   24 -
 arch/ia64/mm/fault.c                               |  251 ---
 arch/ia64/mm/hugetlbpage.c                         |  186 --
 arch/ia64/mm/init.c                                |  532 -----
 arch/ia64/mm/ioremap.c                             |   94 -
 arch/ia64/mm/numa.c                                |   80 -
 arch/ia64/mm/tlb.c                                 |  591 -----
 arch/ia64/pci/Makefile                             |    5 -
 arch/ia64/pci/fixup.c                              |   80 -
 arch/ia64/pci/pci.c                                |  576 -----
 arch/ia64/scripts/check-gas                        |   16 -
 arch/ia64/scripts/check-gas-asm.S                  |    2 -
 arch/ia64/scripts/check-model.c                    |    1 -
 arch/ia64/scripts/check-segrel.S                   |    5 -
 arch/ia64/scripts/check-segrel.lds                 |   13 -
 arch/ia64/scripts/check-serialize.S                |    2 -
 arch/ia64/scripts/check-text-align.S               |    7 -
 arch/ia64/scripts/toolchain-flags                  |   54 -
 arch/ia64/scripts/unwcheck.py                      |   65 -
 arch/ia64/uv/Makefile                              |   12 -
 arch/ia64/uv/kernel/Makefile                       |   12 -
 arch/ia64/uv/kernel/setup.c                        |  120 -
 drivers/acpi/Kconfig                               |    6 +-
 drivers/acpi/numa/Kconfig                          |    4 +-
 drivers/acpi/osl.c                                 |    2 +-
 drivers/char/Kconfig                               |    4 +-
 drivers/char/Makefile                              |    1 -
 drivers/char/agp/Kconfig                           |   16 +-
 drivers/char/agp/Makefile                          |    2 -
 drivers/char/agp/hp-agp.c                          |  550 -----
 drivers/char/agp/i460-agp.c                        |  659 ------
 drivers/char/hpet.c                                |   30 -
 drivers/char/hw_random/Kconfig                     |    2 +-
 drivers/char/mem.c                                 |   12 -
 drivers/char/mspec.c                               |  295 ---
 drivers/cpufreq/Kconfig                            |   11 -
 drivers/cpufreq/Makefile                           |    1 -
 drivers/cpufreq/ia64-acpi-cpufreq.c                |  353 ---
 drivers/firmware/Kconfig                           |   24 -
 drivers/firmware/Makefile                          |    1 -
 drivers/firmware/efi/Kconfig                       |    6 +-
 drivers/firmware/efi/efi.c                         |   13 +-
 drivers/firmware/pcdp.c                            |  135 --
 drivers/firmware/pcdp.h                            |  108 -
 drivers/gpu/drm/drm_ioc32.c                        |    4 +-
 drivers/input/serio/i8042.h                        |    2 +-
 drivers/iommu/Kconfig                              |    4 +-
 drivers/iommu/intel/Kconfig                        |    2 +-
 drivers/media/cec/platform/Kconfig                 |    2 +-
 drivers/misc/Kconfig                               |    2 +-
 drivers/misc/sgi-gru/gru.h                         |    4 +-
 drivers/misc/sgi-gru/gru_instructions.h            |   12 +-
 drivers/misc/sgi-gru/grufile.c                     |   72 -
 drivers/misc/sgi-gru/gruhandles.c                  |    6 -
 drivers/misc/sgi-gru/grumain.c                     |    4 -
 drivers/misc/sgi-xp/xp.h                           |    2 +-
 drivers/misc/sgi-xp/xp_uv.c                        |   24 -
 drivers/misc/sgi-xp/xpc_main.c                     |   31 -
 drivers/misc/sgi-xp/xpc_uv.c                       |   85 -
 drivers/net/ethernet/broadcom/tg3.c                |    2 +-
 drivers/net/ethernet/brocade/bna/bnad.h            |    1 -
 .../net/ethernet/qlogic/netxen/netxen_nic_main.c   |    2 -
 drivers/pci/vgaarb.c                               |    2 +-
 drivers/tty/serial/8250/Kconfig                    |    2 +-
 drivers/tty/vt/keyboard.c                          |    2 +-
 drivers/video/fbdev/Kconfig                        |    2 +-
 drivers/watchdog/Kconfig                           |    2 +-
 fs/Kconfig                                         |    2 +-
 fs/afs/main.c                                      |    2 -
 fs/xfs/xfs_ioctl32.h                               |    2 +-
 include/linux/acpi.h                               |    9 +-
 include/linux/efi.h                                |    7 -
 include/linux/mm.h                                 |    2 -
 include/linux/moduleparam.h                        |    2 +-
 include/trace/events/mmflags.h                     |    2 +-
 init/Kconfig                                       |    2 +-
 kernel/cpu.c                                       |    3 -
 kernel/fork.c                                      |    2 +-
 kernel/sched/core.c                                |   29 +-
 kernel/sysctl.c                                    |    9 -
 lib/Kconfig.debug                                  |    2 +-
 lib/decompress_unxz.c                              |    3 -
 lib/xz/Kconfig                                     |    5 -
 mm/mmap.c                                          |    6 +-
 tools/arch/ia64/include/asm/barrier.h              |   59 -
 tools/arch/ia64/include/uapi/asm/bitsperlong.h     |    9 -
 tools/arch/ia64/include/uapi/asm/mman.h            |    7 -
 usr/include/Makefile                               |    6 -
 357 files changed, 45 insertions(+), 64955 deletions(-)
 delete mode 100644 Documentation/arch/ia64/aliasing.rst
 delete mode 100644 Documentation/arch/ia64/efirtc.rst
 delete mode 100644 Documentation/arch/ia64/err_inject.rst
 delete mode 100644 Documentation/arch/ia64/features.rst
 delete mode 100644 Documentation/arch/ia64/fsys.rst
 delete mode 100644 Documentation/arch/ia64/ia64.rst
 delete mode 100644 Documentation/arch/ia64/index.rst
 delete mode 100644 Documentation/arch/ia64/irq-redir.rst
 delete mode 100644 Documentation/arch/ia64/mca.rst
 delete mode 100644 Documentation/arch/ia64/serial.rst
 delete mode 100644 arch/ia64/Kbuild
 delete mode 100644 arch/ia64/Kconfig
 delete mode 100644 arch/ia64/Kconfig.debug
 delete mode 100644 arch/ia64/Makefile
 delete mode 100644 arch/ia64/configs/bigsur_defconfig
 delete mode 100644 arch/ia64/configs/generic_defconfig
 delete mode 100644 arch/ia64/configs/gensparse_defconfig
 delete mode 100644 arch/ia64/configs/tiger_defconfig
 delete mode 100644 arch/ia64/configs/zx1_defconfig
 delete mode 100644 arch/ia64/hp/common/Makefile
 delete mode 100644 arch/ia64/hp/common/aml_nfw.c
 delete mode 100644 arch/ia64/hp/common/sba_iommu.c
 delete mode 100644 arch/ia64/include/asm/Kbuild
 delete mode 100644 arch/ia64/include/asm/acenv.h
 delete mode 100644 arch/ia64/include/asm/acpi-ext.h
 delete mode 100644 arch/ia64/include/asm/acpi.h
 delete mode 100644 arch/ia64/include/asm/asm-offsets.h
 delete mode 100644 arch/ia64/include/asm/asm-prototypes.h
 delete mode 100644 arch/ia64/include/asm/asmmacro.h
 delete mode 100644 arch/ia64/include/asm/atomic.h
 delete mode 100644 arch/ia64/include/asm/barrier.h
 delete mode 100644 arch/ia64/include/asm/bitops.h
 delete mode 100644 arch/ia64/include/asm/bug.h
 delete mode 100644 arch/ia64/include/asm/cache.h
 delete mode 100644 arch/ia64/include/asm/cacheflush.h
 delete mode 100644 arch/ia64/include/asm/checksum.h
 delete mode 100644 arch/ia64/include/asm/clocksource.h
 delete mode 100644 arch/ia64/include/asm/cmpxchg.h
 delete mode 100644 arch/ia64/include/asm/cpu.h
 delete mode 100644 arch/ia64/include/asm/cputime.h
 delete mode 100644 arch/ia64/include/asm/current.h
 delete mode 100644 arch/ia64/include/asm/cyclone.h
 delete mode 100644 arch/ia64/include/asm/delay.h
 delete mode 100644 arch/ia64/include/asm/device.h
 delete mode 100644 arch/ia64/include/asm/div64.h
 delete mode 100644 arch/ia64/include/asm/dma-mapping.h
 delete mode 100644 arch/ia64/include/asm/dma.h
 delete mode 100644 arch/ia64/include/asm/dmi.h
 delete mode 100644 arch/ia64/include/asm/early_ioremap.h
 delete mode 100644 arch/ia64/include/asm/efi.h
 delete mode 100644 arch/ia64/include/asm/elf.h
 delete mode 100644 arch/ia64/include/asm/emergency-restart.h
 delete mode 100644 arch/ia64/include/asm/esi.h
 delete mode 100644 arch/ia64/include/asm/exception.h
 delete mode 100644 arch/ia64/include/asm/extable.h
 delete mode 100644 arch/ia64/include/asm/fb.h
 delete mode 100644 arch/ia64/include/asm/fpswa.h
 delete mode 100644 arch/ia64/include/asm/ftrace.h
 delete mode 100644 arch/ia64/include/asm/futex.h
 delete mode 100644 arch/ia64/include/asm/gcc_intrin.h
 delete mode 100644 arch/ia64/include/asm/hardirq.h
 delete mode 100644 arch/ia64/include/asm/hugetlb.h
 delete mode 100644 arch/ia64/include/asm/hw_irq.h
 delete mode 100644 arch/ia64/include/asm/idle.h
 delete mode 100644 arch/ia64/include/asm/intrinsics.h
 delete mode 100644 arch/ia64/include/asm/io.h
 delete mode 100644 arch/ia64/include/asm/iommu.h
 delete mode 100644 arch/ia64/include/asm/iosapic.h
 delete mode 100644 arch/ia64/include/asm/irq.h
 delete mode 100644 arch/ia64/include/asm/irq_regs.h
 delete mode 100644 arch/ia64/include/asm/irq_remapping.h
 delete mode 100644 arch/ia64/include/asm/irqflags.h
 delete mode 100644 arch/ia64/include/asm/kdebug.h
 delete mode 100644 arch/ia64/include/asm/kexec.h
 delete mode 100644 arch/ia64/include/asm/kprobes.h
 delete mode 100644 arch/ia64/include/asm/kregs.h
 delete mode 100644 arch/ia64/include/asm/libata-portmap.h
 delete mode 100644 arch/ia64/include/asm/linkage.h
 delete mode 100644 arch/ia64/include/asm/local.h
 delete mode 100644 arch/ia64/include/asm/mca.h
 delete mode 100644 arch/ia64/include/asm/mca_asm.h
 delete mode 100644 arch/ia64/include/asm/meminit.h
 delete mode 100644 arch/ia64/include/asm/mman.h
 delete mode 100644 arch/ia64/include/asm/mmiowb.h
 delete mode 100644 arch/ia64/include/asm/mmu.h
 delete mode 100644 arch/ia64/include/asm/mmu_context.h
 delete mode 100644 arch/ia64/include/asm/mmzone.h
 delete mode 100644 arch/ia64/include/asm/module.h
 delete mode 100644 arch/ia64/include/asm/module.lds.h
 delete mode 100644 arch/ia64/include/asm/msidef.h
 delete mode 100644 arch/ia64/include/asm/native/inst.h
 delete mode 100644 arch/ia64/include/asm/native/irq.h
 delete mode 100644 arch/ia64/include/asm/native/patchlist.h
 delete mode 100644 arch/ia64/include/asm/nodedata.h
 delete mode 100644 arch/ia64/include/asm/numa.h
 delete mode 100644 arch/ia64/include/asm/page.h
 delete mode 100644 arch/ia64/include/asm/pal.h
 delete mode 100644 arch/ia64/include/asm/param.h
 delete mode 100644 arch/ia64/include/asm/parport.h
 delete mode 100644 arch/ia64/include/asm/patch.h
 delete mode 100644 arch/ia64/include/asm/pci.h
 delete mode 100644 arch/ia64/include/asm/percpu.h
 delete mode 100644 arch/ia64/include/asm/pgalloc.h
 delete mode 100644 arch/ia64/include/asm/pgtable.h
 delete mode 100644 arch/ia64/include/asm/processor.h
 delete mode 100644 arch/ia64/include/asm/ptrace.h
 delete mode 100644 arch/ia64/include/asm/sal.h
 delete mode 100644 arch/ia64/include/asm/sections.h
 delete mode 100644 arch/ia64/include/asm/serial.h
 delete mode 100644 arch/ia64/include/asm/shmparam.h
 delete mode 100644 arch/ia64/include/asm/signal.h
 delete mode 100644 arch/ia64/include/asm/smp.h
 delete mode 100644 arch/ia64/include/asm/sn/intr.h
 delete mode 100644 arch/ia64/include/asm/sn/sn_sal.h
 delete mode 100644 arch/ia64/include/asm/sparsemem.h
 delete mode 100644 arch/ia64/include/asm/spinlock.h
 delete mode 100644 arch/ia64/include/asm/spinlock_types.h
 delete mode 100644 arch/ia64/include/asm/string.h
 delete mode 100644 arch/ia64/include/asm/switch_to.h
 delete mode 100644 arch/ia64/include/asm/syscall.h
 delete mode 100644 arch/ia64/include/asm/thread_info.h
 delete mode 100644 arch/ia64/include/asm/timex.h
 delete mode 100644 arch/ia64/include/asm/tlb.h
 delete mode 100644 arch/ia64/include/asm/tlbflush.h
 delete mode 100644 arch/ia64/include/asm/topology.h
 delete mode 100644 arch/ia64/include/asm/types.h
 delete mode 100644 arch/ia64/include/asm/uaccess.h
 delete mode 100644 arch/ia64/include/asm/uncached.h
 delete mode 100644 arch/ia64/include/asm/unistd.h
 delete mode 100644 arch/ia64/include/asm/unwind.h
 delete mode 100644 arch/ia64/include/asm/user.h
 delete mode 100644 arch/ia64/include/asm/ustack.h
 delete mode 100644 arch/ia64/include/asm/uv/uv.h
 delete mode 100644 arch/ia64/include/asm/uv/uv_hub.h
 delete mode 100644 arch/ia64/include/asm/uv/uv_mmrs.h
 delete mode 100644 arch/ia64/include/asm/vermagic.h
 delete mode 100644 arch/ia64/include/asm/vga.h
 delete mode 100644 arch/ia64/include/asm/vmalloc.h
 delete mode 100644 arch/ia64/include/asm/xor.h
 delete mode 100644 arch/ia64/include/asm/xtp.h
 delete mode 100644 arch/ia64/include/uapi/asm/Kbuild
 delete mode 100644 arch/ia64/include/uapi/asm/auxvec.h
 delete mode 100644 arch/ia64/include/uapi/asm/bitsperlong.h
 delete mode 100644 arch/ia64/include/uapi/asm/break.h
 delete mode 100644 arch/ia64/include/uapi/asm/byteorder.h
 delete mode 100644 arch/ia64/include/uapi/asm/cmpxchg.h
 delete mode 100644 arch/ia64/include/uapi/asm/fcntl.h
 delete mode 100644 arch/ia64/include/uapi/asm/fpu.h
 delete mode 100644 arch/ia64/include/uapi/asm/gcc_intrin.h
 delete mode 100644 arch/ia64/include/uapi/asm/ia64regs.h
 delete mode 100644 arch/ia64/include/uapi/asm/intrinsics.h
 delete mode 100644 arch/ia64/include/uapi/asm/mman.h
 delete mode 100644 arch/ia64/include/uapi/asm/param.h
 delete mode 100644 arch/ia64/include/uapi/asm/posix_types.h
 delete mode 100644 arch/ia64/include/uapi/asm/ptrace.h
 delete mode 100644 arch/ia64/include/uapi/asm/ptrace_offsets.h
 delete mode 100644 arch/ia64/include/uapi/asm/resource.h
 delete mode 100644 arch/ia64/include/uapi/asm/rse.h
 delete mode 100644 arch/ia64/include/uapi/asm/setup.h
 delete mode 100644 arch/ia64/include/uapi/asm/sigcontext.h
 delete mode 100644 arch/ia64/include/uapi/asm/siginfo.h
 delete mode 100644 arch/ia64/include/uapi/asm/signal.h
 delete mode 100644 arch/ia64/include/uapi/asm/stat.h
 delete mode 100644 arch/ia64/include/uapi/asm/statfs.h
 delete mode 100644 arch/ia64/include/uapi/asm/swab.h
 delete mode 100644 arch/ia64/include/uapi/asm/types.h
 delete mode 100644 arch/ia64/include/uapi/asm/ucontext.h
 delete mode 100644 arch/ia64/include/uapi/asm/unistd.h
 delete mode 100644 arch/ia64/include/uapi/asm/ustack.h
 delete mode 100755 arch/ia64/install.sh
 delete mode 100644 arch/ia64/kernel/.gitignore
 delete mode 100644 arch/ia64/kernel/Makefile
 delete mode 100644 arch/ia64/kernel/Makefile.gate
 delete mode 100644 arch/ia64/kernel/acpi-ext.c
 delete mode 100644 arch/ia64/kernel/acpi.c
 delete mode 100644 arch/ia64/kernel/asm-offsets.c
 delete mode 100644 arch/ia64/kernel/audit.c
 delete mode 100644 arch/ia64/kernel/brl_emu.c
 delete mode 100644 arch/ia64/kernel/crash.c
 delete mode 100644 arch/ia64/kernel/crash_dump.c
 delete mode 100644 arch/ia64/kernel/cyclone.c
 delete mode 100644 arch/ia64/kernel/dma-mapping.c
 delete mode 100644 arch/ia64/kernel/efi.c
 delete mode 100644 arch/ia64/kernel/efi_stub.S
 delete mode 100644 arch/ia64/kernel/elfcore.c
 delete mode 100644 arch/ia64/kernel/entry.S
 delete mode 100644 arch/ia64/kernel/entry.h
 delete mode 100644 arch/ia64/kernel/err_inject.c
 delete mode 100644 arch/ia64/kernel/esi.c
 delete mode 100644 arch/ia64/kernel/esi_stub.S
 delete mode 100644 arch/ia64/kernel/fsys.S
 delete mode 100644 arch/ia64/kernel/fsyscall_gtod_data.h
 delete mode 100644 arch/ia64/kernel/ftrace.c
 delete mode 100644 arch/ia64/kernel/gate-data.S
 delete mode 100644 arch/ia64/kernel/gate.S
 delete mode 100644 arch/ia64/kernel/gate.lds.S
 delete mode 100644 arch/ia64/kernel/head.S
 delete mode 100644 arch/ia64/kernel/iosapic.c
 delete mode 100644 arch/ia64/kernel/irq.c
 delete mode 100644 arch/ia64/kernel/irq.h
 delete mode 100644 arch/ia64/kernel/irq_ia64.c
 delete mode 100644 arch/ia64/kernel/irq_lsapic.c
 delete mode 100644 arch/ia64/kernel/ivt.S
 delete mode 100644 arch/ia64/kernel/kprobes.c
 delete mode 100644 arch/ia64/kernel/machine_kexec.c
 delete mode 100644 arch/ia64/kernel/mca.c
 delete mode 100644 arch/ia64/kernel/mca_asm.S
 delete mode 100644 arch/ia64/kernel/mca_drv.c
 delete mode 100644 arch/ia64/kernel/mca_drv.h
 delete mode 100644 arch/ia64/kernel/mca_drv_asm.S
 delete mode 100644 arch/ia64/kernel/minstate.h
 delete mode 100644 arch/ia64/kernel/module.c
 delete mode 100644 arch/ia64/kernel/msi_ia64.c
 delete mode 100644 arch/ia64/kernel/numa.c
 delete mode 100644 arch/ia64/kernel/pal.S
 delete mode 100644 arch/ia64/kernel/palinfo.c
 delete mode 100644 arch/ia64/kernel/patch.c
 delete mode 100644 arch/ia64/kernel/pci-dma.c
 delete mode 100644 arch/ia64/kernel/perfmon_itanium.h
 delete mode 100644 arch/ia64/kernel/process.c
 delete mode 100644 arch/ia64/kernel/ptrace.c
 delete mode 100644 arch/ia64/kernel/relocate_kernel.S
 delete mode 100644 arch/ia64/kernel/sal.c
 delete mode 100644 arch/ia64/kernel/salinfo.c
 delete mode 100644 arch/ia64/kernel/setup.c
 delete mode 100644 arch/ia64/kernel/sigframe.h
 delete mode 100644 arch/ia64/kernel/signal.c
 delete mode 100644 arch/ia64/kernel/smp.c
 delete mode 100644 arch/ia64/kernel/smpboot.c
 delete mode 100644 arch/ia64/kernel/stacktrace.c
 delete mode 100644 arch/ia64/kernel/sys_ia64.c
 delete mode 100644 arch/ia64/kernel/syscalls/Makefile
 delete mode 100644 arch/ia64/kernel/syscalls/syscall.tbl
 delete mode 100644 arch/ia64/kernel/time.c
 delete mode 100644 arch/ia64/kernel/topology.c
 delete mode 100644 arch/ia64/kernel/traps.c
 delete mode 100644 arch/ia64/kernel/unaligned.c
 delete mode 100644 arch/ia64/kernel/uncached.c
 delete mode 100644 arch/ia64/kernel/unwind.c
 delete mode 100644 arch/ia64/kernel/unwind_decoder.c
 delete mode 100644 arch/ia64/kernel/unwind_i.h
 delete mode 100644 arch/ia64/kernel/vmlinux.lds.S
 delete mode 100644 arch/ia64/lib/Makefile
 delete mode 100644 arch/ia64/lib/checksum.c
 delete mode 100644 arch/ia64/lib/clear_page.S
 delete mode 100644 arch/ia64/lib/clear_user.S
 delete mode 100644 arch/ia64/lib/copy_page.S
 delete mode 100644 arch/ia64/lib/copy_page_mck.S
 delete mode 100644 arch/ia64/lib/copy_user.S
 delete mode 100644 arch/ia64/lib/csum_partial_copy.c
 delete mode 100644 arch/ia64/lib/do_csum.S
 delete mode 100644 arch/ia64/lib/flush.S
 delete mode 100644 arch/ia64/lib/idiv32.S
 delete mode 100644 arch/ia64/lib/idiv64.S
 delete mode 100644 arch/ia64/lib/io.c
 delete mode 100644 arch/ia64/lib/ip_fast_csum.S
 delete mode 100644 arch/ia64/lib/memcpy.S
 delete mode 100644 arch/ia64/lib/memcpy_mck.S
 delete mode 100644 arch/ia64/lib/memset.S
 delete mode 100644 arch/ia64/lib/strlen.S
 delete mode 100644 arch/ia64/lib/strncpy_from_user.S
 delete mode 100644 arch/ia64/lib/strnlen_user.S
 delete mode 100644 arch/ia64/lib/xor.S
 delete mode 100644 arch/ia64/mm/Makefile
 delete mode 100644 arch/ia64/mm/contig.c
 delete mode 100644 arch/ia64/mm/discontig.c
 delete mode 100644 arch/ia64/mm/extable.c
 delete mode 100644 arch/ia64/mm/fault.c
 delete mode 100644 arch/ia64/mm/hugetlbpage.c
 delete mode 100644 arch/ia64/mm/init.c
 delete mode 100644 arch/ia64/mm/ioremap.c
 delete mode 100644 arch/ia64/mm/numa.c
 delete mode 100644 arch/ia64/mm/tlb.c
 delete mode 100644 arch/ia64/pci/Makefile
 delete mode 100644 arch/ia64/pci/fixup.c
 delete mode 100644 arch/ia64/pci/pci.c
 delete mode 100755 arch/ia64/scripts/check-gas
 delete mode 100644 arch/ia64/scripts/check-gas-asm.S
 delete mode 100644 arch/ia64/scripts/check-model.c
 delete mode 100644 arch/ia64/scripts/check-segrel.S
 delete mode 100644 arch/ia64/scripts/check-segrel.lds
 delete mode 100644 arch/ia64/scripts/check-serialize.S
 delete mode 100644 arch/ia64/scripts/check-text-align.S
 delete mode 100755 arch/ia64/scripts/toolchain-flags
 delete mode 100644 arch/ia64/scripts/unwcheck.py
 delete mode 100644 arch/ia64/uv/Makefile
 delete mode 100644 arch/ia64/uv/kernel/Makefile
 delete mode 100644 arch/ia64/uv/kernel/setup.c
 delete mode 100644 drivers/char/agp/hp-agp.c
 delete mode 100644 drivers/char/agp/i460-agp.c
 delete mode 100644 drivers/char/mspec.c
 delete mode 100644 drivers/cpufreq/ia64-acpi-cpufreq.c
 delete mode 100644 drivers/firmware/pcdp.c
 delete mode 100644 drivers/firmware/pcdp.h
 delete mode 100644 tools/arch/ia64/include/asm/barrier.h
 delete mode 100644 tools/arch/ia64/include/uapi/asm/bitsperlong.h
 delete mode 100644 tools/arch/ia64/include/uapi/asm/mman.h

(limited to 'kernel')

diff --git a/Documentation/arch/ia64/aliasing.rst b/Documentation/arch/ia64/aliasing.rst
deleted file mode 100644
index 36a1e1d4842b..000000000000
--- a/Documentation/arch/ia64/aliasing.rst
+++ /dev/null
@@ -1,246 +0,0 @@
-==================================
-Memory Attribute Aliasing on IA-64
-==================================
-
-Bjorn Helgaas <bjorn.helgaas@hp.com>
-
-May 4, 2006
-
-
-Memory Attributes
-=================
-
-    Itanium supports several attributes for virtual memory references.
-    The attribute is part of the virtual translation, i.e., it is
-    contained in the TLB entry.  The ones of most interest to the Linux
-    kernel are:
-
-	==		======================
-        WB		Write-back (cacheable)
-	UC		Uncacheable
-	WC		Write-coalescing
-	==		======================
-
-    System memory typically uses the WB attribute.  The UC attribute is
-    used for memory-mapped I/O devices.  The WC attribute is uncacheable
-    like UC is, but writes may be delayed and combined to increase
-    performance for things like frame buffers.
-
-    The Itanium architecture requires that we avoid accessing the same
-    page with both a cacheable mapping and an uncacheable mapping[1].
-
-    The design of the chipset determines which attributes are supported
-    on which regions of the address space.  For example, some chipsets
-    support either WB or UC access to main memory, while others support
-    only WB access.
-
-Memory Map
-==========
-
-    Platform firmware describes the physical memory map and the
-    supported attributes for each region.  At boot-time, the kernel uses
-    the EFI GetMemoryMap() interface.  ACPI can also describe memory
-    devices and the attributes they support, but Linux/ia64 currently
-    doesn't use this information.
-
-    The kernel uses the efi_memmap table returned from GetMemoryMap() to
-    learn the attributes supported by each region of physical address
-    space.  Unfortunately, this table does not completely describe the
-    address space because some machines omit some or all of the MMIO
-    regions from the map.
-
-    The kernel maintains another table, kern_memmap, which describes the
-    memory Linux is actually using and the attribute for each region.
-    This contains only system memory; it does not contain MMIO space.
-
-    The kern_memmap table typically contains only a subset of the system
-    memory described by the efi_memmap.  Linux/ia64 can't use all memory
-    in the system because of constraints imposed by the identity mapping
-    scheme.
-
-    The efi_memmap table is preserved unmodified because the original
-    boot-time information is required for kexec.
-
-Kernel Identity Mappings
-========================
-
-    Linux/ia64 identity mappings are done with large pages, currently
-    either 16MB or 64MB, referred to as "granules."  Cacheable mappings
-    are speculative[2], so the processor can read any location in the
-    page at any time, independent of the programmer's intentions.  This
-    means that to avoid attribute aliasing, Linux can create a cacheable
-    identity mapping only when the entire granule supports cacheable
-    access.
-
-    Therefore, kern_memmap contains only full granule-sized regions that
-    can referenced safely by an identity mapping.
-
-    Uncacheable mappings are not speculative, so the processor will
-    generate UC accesses only to locations explicitly referenced by
-    software.  This allows UC identity mappings to cover granules that
-    are only partially populated, or populated with a combination of UC
-    and WB regions.
-
-User Mappings
-=============
-
-    User mappings are typically done with 16K or 64K pages.  The smaller
-    page size allows more flexibility because only 16K or 64K has to be
-    homogeneous with respect to memory attributes.
-
-Potential Attribute Aliasing Cases
-==================================
-
-    There are several ways the kernel creates new mappings:
-
-mmap of /dev/mem
-----------------
-
-	This uses remap_pfn_range(), which creates user mappings.  These
-	mappings may be either WB or UC.  If the region being mapped
-	happens to be in kern_memmap, meaning that it may also be mapped
-	by a kernel identity mapping, the user mapping must use the same
-	attribute as the kernel mapping.
-
-	If the region is not in kern_memmap, the user mapping should use
-	an attribute reported as being supported in the EFI memory map.
-
-	Since the EFI memory map does not describe MMIO on some
-	machines, this should use an uncacheable mapping as a fallback.
-
-mmap of /sys/class/pci_bus/.../legacy_mem
------------------------------------------
-
-	This is very similar to mmap of /dev/mem, except that legacy_mem
-	only allows mmap of the one megabyte "legacy MMIO" area for a
-	specific PCI bus.  Typically this is the first megabyte of
-	physical address space, but it may be different on machines with
-	several VGA devices.
-
-	"X" uses this to access VGA frame buffers.  Using legacy_mem
-	rather than /dev/mem allows multiple instances of X to talk to
-	different VGA cards.
-
-	The /dev/mem mmap constraints apply.
-
-mmap of /proc/bus/pci/.../??.?
-------------------------------
-
-	This is an MMIO mmap of PCI functions, which additionally may or
-	may not be requested as using the WC attribute.
-
-	If WC is requested, and the region in kern_memmap is either WC
-	or UC, and the EFI memory map designates the region as WC, then
-	the WC mapping is allowed.
-
-	Otherwise, the user mapping must use the same attribute as the
-	kernel mapping.
-
-read/write of /dev/mem
-----------------------
-
-	This uses copy_from_user(), which implicitly uses a kernel
-	identity mapping.  This is obviously safe for things in
-	kern_memmap.
-
-	There may be corner cases of things that are not in kern_memmap,
-	but could be accessed this way.  For example, registers in MMIO
-	space are not in kern_memmap, but could be accessed with a UC
-	mapping.  This would not cause attribute aliasing.  But
-	registers typically can be accessed only with four-byte or
-	eight-byte accesses, and the copy_from_user() path doesn't allow
-	any control over the access size, so this would be dangerous.
-
-ioremap()
----------
-
-	This returns a mapping for use inside the kernel.
-
-	If the region is in kern_memmap, we should use the attribute
-	specified there.
-
-	If the EFI memory map reports that the entire granule supports
-	WB, we should use that (granules that are partially reserved
-	or occupied by firmware do not appear in kern_memmap).
-
-	If the granule contains non-WB memory, but we can cover the
-	region safely with kernel page table mappings, we can use
-	ioremap_page_range() as most other architectures do.
-
-	Failing all of the above, we have to fall back to a UC mapping.
-
-Past Problem Cases
-==================
-
-mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
---------------------------------------------------------------------
-
-      The EFI memory map may not report these MMIO regions.
-
-      These must be allowed so that X will work.  This means that
-      when the EFI memory map is incomplete, every /dev/mem mmap must
-      succeed.  It may create either WB or UC user mappings, depending
-      on whether the region is in kern_memmap or the EFI memory map.
-
-mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
-----------------------------------------------------------------------
-
-      The EFI memory map reports the following attributes:
-
-        =============== ======= ==================
-        0x00000-0x9FFFF WB only
-        0xA0000-0xBFFFF UC only (VGA frame buffer)
-        0xC0000-0xFFFFF WB only
-        =============== ======= ==================
-
-      This mmap is done with user pages, not kernel identity mappings,
-      so it is safe to use WB mappings.
-
-      The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
-      which uses a granule-sized UC mapping.  This granule will cover some
-      WB-only memory, but since UC is non-speculative, the processor will
-      never generate an uncacheable reference to the WB-only areas unless
-      the driver explicitly touches them.
-
-mmap of 0x0-0xFFFFF legacy_mem by "X"
--------------------------------------
-
-      If the EFI memory map reports that the entire range supports the
-      same attributes, we can allow the mmap (and we will prefer WB if
-      supported, as is the case with HP sx[12]000 machines with VGA
-      disabled).
-
-      If EFI reports the range as partly WB and partly UC (as on sx[12]000
-      machines with VGA enabled), we must fail the mmap because there's no
-      safe attribute to use.
-
-      If EFI reports some of the range but not all (as on Intel firmware
-      that doesn't report the VGA frame buffer at all), we should fail the
-      mmap and force the user to map just the specific region of interest.
-
-mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
-------------------------------------------------------------------------
-
-      The EFI memory map reports the following attributes::
-
-        0x00000-0xFFFFF WB only (no VGA MMIO hole)
-
-      This is a special case of the previous case, and the mmap should
-      fail for the same reason as above.
-
-read of /sys/devices/.../rom
-----------------------------
-
-      For VGA devices, this may cause an ioremap() of 0xC0000.  This
-      used to be done with a UC mapping, because the VGA frame buffer
-      at 0xA0000 prevents use of a WB granule.  The UC mapping causes
-      an MCA on HP sx[12]000 chipsets.
-
-      We should use WB page table mappings to avoid covering the VGA
-      frame buffer.
-
-Notes
-=====
-
-    [1] SDM rev 2.2, vol 2, sec 4.4.1.
-    [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/arch/ia64/efirtc.rst b/Documentation/arch/ia64/efirtc.rst
deleted file mode 100644
index fd8328408301..000000000000
--- a/Documentation/arch/ia64/efirtc.rst
+++ /dev/null
@@ -1,144 +0,0 @@
-==========================
-EFI Real Time Clock driver
-==========================
-
-S. Eranian <eranian@hpl.hp.com>
-
-March 2000
-
-1. Introduction
-===============
-
-This document describes the efirtc.c driver has provided for
-the IA-64 platform.
-
-The purpose of this driver is to supply an API for kernel and user applications
-to get access to the Time Service offered by EFI version 0.92.
-
-EFI provides 4 calls one can make once the OS is booted: GetTime(),
-SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
-driver. We describe those calls as well the design of the driver in the
-following sections.
-
-2. Design Decisions
-===================
-
-The original ideas was to provide a very simple driver to get access to,
-at first, the time of day service. This is required in order to access, in a
-portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock
-to initialize the system view of the time during boot.
-
-Because we wanted to minimize the impact on existing user-level apps using
-the CMOS clock, we decided to expose an API that was very similar to the one
-used today with the legacy RTC driver (driver/char/rtc.c). However, because
-EFI provides a simpler services, not all ioctl() are available. Also
-new ioctl()s have been introduced for things that EFI provides but not the
-legacy.
-
-EFI uses a slightly different way of representing the time, noticeably
-the reference date is different. Year is the using the full 4-digit format.
-The Epoch is January 1st 1998. For backward compatibility reasons we don't
-expose this new way of representing time. Instead we use something very
-similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
-One of the reasons for doing it this way is to allow for EFI to still evolve
-without necessarily impacting any of the user applications. The decoupling
-enables flexibility and permits writing wrapper code is ncase things change.
-
-The driver exposes two interfaces, one via the device file and a set of
-ioctl()s. The other is read-only via the /proc filesystem.
-
-As of today we don't offer a /proc/sys interface.
-
-To allow for a uniform interface between the legacy RTC and EFI time service,
-we have created the include/linux/rtc.h header file to contain only the
-"public" API of the two drivers.  The specifics of the legacy RTC are still
-in include/linux/mc146818rtc.h.
-
-
-3. Time of day service
-======================
-
-The part of the driver gives access to the time of day service of EFI.
-Two ioctl()s, compatible with the legacy RTC calls:
-
-	Read the CMOS clock::
-
-		ioctl(d, RTC_RD_TIME, &rtc);
-
-	Write the CMOS clock::
-
-		ioctl(d, RTC_SET_TIME, &rtc);
-
-The rtc is a pointer to a data structure defined in rtc.h which is close
-to a struct tm::
-
-  struct rtc_time {
-          int tm_sec;
-          int tm_min;
-          int tm_hour;
-          int tm_mday;
-          int tm_mon;
-          int tm_year;
-          int tm_wday;
-          int tm_yday;
-          int tm_isdst;
-  };
-
-The driver takes care of converting back an forth between the EFI time and
-this format.
-
-Those two ioctl()s can be exercised with the hwclock command:
-
-For reading::
-
-	# /sbin/hwclock --show
-	Mon Mar  6 15:32:32 2000  -0.910248 seconds
-
-For setting::
-
-	# /sbin/hwclock --systohc
-
-Root privileges are required to be able to set the time of day.
-
-4. Wakeup Alarm service
-=======================
-
-EFI provides an API by which one can program when a machine should wakeup,
-i.e. reboot. This is very different from the alarm provided by the legacy
-RTC which is some kind of interval timer alarm. For this reason we don't use
-the same ioctl()s to get access to the service. Instead we have
-introduced 2 news ioctl()s to the interface of an RTC.
-
-We have added 2 new ioctl()s that are specific to the EFI driver:
-
-	Read the current state of the alarm::
-
-		ioctl(d, RTC_WKALM_RD, &wkt)
-
-	Set the alarm or change its status::
-
-		ioctl(d, RTC_WKALM_SET, &wkt)
-
-The wkt structure encapsulates a struct rtc_time + 2 extra fields to get
-status information::
-
-  struct rtc_wkalrm {
-
-          unsigned char enabled; /* =1 if alarm is enabled */
-          unsigned char pending; /* =1 if alarm is pending  */
-
-          struct rtc_time time;
-  }
-
-As of today, none of the existing user-level apps supports this feature.
-However writing such a program should be hard by simply using those two
-ioctl().
-
-Root privileges are required to be able to set the alarm.
-
-5. References
-=============
-
-Checkout the following Web site for more information on EFI:
-
-http://developer.intel.com/technology/efi/
diff --git a/Documentation/arch/ia64/err_inject.rst b/Documentation/arch/ia64/err_inject.rst
deleted file mode 100644
index 900f71e93a29..000000000000
--- a/Documentation/arch/ia64/err_inject.rst
+++ /dev/null
@@ -1,1067 +0,0 @@
-========================================
-IPF Machine Check (MC) error inject tool
-========================================
-
-IPF Machine Check (MC) error inject tool is used to inject MC
-errors from Linux. The tool is a test bed for IPF MC work flow including
-hardware correctable error handling, OS recoverable error handling, MC
-event logging, etc.
-
-The tool includes two parts: a kernel driver and a user application
-sample. The driver provides interface to PAL to inject error
-and query error injection capabilities. The driver code is in
-arch/ia64/kernel/err_inject.c. The application sample (shown below)
-provides a combination of various errors and calls the driver's interface
-(sysfs interface) to inject errors or query error injection capabilities.
-
-The tool can be used to test Intel IPF machine MC handling capabilities.
-It's especially useful for people who can not access hardware MC injection
-tool to inject error. It's also very useful to integrate with other
-software test suits to do stressful testing on IPF.
-
-Below is a sample application as part of the whole tool. The sample
-can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a library or other user
-application to have more thorough test.
-
-The sample application takes err.conf as error configuration input. GCC
-compiles the code. After you install err_inject driver, you can run
-this sample application to inject errors.
-
-Errata: Itanium 2 Processors Specification Update lists some errata against
-the pal_mc_error_inject PAL procedure. The following err.conf has been tested
-on latest Montecito PAL.
-
-err.conf::
-
-  #This is configuration file for err_inject_tool.
-  #The format of the each line is:
-  #cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
-  #where
-  #	cpu: logical cpu number the error will be inject in.
-  #	loop: times the error will be injected.
-  #	interval: In second. every so often one error is injected.
-  #	err_type_info, err_struct_info: PAL parameters.
-  #
-  #Note: All values are hex w/o or w/ 0x prefix.
-
-
-  #On cpu2, inject only total 0x10 errors, interval 5 seconds
-  #corrected, data cache, hier-2, physical addr(assigned by tool code).
-  #working on Montecito latest PAL.
-  2, 10, 5, 4101, 95
-
-  #On cpu4, inject and consume total 0x10 errors, interval 5 seconds
-  #corrected, data cache, hier-2, physical addr(assigned by tool code).
-  #working on Montecito latest PAL.
-  4, 10, 5, 4109, 95
-
-  #On cpu15, inject and consume total 0x10 errors, interval 5 seconds
-  #recoverable, DTR0, hier-2.
-  #working on Montecito latest PAL.
-  0xf, 0x10, 5, 4249, 15
-
-The sample application source code:
-
-err_injection_tool.c::
-
-  /*
-   * This program is free software; you can redistribute it and/or modify
-   * it under the terms of the GNU General Public License as published by
-   * the Free Software Foundation; either version 2 of the License, or
-   * (at your option) any later version.
-   *
-   * This program is distributed in the hope that it will be useful, but
-   * WITHOUT ANY WARRANTY; without even the implied warranty of
-   * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
-   * NON INFRINGEMENT.  See the GNU General Public License for more
-   * details.
-   *
-   * You should have received a copy of the GNU General Public License
-   * along with this program; if not, write to the Free Software
-   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-   *
-   * Copyright (C) 2006 Intel Co
-   *	Fenghua Yu <fenghua.yu@intel.com>
-   *
-   */
-  #include <sys/types.h>
-  #include <sys/stat.h>
-  #include <fcntl.h>
-  #include <stdio.h>
-  #include <sched.h>
-  #include <unistd.h>
-  #include <stdlib.h>
-  #include <stdarg.h>
-  #include <string.h>
-  #include <errno.h>
-  #include <time.h>
-  #include <sys/ipc.h>
-  #include <sys/sem.h>
-  #include <sys/wait.h>
-  #include <sys/mman.h>
-  #include <sys/shm.h>
-
-  #define MAX_FN_SIZE 		256
-  #define MAX_BUF_SIZE 		256
-  #define DATA_BUF_SIZE 		256
-  #define NR_CPUS 		512
-  #define MAX_TASK_NUM		2048
-  #define MIN_INTERVAL		5	// seconds
-  #define	ERR_DATA_BUFFER_SIZE 	3	// Three 8-byte.
-  #define PARA_FIELD_NUM		5
-  #define MASK_SIZE		(NR_CPUS/64)
-  #define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
-
-  int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
-
-  int verbose;
-  #define vbprintf if (verbose) printf
-
-  int log_info(int cpu, const char *fmt, ...)
-  {
-	FILE *log;
-	char fn[MAX_FN_SIZE];
-	char buf[MAX_BUF_SIZE];
-	va_list args;
-
-	sprintf(fn, "%d.log", cpu);
-	log=fopen(fn, "a+");
-	if (log==NULL) {
-		perror("Error open:");
-		return -1;
-	}
-
-	va_start(args, fmt);
-	vprintf(fmt, args);
-	memset(buf, 0, MAX_BUF_SIZE);
-	vsprintf(buf, fmt, args);
-	va_end(args);
-
-	fwrite(buf, sizeof(buf), 1, log);
-	fclose(log);
-
-	return 0;
-  }
-
-  typedef unsigned long u64;
-  typedef unsigned int  u32;
-
-  typedef union err_type_info_u {
-	struct {
-		u64	mode		: 3,	/* 0-2 */
-			err_inj		: 3,	/* 3-5 */
-			err_sev		: 2,	/* 6-7 */
-			err_struct	: 5,	/* 8-12 */
-			struct_hier	: 3,	/* 13-15 */
-			reserved	: 48;	/* 16-63 */
-	} err_type_info_u;
-	u64	err_type_info;
-  } err_type_info_t;
-
-  typedef union err_struct_info_u {
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			c_t		: 2,	/* 1-2	 */
-			cl_p		: 3,	/* 3-5	 */
-			cl_id		: 3,	/* 6-8	 */
-			cl_dp		: 1,	/* 9	 */
-			reserved1	: 22,	/* 10-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_cache;
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			tt		: 2,	/* 1-2	 */
-			tc_tr		: 2,	/* 3-4	 */
-			tr_slot		: 8,	/* 5-12	 */
-			reserved1	: 19,	/* 13-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_tlb;
-	struct {
-		u64	siv		: 1,	/* 0	 */
-			regfile_id	: 4,	/* 1-4	 */
-			reg_num		: 7,	/* 5-11	 */
-			reserved1	: 20,	/* 12-31 */
-			tiv		: 1,	/* 32	 */
-			trigger		: 4,	/* 33-36 */
-			trigger_pl 	: 3,	/* 37-39 */
-			reserved2 	: 24;	/* 40-63 */
-	} err_struct_info_register;
-	struct {
-		u64	reserved;
-	} err_struct_info_bus_processor_interconnect;
-	u64	err_struct_info;
-  } err_struct_info_t;
-
-  typedef union err_data_buffer_u {
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-		u64	inj_addr;		/* 64-127 	*/
-		u64	way		: 5,	/* 128-132	*/
-			index		: 20,	/* 133-152	*/
-					: 39;	/* 153-191	*/
-	} err_data_buffer_cache;
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-		u64	inj_addr;		/* 64-127 	*/
-		u64	way		: 5,	/* 128-132	*/
-			index		: 20,	/* 133-152	*/
-			reserved	: 39;	/* 153-191	*/
-	} err_data_buffer_tlb;
-	struct {
-		u64	trigger_addr;		/* 0-63		*/
-	} err_data_buffer_register;
-	struct {
-		u64	reserved;		/* 0-63		*/
-	} err_data_buffer_bus_processor_interconnect;
-	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-  } err_data_buffer_t;
-
-  typedef union capabilities_u {
-	struct {
-		u64	i		: 1,
-			d		: 1,
-			rv		: 1,
-			tag		: 1,
-			data		: 1,
-			mesi		: 1,
-			dp		: 1,
-			reserved1	: 3,
-			pa		: 1,
-			va		: 1,
-			wi		: 1,
-			reserved2	: 20,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved3	: 30;
-	} capabilities_cache;
-	struct {
-		u64	d		: 1,
-			i		: 1,
-			rv		: 1,
-			tc		: 1,
-			tr		: 1,
-			reserved1	: 27,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved2	: 30;
-	} capabilities_tlb;
-	struct {
-		u64	gr_b0		: 1,
-			gr_b1		: 1,
-			fr		: 1,
-			br		: 1,
-			pr		: 1,
-			ar		: 1,
-			cr		: 1,
-			rr		: 1,
-			pkr		: 1,
-			dbr		: 1,
-			ibr		: 1,
-			pmc		: 1,
-			pmd		: 1,
-			reserved1	: 3,
-			regnum		: 1,
-			reserved2	: 15,
-			trigger		: 1,
-			trigger_pl	: 1,
-			reserved3	: 30;
-	} capabilities_register;
-	struct {
-		u64	reserved;
-	} capabilities_bus_processor_interconnect;
-  } capabilities_t;
-
-  typedef struct resources_s {
-	u64	ibr0		: 1,
-		ibr2		: 1,
-		ibr4		: 1,
-		ibr6		: 1,
-		dbr0		: 1,
-		dbr2		: 1,
-		dbr4		: 1,
-		dbr6		: 1,
-		reserved	: 48;
-  } resources_t;
-
-
-  long get_page_size(void)
-  {
-	long page_size=sysconf(_SC_PAGESIZE);
-	return page_size;
-  }
-
-  #define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
-  #define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
-  #define SHM_VA 0x2000000100000000
-
-  int shmid;
-  void *shmaddr;
-
-  int create_shm(void)
-  {
-	key_t key;
-	char fn[MAX_FN_SIZE];
-
-	/* cpu0 is always existing */
-	sprintf(fn, PATH_FORMAT, 0);
-	if ((key = ftok(fn, 's')) == -1) {
-		perror("ftok");
-		return -1;
-	}
-
-	shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
-	if (shmid == -1) {
-		if (errno==EEXIST) {
-			shmid = shmget(key, SHM_SIZE, 0);
-			if (shmid == -1) {
-				perror("shmget");
-				return -1;
-			}
-		}
-		else {
-			perror("shmget");
-			return -1;
-		}
-	}
-	vbprintf("shmid=%d", shmid);
-
-	/* connect to the segment: */
-	shmaddr = shmat(shmid, (void *)SHM_VA, 0);
-	if (shmaddr == (void*)-1) {
-		perror("shmat");
-		return -1;
-	}
-
-	memset(shmaddr, 0, SHM_SIZE);
-	mlock(shmaddr, SHM_SIZE);
-
-	return 0;
-  }
-
-  int free_shm()
-  {
-	munlock(shmaddr, SHM_SIZE);
-          shmdt(shmaddr);
-	semctl(shmid, 0, IPC_RMID);
-
-	return 0;
-  }
-
-  #ifdef _SEM_SEMUN_UNDEFINED
-  union semun
-  {
-	int val;
-	struct semid_ds *buf;
-	unsigned short int *array;
-	struct seminfo *__buf;
-  };
-  #endif
-
-  u32 mode=1; /* 1: physical mode; 2: virtual mode. */
-  int one_lock=1;
-  key_t key[NR_CPUS];
-  int semid[NR_CPUS];
-
-  int create_sem(int cpu)
-  {
-	union semun arg;
-	char fn[MAX_FN_SIZE];
-	int sid;
-
-	sprintf(fn, PATH_FORMAT, cpu);
-	sprintf(fn, "%s/%s", fn, "err_type_info");
-	if ((key[cpu] = ftok(fn, 'e')) == -1) {
-		perror("ftok");
-		return -1;
-	}
-
-	if (semid[cpu]!=0)
-		return 0;
-
-	/* clear old semaphore */
-	if ((sid = semget(key[cpu], 1, 0)) != -1)
-		semctl(sid, 0, IPC_RMID);
-
-	/* get one semaphore */
-	if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
-		perror("semget");
-		printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
-			(u64)key[cpu]);
-		return -1;
-	}
-
-	vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
-		(u64)key[cpu]);
-	/* initialize the semaphore to 1: */
-	arg.val = 1;
-	if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
-		perror("semctl");
-		return -1;
-	}
-
-	return 0;
-  }
-
-  static int lock(int cpu)
-  {
-	struct sembuf lock;
-
-	lock.sem_num = cpu;
-	lock.sem_op = 1;
-	semop(semid[cpu], &lock, 1);
-
-          return 0;
-  }
-
-  static int unlock(int cpu)
-  {
-	struct sembuf unlock;
-
-	unlock.sem_num = cpu;
-	unlock.sem_op = -1;
-	semop(semid[cpu], &unlock, 1);
-
-          return 0;
-  }
-
-  void free_sem(int cpu)
-  {
-	semctl(semid[cpu], 0, IPC_RMID);
-  }
-
-  int wr_multi(char *fn, unsigned long *data, int size)
-  {
-	int fd;
-	char buf[MAX_BUF_SIZE];
-	int ret;
-
-	if (size==1)
-		sprintf(buf, "%lx", *data);
-	else if (size==3)
-		sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
-	else {
-		fprintf(stderr,"write to file with wrong size!\n");
-		return -1;
-	}
-
-	fd=open(fn, O_RDWR);
-	if (!fd) {
-		perror("Error:");
-		return -1;
-	}
-	ret=write(fd, buf, sizeof(buf));
-	close(fd);
-	return ret;
-  }
-
-  int wr(char *fn, unsigned long data)
-  {
-	return wr_multi(fn, &data, 1);
-  }
-
-  int rd(char *fn, unsigned long *data)
-  {
-	int fd;
-	char buf[MAX_BUF_SIZE];
-
-	fd=open(fn, O_RDONLY);
-	if (fd<0) {
-		perror("Error:");
-		return -1;
-	}
-	read(fd, buf, MAX_BUF_SIZE);
-	*data=strtoul(buf, NULL, 16);
-	close(fd);
-	return 0;
-  }
-
-  int rd_status(char *path, int *status)
-  {
-	char fn[MAX_FN_SIZE];
-	sprintf(fn, "%s/status", path);
-	if (rd(fn, (u64*)status)<0) {
-		perror("status reading error.\n");
-		return -1;
-	}
-
-	return 0;
-  }
-
-  int rd_capabilities(char *path, u64 *capabilities)
-  {
-	char fn[MAX_FN_SIZE];
-	sprintf(fn, "%s/capabilities", path);
-	if (rd(fn, capabilities)<0) {
-		perror("capabilities reading error.\n");
-		return -1;
-	}
-
-	return 0;
-  }
-
-  int rd_all(char *path)
-  {
-	unsigned long err_type_info, err_struct_info, err_data_buffer;
-	int status;
-	unsigned long capabilities, resources;
-	char fn[MAX_FN_SIZE];
-
-	sprintf(fn, "%s/err_type_info", path);
-	if (rd(fn, &err_type_info)<0) {
-		perror("err_type_info reading error.\n");
-		return -1;
-	}
-	printf("err_type_info=%lx\n", err_type_info);
-
-	sprintf(fn, "%s/err_struct_info", path);
-	if (rd(fn, &err_struct_info)<0) {
-		perror("err_struct_info reading error.\n");
-		return -1;
-	}
-	printf("err_struct_info=%lx\n", err_struct_info);
-
-	sprintf(fn, "%s/err_data_buffer", path);
-	if (rd(fn, &err_data_buffer)<0) {
-		perror("err_data_buffer reading error.\n");
-		return -1;
-	}
-	printf("err_data_buffer=%lx\n", err_data_buffer);
-
-	sprintf(fn, "%s/status", path);
-	if (rd("status", (u64*)&status)<0) {
-		perror("status reading error.\n");
-		return -1;
-	}
-	printf("status=%d\n", status);
-
-	sprintf(fn, "%s/capabilities", path);
-	if (rd(fn,&capabilities)<0) {
-		perror("capabilities reading error.\n");
-		return -1;
-	}
-	printf("capabilities=%lx\n", capabilities);
-
-	sprintf(fn, "%s/resources", path);
-	if (rd(fn, &resources)<0) {
-		perror("resources reading error.\n");
-		return -1;
-	}
-	printf("resources=%lx\n", resources);
-
-	return 0;
-  }
-
-  int query_capabilities(char *path, err_type_info_t err_type_info,
-			u64 *capabilities)
-  {
-	char fn[MAX_FN_SIZE];
-	err_struct_info_t err_struct_info;
-	err_data_buffer_t err_data_buffer;
-
-	err_struct_info.err_struct_info=0;
-	memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
-
-	sprintf(fn, "%s/err_type_info", path);
-	wr(fn, err_type_info.err_type_info);
-	sprintf(fn, "%s/err_struct_info", path);
-	wr(fn, 0x0);
-	sprintf(fn, "%s/err_data_buffer", path);
-	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
-	// Fire pal_mc_error_inject procedure.
-	sprintf(fn, "%s/call_start", path);
-	wr(fn, mode);
-
-	if (rd_capabilities(path, capabilities)<0)
-		return -1;
-
-	return 0;
-  }
-
-  int query_all_capabilities()
-  {
-	int status;
-	err_type_info_t err_type_info;
-	int err_sev, err_struct, struct_hier;
-	int cap=0;
-	u64 capabilities;
-	char path[MAX_FN_SIZE];
-
-	err_type_info.err_type_info=0;			// Initial
-	err_type_info.err_type_info_u.mode=0;		// Query mode;
-	err_type_info.err_type_info_u.err_inj=0;
-
-	printf("All capabilities implemented in pal_mc_error_inject:\n");
-	sprintf(path, PATH_FORMAT ,0);
-	for (err_sev=0;err_sev<3;err_sev++)
-		for (err_struct=0;err_struct<5;err_struct++)
-			for (struct_hier=0;struct_hier<5;struct_hier++)
-	{
-		status=-1;
-		capabilities=0;
-		err_type_info.err_type_info_u.err_sev=err_sev;
-		err_type_info.err_type_info_u.err_struct=err_struct;
-		err_type_info.err_type_info_u.struct_hier=struct_hier;
-
-		if (query_capabilities(path, err_type_info, &capabilities)<0)
-			continue;
-
-		if (rd_status(path, &status)<0)
-			continue;
-
-		if (status==0) {
-			cap=1;
-			printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
-				err_sev, err_struct, struct_hier);
-			printf("capabilities 0x%lx\n", capabilities);
-		}
-	}
-	if (!cap) {
-		printf("No capabilities supported.\n");
-		return 0;
-	}
-
-	return 0;
-  }
-
-  int err_inject(int cpu, char *path, err_type_info_t err_type_info,
-		err_struct_info_t err_struct_info,
-		err_data_buffer_t err_data_buffer)
-  {
-	int status;
-	char fn[MAX_FN_SIZE];
-
-	log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
-		err_type_info.err_type_info,
-		err_struct_info.err_struct_info);
-	log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
-		err_data_buffer.err_data_buffer[0],
-		err_data_buffer.err_data_buffer[1],
-		err_data_buffer.err_data_buffer[2]);
-	sprintf(fn, "%s/err_type_info", path);
-	wr(fn, err_type_info.err_type_info);
-	sprintf(fn, "%s/err_struct_info", path);
-	wr(fn, err_struct_info.err_struct_info);
-	sprintf(fn, "%s/err_data_buffer", path);
-	wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
-	// Fire pal_mc_error_inject procedure.
-	sprintf(fn, "%s/call_start", path);
-	wr(fn,mode);
-
-	if (rd_status(path, &status)<0) {
-		vbprintf("fail: read status\n");
-		return -100;
-	}
-
-	if (status!=0) {
-		log_info(cpu, "fail: status=%d\n", status);
-		return status;
-	}
-
-	return status;
-  }
-
-  static int construct_data_buf(char *path, err_type_info_t err_type_info,
-		err_struct_info_t err_struct_info,
-		err_data_buffer_t *err_data_buffer,
-		void *va1)
-  {
-	char fn[MAX_FN_SIZE];
-	u64 virt_addr=0, phys_addr=0;
-
-	vbprintf("va1=%lx\n", (u64)va1);
-	memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
-
-	switch (err_type_info.err_type_info_u.err_struct) {
-		case 1: // Cache
-			switch (err_struct_info.err_struct_info_cache.cl_id) {
-				case 1: //Virtual addr
-					err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
-					break;
-				case 2: //Phys addr
-					sprintf(fn, "%s/virtual_to_phys", path);
-					virt_addr=(u64)va1;
-					if (wr(fn,virt_addr)<0)
-						return -1;
-					rd(fn, &phys_addr);
-					err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
-					break;
-				default:
-					printf("Not supported cl_id\n");
-					break;
-			}
-			break;
-		case 2: //  TLB
-			break;
-		case 3: //  Register file
-			break;
-		case 4: //  Bus/system interconnect
-		default:
-			printf("Not supported err_struct\n");
-			break;
-	}
-
-	return 0;
-  }
-
-  typedef struct {
-	u64 cpu;
-	u64 loop;
-	u64 interval;
-	u64 err_type_info;
-	u64 err_struct_info;
-	u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-  } parameters_t;
-
-  parameters_t line_para;
-  int para;
-
-  static int empty_data_buffer(u64 *err_data_buffer)
-  {
-	int empty=1;
-	int i;
-
-	for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
-	   if (err_data_buffer[i]!=-1)
-		empty=0;
-
-	return empty;
-  }
-
-  int err_inj()
-  {
-	err_type_info_t err_type_info;
-	err_struct_info_t err_struct_info;
-	err_data_buffer_t err_data_buffer;
-	int count;
-	FILE *fp;
-	unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
-	u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
-	int num;
-	int i;
-	char path[MAX_FN_SIZE];
-	parameters_t parameters[MAX_TASK_NUM]={};
-	pid_t child_pid[MAX_TASK_NUM];
-	time_t current_time;
-	int status;
-
-	if (!para) {
-	    fp=fopen("err.conf", "r");
-	    if (fp==NULL) {
-		perror("Error open err.conf");
-		return -1;
-	    }
-
-	    num=0;
-	    while (!feof(fp)) {
-		char buf[256];
-		memset(buf,0,256);
-		fgets(buf, 256, fp);
-		count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
-				&cpu, &loop, &interval,&err_type_info_conf,
-				&err_struct_info_conf,
-				&err_data_buffer_conf[0],
-				&err_data_buffer_conf[1],
-				&err_data_buffer_conf[2]);
-		if (count!=PARA_FIELD_NUM+3) {
-			err_data_buffer_conf[0]=-1;
-			err_data_buffer_conf[1]=-1;
-			err_data_buffer_conf[2]=-1;
-			count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
-				&cpu, &loop, &interval,&err_type_info_conf,
-				&err_struct_info_conf);
-			if (count!=PARA_FIELD_NUM)
-				continue;
-		}
-
-		parameters[num].cpu=cpu;
-		parameters[num].loop=loop;
-		parameters[num].interval= interval>MIN_INTERVAL
-					  ?interval:MIN_INTERVAL;
-		parameters[num].err_type_info=err_type_info_conf;
-		parameters[num].err_struct_info=err_struct_info_conf;
-		memcpy(parameters[num++].err_data_buffer,
-			err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
-
-		if (num>=MAX_TASK_NUM)
-			break;
-	    }
-	}
-	else {
-		parameters[0].cpu=line_para.cpu;
-		parameters[0].loop=line_para.loop;
-		parameters[0].interval= line_para.interval>MIN_INTERVAL
-					  ?line_para.interval:MIN_INTERVAL;
-		parameters[0].err_type_info=line_para.err_type_info;
-		parameters[0].err_struct_info=line_para.err_struct_info;
-		memcpy(parameters[0].err_data_buffer,
-			line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
-
-		num=1;
-	}
-
-	/* Create semaphore: If one_lock, one semaphore for all processors.
-	   Otherwise, one semaphore for each processor. */
-	if (one_lock) {
-		if (create_sem(0)) {
-			printf("Can not create semaphore...exit\n");
-			free_sem(0);
-			return -1;
-		}
-	}
-	else {
-		for (i=0;i<num;i++) {
-		   if (create_sem(parameters[i].cpu)) {
-			printf("Can not create semaphore for cpu%d...exit\n",i);
-			free_sem(parameters[num].cpu);
-			return -1;
-		   }
-		}
-	}
-
-	/* Create a shm segment which will be used to inject/consume errors on.*/
-	if (create_shm()==-1) {
-		printf("Error to create shm...exit\n");
-		return -1;
-	}
-
-	for (i=0;i<num;i++) {
-		pid_t pid;
-
-		current_time=time(NULL);
-		log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
-		log_info(parameters[i].cpu, "Configurations:\n");
-		log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
-			parameters[i].cpu,
-			parameters[i].loop,
-			parameters[i].interval);
-		log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
-			parameters[i].err_type_info,
-			parameters[i].err_struct_info);
-
-		sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
-		err_type_info.err_type_info=parameters[i].err_type_info;
-		err_struct_info.err_struct_info=parameters[i].err_struct_info;
-		memcpy(err_data_buffer.err_data_buffer,
-			parameters[i].err_data_buffer,
-			ERR_DATA_BUFFER_SIZE*8);
-
-		pid=fork();
-		if (pid==0) {
-			unsigned long mask[MASK_SIZE];
-			int j, k;
-
-			void *va1, *va2;
-
-			/* Allocate two memory areas va1 and va2 in shm */
-			va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
-			va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
-
-			vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
-			memset(va1, 0x1, PAGE_SIZE);
-			memset(va2, 0x2, PAGE_SIZE);
-
-			if (empty_data_buffer(err_data_buffer.err_data_buffer))
-				/* If not specified yet, construct data buffer
-				 * with va1
-				 */
-				construct_data_buf(path, err_type_info,
-					err_struct_info, &err_data_buffer,va1);
-
-			for (j=0;j<MASK_SIZE;j++)
-				mask[j]=0;
-
-			cpu=parameters[i].cpu;
-			k = cpu%64;
-			j = cpu/64;
-			mask[j] = 1UL << k;
-
-			if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
-				perror("Error sched_setaffinity:");
-				return -1;
-			}
-
-			for (j=0; j<parameters[i].loop; j++) {
-				log_info(parameters[i].cpu,"Injection ");
-				log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
-
-					parameters[i].cpu,j+1, parameters[i].loop);
-
-				/* Hold the lock */
-				if (one_lock)
-					lock(0);
-				else
-				/* Hold lock on this cpu */
-					lock(parameters[i].cpu);
-
-				if ((status=err_inject(parameters[i].cpu,
-					   path, err_type_info,
-					   err_struct_info, err_data_buffer))
-					   ==0) {
-					/* consume the error for "inject only"*/
-					memcpy(va2, va1, PAGE_SIZE);
-					memcpy(va1, va2, PAGE_SIZE);
-					log_info(parameters[i].cpu,
-						"successful\n");
-				}
-				else {
-					log_info(parameters[i].cpu,"fail:");
-					log_info(parameters[i].cpu,
-						"status=%d\n", status);
-					unlock(parameters[i].cpu);
-					break;
-				}
-				if (one_lock)
-				/* Release the lock */
-					unlock(0);
-				/* Release lock on this cpu */
-				else
-					unlock(parameters[i].cpu);
-
-				if (j < parameters[i].loop-1)
-					sleep(parameters[i].interval);
-			}
-			current_time=time(NULL);
-			log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
-			return 0;
-		}
-		else if (pid<0) {
-			perror("Error fork:");
-			continue;
-		}
-		child_pid[i]=pid;
-	}
-	for (i=0;i<num;i++)
-		waitpid(child_pid[i], NULL, 0);
-
-	if (one_lock)
-		free_sem(0);
-	else
-		for (i=0;i<num;i++)
-			free_sem(parameters[i].cpu);
-
-	printf("All done.\n");
-
-	return 0;
-  }
-
-  void help()
-  {
-	printf("err_inject_tool:\n");
-	printf("\t-q: query all capabilities. default: off\n");
-	printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
-	printf("\t-i: inject errors. default: off\n");
-	printf("\t-l: one lock per cpu. default: one lock for all\n");
-	printf("\t-e: error parameters:\n");
-	printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
-	printf("\t\t   cpu: logical cpu number the error will be inject in.\n");
-	printf("\t\t   loop: times the error will be injected.\n");
-	printf("\t\t   interval: In second. every so often one error is injected.\n");
-	printf("\t\t   err_type_info, err_struct_info: PAL parameters.\n");
-	printf("\t\t   err_data_buffer: PAL parameter. Optional. If not present,\n");
-	printf("\t\t                    it's constructed by tool automatically. Be\n");
-	printf("\t\t                    careful to provide err_data_buffer and make\n");
-	printf("\t\t                    sure it's working with the environment.\n");
-	printf("\t    Note:no space between error parameters.\n");
-	printf("\t    default: Take error parameters from err.conf instead of command line.\n");
-	printf("\t-v: verbose. default: off\n");
-	printf("\t-h: help\n\n");
-	printf("The tool will take err.conf file as ");
-	printf("input to inject single or multiple errors ");
-	printf("on one or multiple cpus in parallel.\n");
-  }
-
-  int main(int argc, char **argv)
-  {
-	char c;
-	int do_err_inj=0;
-	int do_query_all=0;
-	int count;
-	u32 m;
-
-	/* Default one lock for all cpu's */
-	one_lock=1;
-	while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
-		switch (c) {
-			case 'm':	/* Procedure mode. 1: phys 2: virt */
-				count=sscanf(optarg, "%x", &m);
-				if (count!=1 || (m!=1 && m!=2)) {
-					printf("Wrong mode number.\n");
-					help();
-					return -1;
-				}
-				mode=m;
-				break;
-			case 'i':	/* Inject errors */
-				do_err_inj=1;
-				break;
-			case 'q':	/* Query */
-				do_query_all=1;
-				break;
-			case 'v':	/* Verbose */
-				verbose=1;
-				break;
-			case 'l':	/* One lock per cpu */
-				one_lock=0;
-				break;
-			case 'e':	/* error arguments */
-				/* Take parameters:
-				 * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
-				 * err_data_buffer is optional. Recommend not to specify
-				 * err_data_buffer. Better to use tool to generate it.
-				 */
-				count=sscanf(optarg,
-					"%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
-					&line_para.cpu,
-					&line_para.loop,
-					&line_para.interval,
-					&line_para.err_type_info,
-					&line_para.err_struct_info,
-					&line_para.err_data_buffer[0],
-					&line_para.err_data_buffer[1],
-					&line_para.err_data_buffer[2]);
-				if (count!=PARA_FIELD_NUM+3) {
-				    line_para.err_data_buffer[0]=-1,
-				    line_para.err_data_buffer[1]=-1,
-				    line_para.err_data_buffer[2]=-1;
-				    count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
-						&line_para.cpu,
-						&line_para.loop,
-						&line_para.interval,
-						&line_para.err_type_info,
-						&line_para.err_struct_info);
-				    if (count!=PARA_FIELD_NUM) {
-					printf("Wrong error arguments.\n");
-					help();
-					return -1;
-				    }
-				}
-				para=1;
-				break;
-			continue;
-				break;
-			case 'h':
-				help();
-				return 0;
-			default:
-				break;
-		}
-
-	if (do_query_all)
-		query_all_capabilities();
-	if (do_err_inj)
-		err_inj();
-
-	if (!do_query_all &&  !do_err_inj)
-		help();
-
-	return 0;
-  }
diff --git a/Documentation/arch/ia64/features.rst b/Documentation/arch/ia64/features.rst
deleted file mode 100644
index d7226fdcf5f8..000000000000
--- a/Documentation/arch/ia64/features.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-.. kernel-feat:: $srctree/Documentation/features ia64
diff --git a/Documentation/arch/ia64/fsys.rst b/Documentation/arch/ia64/fsys.rst
deleted file mode 100644
index a702d2cc94b6..000000000000
--- a/Documentation/arch/ia64/fsys.rst
+++ /dev/null
@@ -1,303 +0,0 @@
-===================================
-Light-weight System Calls for IA-64
-===================================
-
-		        Started: 13-Jan-2003
-
-		    Last update: 27-Sep-2003
-
-	              David Mosberger-Tang
-		      <davidm@hpl.hp.com>
-
-Using the "epc" instruction effectively introduces a new mode of
-execution to the ia64 linux kernel.  We call this mode the
-"fsys-mode".  To recap, the normal states of execution are:
-
-  - kernel mode:
-	Both the register stack and the memory stack have been
-	switched over to kernel memory.  The user-level state is saved
-	in a pt-regs structure at the top of the kernel memory stack.
-
-  - user mode:
-	Both the register stack and the kernel stack are in
-	user memory.  The user-level state is contained in the
-	CPU registers.
-
-  - bank 0 interruption-handling mode:
-	This is the non-interruptible state which all
-	interruption-handlers start execution in.  The user-level
-	state remains in the CPU registers and some kernel state may
-	be stored in bank 0 of registers r16-r31.
-
-In contrast, fsys-mode has the following special properties:
-
-  - execution is at privilege level 0 (most-privileged)
-
-  - CPU registers may contain a mixture of user-level and kernel-level
-    state (it is the responsibility of the kernel to ensure that no
-    security-sensitive kernel-level state is leaked back to
-    user-level)
-
-  - execution is interruptible and preemptible (an fsys-mode handler
-    can disable interrupts and avoid all other interruption-sources
-    to avoid preemption)
-
-  - neither the memory-stack nor the register-stack can be trusted while
-    in fsys-mode (they point to the user-level stacks, which may
-    be invalid, or completely bogus addresses)
-
-In summary, fsys-mode is much more similar to running in user-mode
-than it is to running in kernel-mode.  Of course, given that the
-privilege level is at level 0, this means that fsys-mode requires some
-care (see below).
-
-
-How to tell fsys-mode
-=====================
-
-Linux operates in fsys-mode when (a) the privilege level is 0 (most
-privileged) and (b) the stacks have NOT been switched to kernel memory
-yet.  For convenience, the header file <asm-ia64/ptrace.h> provides
-three macros::
-
-	user_mode(regs)
-	user_stack(task,regs)
-	fsys_mode(task,regs)
-
-The "regs" argument is a pointer to a pt_regs structure.  The "task"
-argument is a pointer to the task structure to which the "regs"
-pointer belongs to.  user_mode() returns TRUE if the CPU state pointed
-to by "regs" was executing in user mode (privilege level 3).
-user_stack() returns TRUE if the state pointed to by "regs" was
-executing on the user-level stack(s).  Finally, fsys_mode() returns
-TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
-The fsys_mode() macro is equivalent to the expression::
-
-	!user_mode(regs) && user_stack(task,regs)
-
-How to write an fsyscall handler
-================================
-
-The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
-(fsyscall_table).  This table contains one entry for each system call.
-By default, a system call is handled by fsys_fallback_syscall().  This
-routine takes care of entering (full) kernel mode and calling the
-normal Linux system call handler.  For performance-critical system
-calls, it is possible to write a hand-tuned fsyscall_handler.  For
-example, fsys.S contains fsys_getpid(), which is a hand-tuned version
-of the getpid() system call.
-
-The entry and exit-state of an fsyscall handler is as follows:
-
-Machine state on entry to fsyscall handler
-------------------------------------------
-
-  ========= ===============================================================
-  r10	    0
-  r11	    saved ar.pfs (a user-level value)
-  r15	    system call number
-  r16	    "current" task pointer (in normal kernel-mode, this is in r13)
-  r32-r39   system call arguments
-  b6	    return address (a user-level value)
-  ar.pfs    previous frame-state (a user-level value)
-  PSR.be    cleared to zero (i.e., little-endian byte order is in effect)
-  -         all other registers may contain values passed in from user-mode
-  ========= ===============================================================
-
-Required machine state on exit to fsyscall handler
---------------------------------------------------
-
-  ========= ===========================================================
-  r11	    saved ar.pfs (as passed into the fsyscall handler)
-  r15	    system call number (as passed into the fsyscall handler)
-  r32-r39   system call arguments (as passed into the fsyscall handler)
-  b6	    return address (as passed into the fsyscall handler)
-  ar.pfs    previous frame-state (as passed into the fsyscall handler)
-  ========= ===========================================================
-
-Fsyscall handlers can execute with very little overhead, but with that
-speed comes a set of restrictions:
-
- * Fsyscall-handlers MUST check for any pending work in the flags
-   member of the thread-info structure and if any of the
-   TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
-   doing a full system call (by calling fsys_fallback_syscall).
-
- * Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
-   r15, b6, and ar.pfs) because they will be needed in case of a
-   system call restart.  Of course, all "preserved" registers also
-   must be preserved, in accordance to the normal calling conventions.
-
- * Fsyscall-handlers MUST check argument registers for containing a
-   NaT value before using them in any way that could trigger a
-   NaT-consumption fault.  If a system call argument is found to
-   contain a NaT value, an fsyscall-handler may return immediately
-   with r8=EINVAL, r10=-1.
-
- * Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
-   any other operation that would trigger mandatory RSE
-   (register-stack engine) traffic.
-
- * Fsyscall-handlers MUST NOT write to any stacked registers because
-   it is not safe to assume that user-level called a handler with the
-   proper number of arguments.
-
- * Fsyscall-handlers need to be careful when accessing per-CPU variables:
-   unless proper safe-guards are taken (e.g., interruptions are avoided),
-   execution may be pre-empted and resumed on another CPU at any given
-   time.
-
- * Fsyscall-handlers must be careful not to leak sensitive kernel'
-   information back to user-level.  In particular, before returning to
-   user-level, care needs to be taken to clear any scratch registers
-   that could contain sensitive information (note that the current
-   task pointer is not considered sensitive: it's already exposed
-   through ar.k6).
-
- * Fsyscall-handlers MUST NOT access user-memory without first
-   validating access-permission (this can be done typically via
-   probe.r.fault and/or probe.w.fault) and without guarding against
-   memory access exceptions (this can be done with the EX() macros
-   defined by asmmacro.h).
-
-The above restrictions may seem draconian, but remember that it's
-possible to trade off some of the restrictions by paying a slightly
-higher overhead.  For example, if an fsyscall-handler could benefit
-from the shadow register bank, it could temporarily disable PSR.i and
-PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
-needed.  In other words, following the above rules yields extremely
-fast system call execution (while fully preserving system call
-semantics), but there is also a lot of flexibility in handling more
-complicated cases.
-
-Signal handling
-===============
-
-The delivery of (asynchronous) signals must be delayed until fsys-mode
-is exited.  This is accomplished with the help of the lower-privilege
-transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
-checks whether the interrupted task was in fsys-mode and, if so, sets
-PSR.lp and returns immediately.  When fsys-mode is exited via the
-"br.ret" instruction that lowers the privilege level, a trap will
-occur.  The trap handler clears PSR.lp again and returns immediately.
-The kernel exit path then checks for and delivers any pending signals.
-
-PSR Handling
-============
-
-The "epc" instruction doesn't change the contents of PSR at all.  This
-is in contrast to a regular interruption, which clears almost all
-bits.  Because of that, some care needs to be taken to ensure things
-work as expected.  The following discussion describes how each PSR bit
-is handled.
-
-======= =======================================================================
-PSR.be	Cleared when entering fsys-mode.  A srlz.d instruction is used
-	to ensure the CPU is in little-endian mode before the first
-	load/store instruction is executed.  PSR.be is normally NOT
-	restored upon return from an fsys-mode handler.  In other
-	words, user-level code must not rely on PSR.be being preserved
-	across a system call.
-PSR.up	Unchanged.
-PSR.ac	Unchanged.
-PSR.mfl Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.mfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.ic	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
-PSR.i	Unchanged.  Note: fsys-mode handlers can clear the bit, if needed.
-PSR.pk	Unchanged.
-PSR.dt	Unchanged.
-PSR.dfl	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.dfh	Unchanged.  Note: fsys-mode handlers must not write-registers!
-PSR.sp	Unchanged.
-PSR.pp	Unchanged.
-PSR.di	Unchanged.
-PSR.si	Unchanged.
-PSR.db	Unchanged.  The kernel prevents user-level from setting a hardware
-	breakpoint that triggers at any privilege level other than
-	3 (user-mode).
-PSR.lp	Unchanged.
-PSR.tb	Lazy redirect.  If a taken-branch trap occurs while in
-	fsys-mode, the trap-handler modifies the saved machine state
-	such that execution resumes in the gate page at
-	syscall_via_break(), with privilege level 3.  Note: the
-	taken branch would occur on the branch invoking the
-	fsyscall-handler, at which point, by definition, a syscall
-	restart is still safe.  If the system call number is invalid,
-	the fsys-mode handler will return directly to user-level.  This
-	return will trigger a taken-branch trap, but since the trap is
-	taken _after_ restoring the privilege level, the CPU has already
-	left fsys-mode, so no special treatment is needed.
-PSR.rt	Unchanged.
-PSR.cpl	Cleared to 0.
-PSR.is	Unchanged (guaranteed to be 0 on entry to the gate page).
-PSR.mc	Unchanged.
-PSR.it	Unchanged (guaranteed to be 1).
-PSR.id	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.da	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.dd	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-PSR.ss	Lazy redirect.  If set, "epc" will cause a Single Step Trap to
-	be taken.  The trap handler then modifies the saved machine
-	state such that execution resumes in the gate page at
-	syscall_via_break(), with privilege level 3.
-PSR.ri	Unchanged.
-PSR.ed	Unchanged.  Note: This bit could only have an effect if an fsys-mode
-	handler performed a speculative load that gets NaTted.  If so, this
-	would be the normal & expected behavior, so no special treatment is
-	needed.
-PSR.bn	Unchanged.  Note: fsys-mode handlers may clear the bit, if needed.
-	Doing so requires clearing PSR.i and PSR.ic as well.
-PSR.ia	Unchanged.  Note: the ia64 linux kernel never sets this bit.
-======= =======================================================================
-
-Using fast system calls
-=======================
-
-To use fast system calls, userspace applications need simply call
-__kernel_syscall_via_epc().  For example
-
--- example fgettimeofday() call --
-
--- fgettimeofday.S --
-
-::
-
-  #include <asm/asmmacro.h>
-
-  GLOBAL_ENTRY(fgettimeofday)
-  .prologue
-  .save ar.pfs, r11
-  mov r11 = ar.pfs
-  .body
-
-  mov r2 = 0xa000000000020660;;  // gate address
-			       // found by inspection of System.map for the
-			       // __kernel_syscall_via_epc() function.  See
-			       // below for how to do this for real.
-
-  mov b7 = r2
-  mov r15 = 1087		       // gettimeofday syscall
-  ;;
-  br.call.sptk.many b6 = b7
-  ;;
-
-  .restore sp
-
-  mov ar.pfs = r11
-  br.ret.sptk.many rp;;	      // return to caller
-  END(fgettimeofday)
-
--- end fgettimeofday.S --
-
-In reality, getting the gate address is accomplished by two extra
-values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
-
- * AT_SYSINFO : is the address of __kernel_syscall_via_epc()
- * AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
-
-The ELF DSO is a pre-linked library that is mapped in by the kernel at
-the gate page.  It is a proper ELF shared object so, with a dynamic
-loader that recognises the library, you should be able to make calls to
-the exported functions within it as with any other shared library.
-AT_SYSINFO points into the kernel DSO at the
-__kernel_syscall_via_epc() function for historical reasons (it was
-used before the kernel DSO) and as a convenience.
diff --git a/Documentation/arch/ia64/ia64.rst b/Documentation/arch/ia64/ia64.rst
deleted file mode 100644
index b725019a9492..000000000000
--- a/Documentation/arch/ia64/ia64.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-===========================================
-Linux kernel release for the IA-64 Platform
-===========================================
-
-   These are the release notes for Linux since version 2.4 for IA-64
-   platform.  This document provides information specific to IA-64
-   ONLY, to get additional information about the Linux kernel also
-   read the original Linux README provided with the kernel.
-
-Installing the Kernel
-=====================
-
- - IA-64 kernel installation is the same as the other platforms, see
-   original README for details.
-
-
-Software Requirements
-=====================
-
-   Compiling and running this kernel requires an IA-64 compliant GCC
-   compiler.  And various software packages also compiled with an
-   IA-64 compliant GCC compiler.
-
-
-Configuring the Kernel
-======================
-
-   Configuration is the same, see original README for details.
-
-
-Compiling the Kernel:
-
- - Compiling this kernel doesn't differ from other platform so read
-   the original README for details BUT make sure you have an IA-64
-   compliant GCC compiler.
-
-IA-64 Specifics
-===============
-
- - General issues:
-
-    * Hardly any performance tuning has been done. Obvious targets
-      include the library routines (IP checksum, etc.). Less
-      obvious targets include making sure we don't flush the TLB
-      needlessly, etc.
-
-    * SMP locks cleanup/optimization
-
-    * IA32 support.  Currently experimental.  It mostly works.
diff --git a/Documentation/arch/ia64/index.rst b/Documentation/arch/ia64/index.rst
deleted file mode 100644
index 761f2154dfa2..000000000000
--- a/Documentation/arch/ia64/index.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==================
-IA-64 Architecture
-==================
-
-.. toctree::
-   :maxdepth: 1
-
-   ia64
-   aliasing
-   efirtc
-   err_inject
-   fsys
-   irq-redir
-   mca
-   serial
-
-   features
diff --git a/Documentation/arch/ia64/irq-redir.rst b/Documentation/arch/ia64/irq-redir.rst
deleted file mode 100644
index 6bbbbe4f73ef..000000000000
--- a/Documentation/arch/ia64/irq-redir.rst
+++ /dev/null
@@ -1,80 +0,0 @@
-==============================
-IRQ affinity on IA64 platforms
-==============================
-
-07.01.2002, Erich Focht <efocht@ess.nec.de>
-
-
-By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
-controlled. The behavior on IA64 platforms is slightly different from
-that described in Documentation/core-api/irq/irq-affinity.rst for i386 systems.
-
-Because of the usage of SAPIC mode and physical destination mode the
-IRQ target is one particular CPU and cannot be a mask of several
-CPUs. Only the first non-zero bit is taken into account.
-
-
-Usage examples
-==============
-
-The target CPU has to be specified as a hexadecimal CPU mask. The
-first non-zero bit is the selected CPU. This format has been kept for
-compatibility reasons with i386.
-
-Set the delivery mode of interrupt 41 to fixed and route the
-interrupts to CPU #3 (logical CPU number) (2^3=0x08)::
-
-     echo "8" >/proc/irq/41/smp_affinity
-
-Set the default route for IRQ number 41 to CPU 6 in lowest priority
-delivery mode (redirectable)::
-
-     echo "r 40" >/proc/irq/41/smp_affinity
-
-The output of the command::
-
-     cat /proc/irq/IRQ#/smp_affinity
-
-gives the target CPU mask for the specified interrupt vector. If the CPU
-mask is preceded by the character "r", the interrupt is redirectable
-(i.e. lowest priority mode routing is used), otherwise its route is
-fixed.
-
-
-
-Initialization and default behavior
-===================================
-
-If the platform features IRQ redirection (info provided by SAL) all
-IO-SAPIC interrupts are initialized with CPU#0 as their default target
-and the routing is the so called "lowest priority mode" (actually
-fixed SAPIC mode with hint). The XTP chipset registers are used as hints
-for the IRQ routing. Currently in Linux XTP registers can have three
-values:
-
-	- minimal for an idle task,
-	- normal if any other task runs,
-	- maximal if the CPU is going to be switched off.
-
-The IRQ is routed to the CPU with lowest XTP register value, the
-search begins at the default CPU. Therefore most of the interrupts
-will be handled by CPU #0.
-
-If the platform doesn't feature interrupt redirection IOSAPIC fixed
-routing is used. The target CPUs are distributed in a round robin
-manner. IRQs will be routed only to the selected target CPUs. Check
-with::
-
-        cat /proc/interrupts
-
-
-
-Comments
-========
-
-On large (multi-node) systems it is recommended to route the IRQs to
-the node to which the corresponding device is connected.
-For systems like the NEC AzusA we get IRQ node-affinity for free. This
-is because usually the chipsets on each node redirect the interrupts
-only to their own CPUs (as they cannot see the XTP registers on the
-other nodes).
diff --git a/Documentation/arch/ia64/mca.rst b/Documentation/arch/ia64/mca.rst
deleted file mode 100644
index 08270bba44a4..000000000000
--- a/Documentation/arch/ia64/mca.rst
+++ /dev/null
@@ -1,198 +0,0 @@
-=============================================================
-An ad-hoc collection of notes on IA64 MCA and INIT processing
-=============================================================
-
-Feel free to update it with notes about any area that is not clear.
-
----
-
-MCA/INIT are completely asynchronous.  They can occur at any time, when
-the OS is in any state.  Including when one of the cpus is already
-holding a spinlock.  Trying to get any lock from MCA/INIT state is
-asking for deadlock.  Also the state of structures that are protected
-by locks is indeterminate, including linked lists.
-
----
-
-The complicated ia64 MCA process.  All of this is mandated by Intel's
-specification for ia64 SAL, error recovery and unwind, it is not as
-if we have a choice here.
-
-* MCA occurs on one cpu, usually due to a double bit memory error.
-  This is the monarch cpu.
-
-* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
-  to all the other cpus, the slaves.
-
-* Slave cpus that receive the MCA interrupt call down into SAL, they
-  end up spinning disabled while the MCA is being serviced.
-
-* If any slave cpu was already spinning disabled when the MCA occurred
-  then it cannot service the MCA interrupt.  SAL waits ~20 seconds then
-  sends an unmaskable INIT event to the slave cpus that have not
-  already rendezvoused.
-
-* Because MCA/INIT can be delivered at any time, including when the cpu
-  is down in PAL in physical mode, the registers at the time of the
-  event are _completely_ undefined.  In particular the MCA/INIT
-  handlers cannot rely on the thread pointer, PAL physical mode can
-  (and does) modify TP.  It is allowed to do that as long as it resets
-  TP on return.  However MCA/INIT events expose us to these PAL
-  internal TP changes.  Hence curr_task().
-
-* If an MCA/INIT event occurs while the kernel was running (not user
-  space) and the kernel has called PAL then the MCA/INIT handler cannot
-  assume that the kernel stack is in a fit state to be used.  Mainly
-  because PAL may or may not maintain the stack pointer internally.
-  Because the MCA/INIT handlers cannot trust the kernel stack, they
-  have to use their own, per-cpu stacks.  The MCA/INIT stacks are
-  preformatted with just enough task state to let the relevant handlers
-  do their job.
-
-* Unlike most other architectures, the ia64 struct task is embedded in
-  the kernel stack[1].  So switching to a new kernel stack means that
-  we switch to a new task as well.  Because various bits of the kernel
-  assume that current points into the struct task, switching to a new
-  stack also means a new value for current.
-
-* Once all slaves have rendezvoused and are spinning disabled, the
-  monarch is entered.  The monarch now tries to diagnose the problem
-  and decide if it can recover or not.
-
-* Part of the monarch's job is to look at the state of all the other
-  tasks.  The only way to do that on ia64 is to call the unwinder,
-  as mandated by Intel.
-
-* The starting point for the unwind depends on whether a task is
-  running or not.  That is, whether it is on a cpu or is blocked.  The
-  monarch has to determine whether or not a task is on a cpu before it
-  knows how to start unwinding it.  The tasks that received an MCA or
-  INIT event are no longer running, they have been converted to blocked
-  tasks.  But (and its a big but), the cpus that received the MCA
-  rendezvous interrupt are still running on their normal kernel stacks!
-
-* To distinguish between these two cases, the monarch must know which
-  tasks are on a cpu and which are not.  Hence each slave cpu that
-  switches to an MCA/INIT stack, registers its new stack using
-  set_curr_task(), so the monarch can tell that the _original_ task is
-  no longer running on that cpu.  That gives us a decent chance of
-  getting a valid backtrace of the _original_ task.
-
-* MCA/INIT can be nested, to a depth of 2 on any cpu.  In the case of a
-  nested error, we want diagnostics on the MCA/INIT handler that
-  failed, not on the task that was originally running.  Again this
-  requires set_curr_task() so the MCA/INIT handlers can register their
-  own stack as running on that cpu.  Then a recursive error gets a
-  trace of the failing handler's "task".
-
-[1]
-    My (Keith Owens) original design called for ia64 to separate its
-    struct task and the kernel stacks.  Then the MCA/INIT data would be
-    chained stacks like i386 interrupt stacks.  But that required
-    radical surgery on the rest of ia64, plus extra hard wired TLB
-    entries with its associated performance degradation.  David
-    Mosberger vetoed that approach.  Which meant that separate kernel
-    stacks meant separate "tasks" for the MCA/INIT handlers.
-
----
-
-INIT is less complicated than MCA.  Pressing the nmi button or using
-the equivalent command on the management console sends INIT to all
-cpus.  SAL picks one of the cpus as the monarch and the rest are
-slaves.  All the OS INIT handlers are entered at approximately the same
-time.  The OS monarch prints the state of all tasks and returns, after
-which the slaves return and the system resumes.
-
-At least that is what is supposed to happen.  Alas there are broken
-versions of SAL out there.  Some drive all the cpus as monarchs.  Some
-drive them all as slaves.  Some drive one cpu as monarch, wait for that
-cpu to return from the OS then drive the rest as slaves.  Some versions
-of SAL cannot even cope with returning from the OS, they spin inside
-SAL on resume.  The OS INIT code has workarounds for some of these
-broken SAL symptoms, but some simply cannot be fixed from the OS side.
-
----
-
-The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
-violations.  Unfortunately MCA/INIT start off as massive layer
-violations (can occur at _any_ time) and they build from there.
-
-At least ia64 makes an attempt at recovering from hardware errors, but
-it is a difficult problem because of the asynchronous nature of these
-errors.  When processing an unmaskable interrupt we sometimes need
-special code to cope with our inability to take any locks.
-
----
-
-How is ia64 MCA/INIT different from x86 NMI?
-
-* x86 NMI typically gets delivered to one cpu.  MCA/INIT gets sent to
-  all cpus.
-
-* x86 NMI cannot be nested.  MCA/INIT can be nested, to a depth of 2
-  per cpu.
-
-* x86 has a separate struct task which points to one of multiple kernel
-  stacks.  ia64 has the struct task embedded in the single kernel
-  stack, so switching stack means switching task.
-
-* x86 does not call the BIOS so the NMI handler does not have to worry
-  about any registers having changed.  MCA/INIT can occur while the cpu
-  is in PAL in physical mode, with undefined registers and an undefined
-  kernel stack.
-
-* i386 backtrace is not very sensitive to whether a process is running
-  or not.  ia64 unwind is very, very sensitive to whether a process is
-  running or not.
-
----
-
-What happens when MCA/INIT is delivered what a cpu is running user
-space code?
-
-The user mode registers are stored in the RSE area of the MCA/INIT on
-entry to the OS and are restored from there on return to SAL, so user
-mode registers are preserved across a recoverable MCA/INIT.  Since the
-OS has no idea what unwind data is available for the user space stack,
-MCA/INIT never tries to backtrace user space.  Which means that the OS
-does not bother making the user space process look like a blocked task,
-i.e. the OS does not copy pt_regs and switch_stack to the user space
-stack.  Also the OS has no idea how big the user space RSE and memory
-stacks are, which makes it too risky to copy the saved state to a user
-mode stack.
-
----
-
-How do we get a backtrace on the tasks that were running when MCA/INIT
-was delivered?
-
-mca.c:::ia64_mca_modify_original_stack().  That identifies and
-verifies the original kernel stack, copies the dirty registers from
-the MCA/INIT stack's RSE to the original stack's RSE, copies the
-skeleton struct pt_regs and switch_stack to the original stack, fills
-in the skeleton structures from the PAL minstate area and updates the
-original stack's thread.ksp.  That makes the original stack look
-exactly like any other blocked task, i.e. it now appears to be
-sleeping.  To get a backtrace, just start with thread.ksp for the
-original task and unwind like any other sleeping task.
-
----
-
-How do we identify the tasks that were running when MCA/INIT was
-delivered?
-
-If the previous task has been verified and converted to a blocked
-state, then sos->prev_task on the MCA/INIT stack is updated to point to
-the previous task.  You can look at that field in dumps or debuggers.
-To help distinguish between the handler and the original tasks,
-handlers have _TIF_MCA_INIT set in thread_info.flags.
-
-The sos data is always in the MCA/INIT handler stack, at offset
-MCA_SOS_OFFSET.  You can get that value from mca_asm.h or calculate it
-as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
-ia64_sal_os_state), with 16 byte alignment for all structures.
-
-Also the comm field of the MCA/INIT task is modified to include the pid
-of the original task, for humans to use.  For example, a comm field of
-'MCA 12159' means that pid 12159 was running when the MCA was
-delivered.
diff --git a/Documentation/arch/ia64/serial.rst b/Documentation/arch/ia64/serial.rst
deleted file mode 100644
index 1de70c305a79..000000000000
--- a/Documentation/arch/ia64/serial.rst
+++ /dev/null
@@ -1,165 +0,0 @@
-==============
-Serial Devices
-==============
-
-Serial Device Naming
-====================
-
-    As of 2.6.10, serial devices on ia64 are named based on the
-    order of ACPI and PCI enumeration.  The first device in the
-    ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
-    /dev/ttyS1, etc., and PCI devices are named sequentially
-    starting after the ACPI devices.
-
-    Prior to 2.6.10, there were confusing exceptions to this:
-
-	- Firmware on some machines (mostly from HP) provides an HCDP
-	  table[1] that tells the kernel about devices that can be used
-	  as a serial console.  If the user specified "console=ttyS0"
-	  or the EFI ConOut path contained only UART devices, the
-	  kernel registered the device described by the HCDP as
-	  /dev/ttyS0.
-
-	- If there was no HCDP, we assumed there were UARTs at the
-	  legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
-	  the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
-
-    Any additional ACPI or PCI devices were registered sequentially
-    after /dev/ttyS0 as they were discovered.
-
-    With an HCDP, device names changed depending on EFI configuration
-    and "console=" arguments.  Without an HCDP, device names didn't
-    change, but we registered devices that might not really exist.
-
-    For example, an HP rx1600 with a single built-in serial port
-    (described in the ACPI namespace) plus an MP[2] (a PCI device) has
-    these ports:
-
-      ==========  ==========     ============    ============   =======
-      Type        MMIO           pre-2.6.10      pre-2.6.10     2.6.10+
-		  address
-				 (EFI console    (EFI console
-                                 on builtin)     on MP port)
-      ==========  ==========     ============    ============   =======
-      builtin     0xff5e0000        ttyS0           ttyS1         ttyS0
-      MP UPS      0xf8031000        ttyS1           ttyS2         ttyS1
-      MP Console  0xf8030000        ttyS2           ttyS0         ttyS2
-      MP 2        0xf8030010        ttyS3           ttyS3         ttyS3
-      MP 3        0xf8030038        ttyS4           ttyS4         ttyS4
-      ==========  ==========     ============    ============   =======
-
-Console Selection
-=================
-
-    EFI knows what your console devices are, but it doesn't tell the
-    kernel quite enough to actually locate them.  The DIG64 HCDP
-    table[1] does tell the kernel where potential serial console
-    devices are, but not all firmware supplies it.  Also, EFI supports
-    multiple simultaneous consoles and doesn't tell the kernel which
-    should be the "primary" one.
-
-    So how do you tell Linux which console device to use?
-
-	- If your firmware supplies the HCDP, it is simplest to
-	  configure EFI with a single device (either a UART or a VGA
-	  card) as the console.  Then you don't need to tell Linux
-	  anything; the kernel will automatically use the EFI console.
-
-	  (This works only in 2.6.6 or later; prior to that you had
-	  to specify "console=ttyS0" to get a serial console.)
-
-	- Without an HCDP, Linux defaults to a VGA console unless you
-	  specify a "console=" argument.
-
-    NOTE: Don't assume that a serial console device will be /dev/ttyS0.
-    It might be ttyS1, ttyS2, etc.  Make sure you have the appropriate
-    entries in /etc/inittab (for getty) and /etc/securetty (to allow
-    root login).
-
-Early Serial Console
-====================
-
-    The kernel can't start using a serial console until it knows where
-    the device lives.  Normally this happens when the driver enumerates
-    all the serial devices, which can happen a minute or more after the
-    kernel starts booting.
-
-    2.6.10 and later kernels have an "early uart" driver that works
-    very early in the boot process.  The kernel will automatically use
-    this if the user supplies an argument like "console=uart,io,0x3f8",
-    or if the EFI console path contains only a UART device and the
-    firmware supplies an HCDP.
-
-Troubleshooting Serial Console Problems
-=======================================
-
-    No kernel output after elilo prints "Uncompressing Linux... done":
-
-	- You specified "console=ttyS0" but Linux changed the device
-	  to which ttyS0 refers.  Configure exactly one EFI console
-	  device[3] and remove the "console=" option.
-
-	- The EFI console path contains both a VGA device and a UART.
-	  EFI and elilo use both, but Linux defaults to VGA.  Remove
-	  the VGA device from the EFI console path[3].
-
-	- Multiple UARTs selected as EFI console devices.  EFI and
-	  elilo use all selected devices, but Linux uses only one.
-	  Make sure only one UART is selected in the EFI console
-	  path[3].
-
-	- You're connected to an HP MP port[2] but have a non-MP UART
-	  selected as EFI console device.  EFI uses the MP as a
-	  console device even when it isn't explicitly selected.
-	  Either move the console cable to the non-MP UART, or change
-	  the EFI console path[3] to the MP UART.
-
-    Long pause (60+ seconds) between "Uncompressing Linux... done" and
-    start of kernel output:
-
-	- No early console because you used "console=ttyS<n>".  Remove
-	  the "console=" option if your firmware supplies an HCDP.
-
-	- If you don't have an HCDP, the kernel doesn't know where
-	  your console lives until the driver discovers serial
-	  devices.  Use "console=uart,io,0x3f8" (or appropriate
-	  address for your machine).
-
-    Kernel and init script output works fine, but no "login:" prompt:
-
-	- Add getty entry to /etc/inittab for console tty.  Look for
-	  the "Adding console on ttyS<n>" message that tells you which
-	  device is the console.
-
-    "login:" prompt, but can't login as root:
-
-	- Add entry to /etc/securetty for console tty.
-
-    No ACPI serial devices found in 2.6.17 or later:
-
-	- Turn on CONFIG_PNP and CONFIG_PNPACPI.  Prior to 2.6.17, ACPI
-	  serial devices were discovered by 8250_acpi.  In 2.6.17,
-	  8250_acpi was replaced by the combination of 8250_pnp and
-	  CONFIG_PNPACPI.
-
-
-
-[1]
-    http://www.dig64.org/specifications/agreement
-    The table was originally defined as the "HCDP" for "Headless
-    Console/Debug Port."  The current version is the "PCDP" for
-    "Primary Console and Debug Port Devices."
-
-[2]
-    The HP MP (management processor) is a PCI device that provides
-    several UARTs.  One of the UARTs is often used as a console; the
-    EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
-    The external connection is usually a 25-pin connector, and a
-    special dongle converts that to three 9-pin connectors, one of
-    which is labelled "Console."
-
-[3]
-    EFI console devices are configured using the EFI Boot Manager
-    "Boot option maintenance" menu.  You may have to interrupt the
-    boot sequence to use this menu, and you will have to reset the
-    box after changing console configuration.
diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst
index 9511e405aabd..dcb0e379e5e8 100644
--- a/Documentation/core-api/cpu_hotplug.rst
+++ b/Documentation/core-api/cpu_hotplug.rst
@@ -40,12 +40,6 @@ Command Line Switches
   supplied here is lower than the number of physically available CPUs, then
   those CPUs can not be brought online later.
 
-``additional_cpus=n``
-  Use this to limit hotpluggable CPUs. This option sets
-  ``cpu_possible_mask = cpu_present_mask + additional_cpus``
-
-  This option is limited to the IA64 architecture.
-
 ``possible_cpus=n``
   This option sets ``possible_cpus`` bits in ``cpu_possible_mask``.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 90f13281d297..aac200bb0aca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9935,12 +9935,6 @@ F:	Documentation/driver-api/i3c
 F:	drivers/i3c/
 F:	include/linux/i3c/
 
-IA64 (Itanium) PLATFORM
-L:	linux-ia64@vger.kernel.org
-S:	Orphan
-F:	Documentation/arch/ia64/
-F:	arch/ia64/
-
 IBM Operation Panel Input Driver
 M:	Eddie James <eajames@linux.ibm.com>
 L:	linux-input@vger.kernel.org
@@ -16269,11 +16263,6 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/muxes/i2c-mux-pca9541.c
 
-PCDP - PRIMARY CONSOLE AND DEBUG PORT
-M:	Khalid Aziz <khalid@gonehiking.org>
-S:	Maintained
-F:	drivers/firmware/pcdp.*
-
 PCI DRIVER FOR AARDVARK (Marvell Armada 3700)
 M:	Thomas Petazzoni <thomas.petazzoni@bootlin.com>
 M:	Pali Rohár <pali@kernel.org>
diff --git a/arch/Kconfig b/arch/Kconfig
index 12d51495caec..f4b210ab0612 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1088,7 +1088,6 @@ config HAVE_ARCH_COMPAT_MMAP_BASES
 config PAGE_SIZE_LESS_THAN_64KB
 	def_bool y
 	depends on !ARM64_64K_PAGES
-	depends on !IA64_PAGE_SIZE_64KB
 	depends on !PAGE_SIZE_64KB
 	depends on !PARISC_PAGE_SIZE_64KB
 	depends on PAGE_SIZE_LESS_THAN_256KB
diff --git a/arch/ia64/Kbuild b/arch/ia64/Kbuild
deleted file mode 100644
index e77cc76d228c..000000000000
--- a/arch/ia64/Kbuild
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-y				+= kernel/ mm/
-obj-$(CONFIG_IA64_SGI_UV)	+= uv/
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
deleted file mode 100644
index 53faa122b0f4..000000000000
--- a/arch/ia64/Kconfig
+++ /dev/null
@@ -1,394 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-config PGTABLE_LEVELS
-	int "Page Table Levels" if !IA64_PAGE_SIZE_64KB
-	range 3 4 if !IA64_PAGE_SIZE_64KB
-	default 3
-
-menu "Processor type and features"
-
-config IA64
-	bool
-	select ARCH_BINFMT_ELF_EXTRA_PHDRS
-	select ARCH_HAS_CPU_FINALIZE_INIT
-	select ARCH_HAS_DMA_MARK_CLEAN
-	select ARCH_HAS_STRNCPY_FROM_USER
-	select ARCH_HAS_STRNLEN_USER
-	select ARCH_MIGHT_HAVE_PC_PARPORT
-	select ARCH_MIGHT_HAVE_PC_SERIO
-	select ACPI
-	select ACPI_NUMA if NUMA
-	select ARCH_ENABLE_MEMORY_HOTPLUG
-	select ARCH_ENABLE_MEMORY_HOTREMOVE
-	select ARCH_SUPPORTS_ACPI
-	select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
-	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
-	select FORCE_PCI
-	select PCI_DOMAINS if PCI
-	select PCI_MSI
-	select PCI_SYSCALL if PCI
-	select HAS_IOPORT
-	select HAVE_ASM_MODVERSIONS
-	select HAVE_UNSTABLE_SCHED_CLOCK
-	select HAVE_EXIT_THREAD
-	select HAVE_KPROBES
-	select HAVE_KRETPROBES
-	select HAVE_FTRACE_MCOUNT_RECORD
-	select HAVE_DYNAMIC_FTRACE if (!ITANIUM)
-	select HAVE_FUNCTION_TRACER
-	select HAVE_SETUP_PER_CPU_AREA
-	select TTY
-	select HAVE_ARCH_TRACEHOOK
-	select HAVE_FUNCTION_DESCRIPTORS
-	select HAVE_VIRT_CPU_ACCOUNTING
-	select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE
-	select GENERIC_IRQ_PROBE
-	select GENERIC_PENDING_IRQ if SMP
-	select GENERIC_IRQ_SHOW
-	select GENERIC_IRQ_LEGACY
-	select ARCH_HAVE_NMI_SAFE_CMPXCHG
-	select GENERIC_IOMAP
-	select GENERIC_IOREMAP
-	select GENERIC_SMP_IDLE_THREAD
-	select ARCH_TASK_STRUCT_ON_STACK
-	select ARCH_TASK_STRUCT_ALLOCATOR
-	select ARCH_THREAD_STACK_ALLOCATOR
-	select ARCH_CLOCKSOURCE_DATA
-	select GENERIC_TIME_VSYSCALL
-	select LEGACY_TIMER_TICK
-	select SWIOTLB
-	select SYSCTL_ARCH_UNALIGN_NO_WARN
-	select HAVE_MOD_ARCH_SPECIFIC
-	select MODULES_USE_ELF_RELA
-	select ARCH_USE_CMPXCHG_LOCKREF
-	select HAVE_ARCH_AUDITSYSCALL
-	select NEED_DMA_MAP_STATE
-	select NEED_SG_DMA_LENGTH
-	select NUMA if !FLATMEM
-	select PCI_MSI_ARCH_FALLBACKS if PCI_MSI
-	select ZONE_DMA32
-	select FUNCTION_ALIGNMENT_32B
-	default y
-	help
-	  The Itanium Processor Family is Intel's 64-bit successor to
-	  the 32-bit X86 line.  The IA-64 Linux project has a home
-	  page at <http://www.linuxia64.org/> and a mailing list at
-	  <linux-ia64@vger.kernel.org>.
-
-config 64BIT
-	bool
-	select ATA_NONSTANDARD if ATA
-	default y
-
-config MMU
-	bool
-	default y
-
-config STACKTRACE_SUPPORT
-	def_bool y
-
-config GENERIC_LOCKBREAK
-	def_bool n
-
-config GENERIC_CALIBRATE_DELAY
-	bool
-	default y
-
-config DMI
-	bool
-	default y
-	select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
-
-config EFI
-	bool
-	select UCS2_STRING
-	default y
-
-config SCHED_OMIT_FRAME_POINTER
-	bool
-	default y
-
-config IA64_UNCACHED_ALLOCATOR
-	bool
-	select GENERIC_ALLOCATOR
-
-config ARCH_USES_PG_UNCACHED
-	def_bool y
-	depends on IA64_UNCACHED_ALLOCATOR
-
-config AUDIT_ARCH
-	bool
-	default y
-
-choice
-	prompt "Processor type"
-	default ITANIUM
-
-config ITANIUM
-	bool "Itanium"
-	help
-	  Select your IA-64 processor type.  The default is Itanium.
-	  This choice is safe for all IA-64 systems, but may not perform
-	  optimally on systems with, say, Itanium 2 or newer processors.
-
-config MCKINLEY
-	bool "Itanium 2"
-	help
-	  Select this to configure for an Itanium 2 (McKinley) processor.
-
-endchoice
-
-choice
-	prompt "Kernel page size"
-	default IA64_PAGE_SIZE_16KB
-
-config IA64_PAGE_SIZE_4KB
-	bool "4KB"
-	help
-	  This lets you select the page size of the kernel.  For best IA-64
-	  performance, a page size of 8KB or 16KB is recommended.  For best
-	  IA-32 compatibility, a page size of 4KB should be selected (the vast
-	  majority of IA-32 binaries work perfectly fine with a larger page
-	  size).  For Itanium 2 or newer systems, a page size of 64KB can also
-	  be selected.
-
-	  4KB                For best IA-32 compatibility
-	  8KB                For best IA-64 performance
-	  16KB               For best IA-64 performance
-	  64KB               Requires Itanium 2 or newer processor.
-
-	  If you don't know what to do, choose 16KB.
-
-config IA64_PAGE_SIZE_8KB
-	bool "8KB"
-
-config IA64_PAGE_SIZE_16KB
-	bool "16KB"
-
-config IA64_PAGE_SIZE_64KB
-	depends on !ITANIUM
-	bool "64KB"
-
-endchoice
-
-source "kernel/Kconfig.hz"
-
-config IA64_BRL_EMU
-	bool
-	depends on ITANIUM
-	default y
-
-# align cache-sensitive data to 128 bytes
-config IA64_L1_CACHE_SHIFT
-	int
-	default "7" if MCKINLEY
-	default "6" if ITANIUM
-
-config IA64_SGI_UV
-	bool "SGI-UV support"
-	help
-	  Selecting this option will add specific support for running on SGI
-	  UV based systems.  If you have an SGI UV system or are building a
-	  distro kernel, select this option.
-
-config IA64_HP_SBA_IOMMU
-	bool "HP SBA IOMMU support"
-	select DMA_OPS
-	default y
-	help
-	  Say Y here to add support for the SBA IOMMU found on HP zx1 and
-	  sx1000 systems.  If you're unsure, answer Y.
-
-config IA64_CYCLONE
-	bool "Cyclone (EXA) Time Source support"
-	help
-	  Say Y here to enable support for IBM EXA Cyclone time source.
-	  If you're unsure, answer N.
-
-config ARCH_FORCE_MAX_ORDER
-	int
-	default "16" if HUGETLB_PAGE
-	default "10"
-
-config SMP
-	bool "Symmetric multi-processing support"
-	help
-	  This enables support for systems with more than one CPU. If you have
-	  a system with only one CPU, say N.  If you have a system with more
-	  than one CPU, say Y.
-
-	  If you say N here, the kernel will run on single and multiprocessor
-	  systems, but will use only one CPU of a multiprocessor system.  If
-	  you say Y here, the kernel will run on many, but not all,
-	  single processor systems.  On a single processor system, the kernel
-	  will run faster if you say N here.
-
-	  See also the SMP-HOWTO available at
-	  <http://www.tldp.org/docs.html#howto>.
-
-	  If you don't know what to do here, say N.
-
-config NR_CPUS
-	int "Maximum number of CPUs (2-4096)"
-	range 2 4096
-	depends on SMP
-	default "4096"
-	help
-	  You should set this to the number of CPUs in your system, but
-	  keep in mind that a kernel compiled for, e.g., 2 CPUs will boot but
-	  only use 2 CPUs on a >2 CPU system.  Setting this to a value larger
-	  than 64 will cause the use of a CPU mask array, causing a small
-	  performance hit.
-
-config HOTPLUG_CPU
-	bool "Support for hot-pluggable CPUs"
-	depends on SMP
-	default n
-	help
-	  Say Y here to experiment with turning CPUs off and on.  CPUs
-	  can be controlled through /sys/devices/system/cpu/cpu#.
-	  Say N if you want to disable CPU hotplug.
-
-config SCHED_SMT
-	bool "SMT scheduler support"
-	depends on SMP
-	help
-	  Improves the CPU scheduler's decision making when dealing with
-	  Intel IA64 chips with MultiThreading at a cost of slightly increased
-	  overhead in some places. If unsure say N here.
-
-config PERMIT_BSP_REMOVE
-	bool "Support removal of Bootstrap Processor"
-	depends on HOTPLUG_CPU
-	default n
-	help
-	Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU
-	support. 
-
-config FORCE_CPEI_RETARGET
-	bool "Force assumption that CPEI can be re-targeted"
-	depends on PERMIT_BSP_REMOVE
-	default n
-	help
-	Say Y if you need to force the assumption that CPEI can be re-targeted to
-	any cpu in the system. This hint is available via ACPI 3.0 specifications.
-	Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP.
-	This option it useful to enable this feature on older BIOS's as well.
-	You can also enable this by using boot command line option force_cpei=1.
-
-config ARCH_SELECT_MEMORY_MODEL
-	def_bool y
-
-config ARCH_FLATMEM_ENABLE
-	def_bool y
-
-config ARCH_SPARSEMEM_ENABLE
-	def_bool y
-	select SPARSEMEM_VMEMMAP_ENABLE
-
-config ARCH_SPARSEMEM_DEFAULT
-	def_bool y
-	depends on ARCH_SPARSEMEM_ENABLE
-
-config NUMA
-	bool "NUMA support"
-	depends on !FLATMEM
-	select SMP
-	select USE_PERCPU_NUMA_NODE_ID
-	help
-	  Say Y to compile the kernel to support NUMA (Non-Uniform Memory
-	  Access).  This option is for configuring high-end multiprocessor
-	  server systems.  If in doubt, say N.
-
-config NODES_SHIFT
-	int "Max num nodes shift(3-10)"
-	range 3 10
-	default "10"
-	depends on NUMA
-	help
-	  This option specifies the maximum number of nodes in your SSI system.
-	  MAX_NUMNODES will be 2^(This value).
-	  If in doubt, use the default.
-
-config HAVE_ARCH_NODEDATA_EXTENSION
-	def_bool y
-	depends on NUMA
-
-config HAVE_MEMORYLESS_NODES
-	def_bool NUMA
-
-config ARCH_PROC_KCORE_TEXT
-	def_bool y
-	depends on PROC_KCORE
-
-config IA64_MCA_RECOVERY
-	bool "MCA recovery from errors other than TLB."
-
-config IA64_PALINFO
-	tristate "/proc/pal support"
-	help
-	  If you say Y here, you are able to get PAL (Processor Abstraction
-	  Layer) information in /proc/pal.  This contains useful information
-	  about the processors in your systems, such as cache and TLB sizes
-	  and the PAL firmware version in use.
-
-	  To use this option, you have to ensure that the "/proc file system
-	  support" (CONFIG_PROC_FS) is enabled, too.
-
-config IA64_MC_ERR_INJECT
-	tristate "MC error injection support"
-	help
-	  Adds support for MC error injection. If enabled, the kernel 
-	  will provide a sysfs interface for user applications to
-	  call MC error injection PAL procedures to inject various errors.
-	  This is a useful tool for MCA testing.
-
-	  If you're unsure, do not select this option.
-
-config IA64_ESI
-	bool "ESI (Extensible SAL Interface) support"
-	help
-	  If you say Y here, support is built into the kernel to
-	  make ESI calls.  ESI calls are used to support vendor-specific
-	  firmware extensions, such as the ability to inject memory-errors
-	  for test-purposes.  If you're unsure, say N.
-
-config IA64_HP_AML_NFW
-	bool "Support ACPI AML calls to native firmware"
-	help
-	  This driver installs a global ACPI Operation Region handler for
-	  region 0xA1.  AML methods can use this OpRegion to call arbitrary
-	  native firmware functions.  The driver installs the OpRegion
-	  handler if there is an HPQ5001 device or if the user supplies
-	  the "force" module parameter, e.g., with the "aml_nfw.force"
-	  kernel command line option.
-
-endmenu
-
-config ARCH_SUPPORTS_KEXEC
-	def_bool !SMP || HOTPLUG_CPU
-
-config ARCH_SUPPORTS_CRASH_DUMP
-	def_bool IA64_MCA_RECOVERY && (!SMP || HOTPLUG_CPU)
-
-menu "Power management and ACPI options"
-
-source "kernel/power/Kconfig"
-
-source "drivers/acpi/Kconfig"
-
-if PM
-menu "CPU Frequency scaling"
-source "drivers/cpufreq/Kconfig"
-endmenu
-endif
-
-endmenu
-
-config MSPEC
-	tristate "Memory special operations driver"
-	depends on IA64
-	select IA64_UNCACHED_ALLOCATOR
-	help
-	  If you have an ia64 and you want to enable memory special
-	  operations support (formerly known as fetchop), say Y here,
-	  otherwise say N.
diff --git a/arch/ia64/Kconfig.debug b/arch/ia64/Kconfig.debug
deleted file mode 100644
index 2ce008e2d164..000000000000
--- a/arch/ia64/Kconfig.debug
+++ /dev/null
@@ -1,55 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-choice
-	prompt "Physical memory granularity"
-	default IA64_GRANULE_64MB
-
-config IA64_GRANULE_16MB
-	bool "16MB"
-	help
-	  IA-64 identity-mapped regions use a large page size called "granules".
-
-	  Select "16MB" for a small granule size.
-	  Select "64MB" for a large granule size.  This is the current default.
-
-config IA64_GRANULE_64MB
-	bool "64MB"
-	depends on BROKEN
-
-endchoice
-
-config IA64_PRINT_HAZARDS
-	bool "Print possible IA-64 dependency violations to console"
-	depends on DEBUG_KERNEL
-	help
-	  Selecting this option prints more information for Illegal Dependency
-	  Faults, that is, for Read-after-Write (RAW), Write-after-Write (WAW),
-	  or Write-after-Read (WAR) violations.  This option is ignored if you
-	  are compiling for an Itanium A step processor
-	  (CONFIG_ITANIUM_ASTEP_SPECIFIC).  If you're unsure, select Y.
-
-config DISABLE_VHPT
-	bool "Disable VHPT"
-	depends on DEBUG_KERNEL
-	help
-	  The Virtual Hash Page Table (VHPT) enhances virtual address
-	  translation performance.  Normally you want the VHPT active but you
-	  can select this option to disable the VHPT for debugging.  If you're
-	  unsure, answer N.
-
-config IA64_DEBUG_CMPXCHG
-	bool "Turn on compare-and-exchange bug checking (slow!)"
-	depends on DEBUG_KERNEL && PRINTK
-	help
-	  Selecting this option turns on bug checking for the IA-64
-	  compare-and-exchange instructions.  This is slow!  Itaniums
-	  from step B3 or later don't have this problem. If you're unsure,
-	  select N.
-
-config IA64_DEBUG_IRQ
-	bool "Turn on irq debug checks (slow!)"
-	depends on DEBUG_KERNEL
-	help
-	  Selecting this option turns on bug checking for the IA-64 irq_save
-	  and restore instructions.  It's useful for tracking down spinlock
-	  problems, but slow!  If you're unsure, select N.
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
deleted file mode 100644
index d553ab7022fe..000000000000
--- a/arch/ia64/Makefile
+++ /dev/null
@@ -1,82 +0,0 @@
-#
-# ia64/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1998-2004 by David Mosberger-Tang <davidm@hpl.hp.com>
-#
-
-KBUILD_DEFCONFIG := generic_defconfig
-
-NM := $(CROSS_COMPILE)nm -B
-
-CHECKFLAGS	+= -D__ia64=1 -D__ia64__=1 -D_LP64 -D__LP64__
-
-OBJCOPYFLAGS	:= --strip-all
-LDFLAGS_vmlinux	:= -static
-KBUILD_AFLAGS_KERNEL := -mconstant-gp
-EXTRA		:=
-
-cflags-y	:= -pipe $(EXTRA) -ffixed-r13 -mfixed-range=f12-f15,f32-f127 \
-		   -frename-registers -fno-optimize-sibling-calls
-KBUILD_CFLAGS_KERNEL := -mconstant-gp
-
-GAS_STATUS	= $(shell $(srctree)/arch/ia64/scripts/check-gas "$(CC)" "$(OBJDUMP)")
-KBUILD_CPPFLAGS += $(shell $(srctree)/arch/ia64/scripts/toolchain-flags "$(CC)" "$(OBJDUMP)" "$(READELF)")
-
-ifeq ($(GAS_STATUS),buggy)
-$(error Sorry, you need a newer version of the assember, one that is built from	\
-	a source-tree that post-dates 18-Dec-2002.  You can find a pre-compiled	\
-	static binary of such an assembler at:					\
-										\
-		ftp://ftp.hpl.hp.com/pub/linux-ia64/gas-030124.tar.gz)
-endif
-
-quiet_cmd_gzip = GZIP    $@
-cmd_gzip = cat $(real-prereqs) | $(KGZIP) -n -f -9 > $@
-
-quiet_cmd_objcopy = OBJCOPY $@
-cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
-
-KBUILD_CFLAGS += $(cflags-y)
-
-libs-y				+= arch/ia64/lib/
-
-drivers-y			+= arch/ia64/pci/ arch/ia64/hp/common/
-
-PHONY += compressed check
-
-all: compressed unwcheck
-
-compressed: vmlinux.gz
-
-vmlinuz: vmlinux.gz
-
-vmlinux.gz: vmlinux.bin FORCE
-	$(call if_changed,gzip)
-
-vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-unwcheck: vmlinux
-	-$(Q)READELF=$(READELF) $(PYTHON3) $(srctree)/arch/ia64/scripts/unwcheck.py $<
-
-archheaders:
-	$(Q)$(MAKE) $(build)=arch/ia64/kernel/syscalls all
-
-CLEAN_FILES += vmlinux.gz
-
-install: KBUILD_IMAGE := vmlinux.gz
-install:
-	$(call cmd,install)
-
-define archhelp
-  echo '* compressed	- Build compressed kernel image'
-  echo '  install	- Install compressed kernel image'
-  echo '* unwcheck	- Check vmlinux for invalid unwind info'
-endef
diff --git a/arch/ia64/configs/bigsur_defconfig b/arch/ia64/configs/bigsur_defconfig
deleted file mode 100644
index 7cb96db9a25d..000000000000
--- a/arch/ia64/configs/bigsur_defconfig
+++ /dev/null
@@ -1,102 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_LOG_BUF_SHIFT=16
-CONFIG_PROFILING=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=2
-CONFIG_PREEMPT=y
-CONFIG_IA64_PALINFO=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-# CONFIG_IPV6 is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=m
-CONFIG_ATA=m
-CONFIG_ATA_GENERIC=m
-CONFIG_ATA_PIIX=m
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_SPI_ATTRS=m
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_RAID10=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_I2C=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_DRM=m
-CONFIG_DRM_R128=m
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_CS4281=m
-CONFIG_USB_HIDDEV=y
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_UHCI_HCD=m
-CONFIG_USB_ACM=m
-CONFIG_USB_PRINTER=m
-CONFIG_USB_STORAGE=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT3_FS=y
-CONFIG_XFS_FS=y
-CONFIG_XFS_QUOTA=y
-CONFIG_XFS_POSIX_ACL=y
-CONFIG_AUTOFS_FS=m
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_CIFS_XATTR=y
-CONFIG_CIFS_POSIX=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_CRYPTO_MD5=y
-CONFIG_CRYPTO_DES=y
diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig
deleted file mode 100644
index 4581240013dd..000000000000
--- a/arch/ia64/configs/generic_defconfig
+++ /dev/null
@@ -1,206 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=20
-CONFIG_CGROUPS=y
-CONFIG_CPUSETS=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_MCKINLEY=y
-CONFIG_IA64_PAGE_SIZE_64KB=y
-CONFIG_IA64_CYCLONE=y
-CONFIG_SMP=y
-CONFIG_HOTPLUG_CPU=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_KEXEC=y
-CONFIG_CRASH_DUMP=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_DOCK=y
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_ACPI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_CONNECTOR=y
-# CONFIG_PNP_DEBUG_MESSAGES is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_SGI_XP=m
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_ATA_PIIX=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_SATA_VITESSE=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=m
-CONFIG_FUSION_SAS=y
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
-CONFIG_NETCONSOLE=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=m
-CONFIG_E100=m
-CONFIG_E1000=y
-CONFIG_IGB=y
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_GAMEPORT=m
-CONFIG_SERIAL_NONSTANDARD=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=6
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_HPET=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_AGP_HP_ZX1=m
-CONFIG_DRM=m
-CONFIG_DRM_TDFX=m
-CONFIG_DRM_R128=m
-CONFIG_DRM_RADEON=m
-CONFIG_DRM_MGA=m
-CONFIG_DRM_SIS=m
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_SEQ_DUMMY=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_VERBOSE_PRINTK=y
-CONFIG_SND_DUMMY=m
-CONFIG_SND_VIRMIDI=m
-CONFIG_SND_MTPAV=m
-CONFIG_SND_SERIAL_U16550=m
-CONFIG_SND_MPU401=m
-CONFIG_SND_CS4281=m
-CONFIG_SND_CS46XX=m
-CONFIG_SND_EMU10K1=m
-CONFIG_SND_FM801=m
-CONFIG_HID_GYRATION=m
-CONFIG_HID_PANTHERLORD=m
-CONFIG_HID_PETALYNX=m
-CONFIG_HID_SAMSUNG=m
-CONFIG_HID_SONY=m
-CONFIG_HID_SUNPLUS=m
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_EHCI_HCD=m
-CONFIG_USB_OHCI_HCD=m
-CONFIG_USB_UHCI_HCD=m
-CONFIG_USB_STORAGE=m
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_MTHCA=m
-CONFIG_INFINIBAND_IPOIB=m
-CONFIG_INTEL_IOMMU=y
-CONFIG_MSPEC=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_XFS_FS=y
-CONFIG_AUTOFS_FS=m
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_NTFS_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=m
-CONFIG_NLS_CODEPAGE_775=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_CODEPAGE_852=m
-CONFIG_NLS_CODEPAGE_855=m
-CONFIG_NLS_CODEPAGE_857=m
-CONFIG_NLS_CODEPAGE_860=m
-CONFIG_NLS_CODEPAGE_861=m
-CONFIG_NLS_CODEPAGE_862=m
-CONFIG_NLS_CODEPAGE_863=m
-CONFIG_NLS_CODEPAGE_864=m
-CONFIG_NLS_CODEPAGE_865=m
-CONFIG_NLS_CODEPAGE_866=m
-CONFIG_NLS_CODEPAGE_869=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_CODEPAGE_950=m
-CONFIG_NLS_CODEPAGE_932=m
-CONFIG_NLS_CODEPAGE_949=m
-CONFIG_NLS_CODEPAGE_874=m
-CONFIG_NLS_ISO8859_8=m
-CONFIG_NLS_CODEPAGE_1250=m
-CONFIG_NLS_CODEPAGE_1251=m
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=m
-CONFIG_NLS_ISO8859_3=m
-CONFIG_NLS_ISO8859_4=m
-CONFIG_NLS_ISO8859_5=m
-CONFIG_NLS_ISO8859_6=m
-CONFIG_NLS_ISO8859_7=m
-CONFIG_NLS_ISO8859_9=m
-CONFIG_NLS_ISO8859_13=m
-CONFIG_NLS_ISO8859_14=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_KOI8_R=m
-CONFIG_NLS_KOI8_U=m
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_MD5=y
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_CRC_T10DIF=y
diff --git a/arch/ia64/configs/gensparse_defconfig b/arch/ia64/configs/gensparse_defconfig
deleted file mode 100644
index c9e806616544..000000000000
--- a/arch/ia64/configs/gensparse_defconfig
+++ /dev/null
@@ -1,184 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=20
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_MCKINLEY=y
-CONFIG_IA64_CYCLONE=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=512
-CONFIG_HOTPLUG_CPU=y
-CONFIG_SPARSEMEM_MANUAL=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_HOTPLUG_PCI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_ATA_PIIX=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=m
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
-CONFIG_NETCONSOLE=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=m
-CONFIG_E100=m
-CONFIG_E1000=y
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_GAMEPORT=m
-CONFIG_SERIAL_NONSTANDARD=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=6
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_HPET=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_AGP_HP_ZX1=m
-CONFIG_DRM=m
-CONFIG_DRM_TDFX=m
-CONFIG_DRM_R128=m
-CONFIG_DRM_RADEON=m
-CONFIG_DRM_MGA=m
-CONFIG_DRM_SIS=m
-CONFIG_SOUND=m
-CONFIG_SND=m
-CONFIG_SND_SEQUENCER=m
-CONFIG_SND_SEQ_DUMMY=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_VERBOSE_PRINTK=y
-CONFIG_SND_DUMMY=m
-CONFIG_SND_VIRMIDI=m
-CONFIG_SND_MTPAV=m
-CONFIG_SND_SERIAL_U16550=m
-CONFIG_SND_MPU401=m
-CONFIG_SND_CS4281=m
-CONFIG_SND_CS46XX=m
-CONFIG_SND_EMU10K1=m
-CONFIG_SND_FM801=m
-CONFIG_USB=m
-CONFIG_USB_MON=m
-CONFIG_USB_EHCI_HCD=m
-CONFIG_USB_OHCI_HCD=m
-CONFIG_USB_UHCI_HCD=m
-CONFIG_USB_STORAGE=m
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_MTHCA=m
-CONFIG_INFINIBAND_IPOIB=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_XFS_FS=y
-CONFIG_AUTOFS_FS=y
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_NTFS_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=m
-CONFIG_NLS_CODEPAGE_775=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_CODEPAGE_852=m
-CONFIG_NLS_CODEPAGE_855=m
-CONFIG_NLS_CODEPAGE_857=m
-CONFIG_NLS_CODEPAGE_860=m
-CONFIG_NLS_CODEPAGE_861=m
-CONFIG_NLS_CODEPAGE_862=m
-CONFIG_NLS_CODEPAGE_863=m
-CONFIG_NLS_CODEPAGE_864=m
-CONFIG_NLS_CODEPAGE_865=m
-CONFIG_NLS_CODEPAGE_866=m
-CONFIG_NLS_CODEPAGE_869=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_CODEPAGE_950=m
-CONFIG_NLS_CODEPAGE_932=m
-CONFIG_NLS_CODEPAGE_949=m
-CONFIG_NLS_CODEPAGE_874=m
-CONFIG_NLS_ISO8859_8=m
-CONFIG_NLS_CODEPAGE_1250=m
-CONFIG_NLS_CODEPAGE_1251=m
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=m
-CONFIG_NLS_ISO8859_3=m
-CONFIG_NLS_ISO8859_4=m
-CONFIG_NLS_ISO8859_5=m
-CONFIG_NLS_ISO8859_6=m
-CONFIG_NLS_ISO8859_7=m
-CONFIG_NLS_ISO8859_9=m
-CONFIG_NLS_ISO8859_13=m
-CONFIG_NLS_ISO8859_14=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_KOI8_R=m
-CONFIG_NLS_KOI8_U=m
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_CRYPTO_MD5=y
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
deleted file mode 100644
index d7d8fb5c7b71..000000000000
--- a/arch/ia64/configs/tiger_defconfig
+++ /dev/null
@@ -1,169 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_LOG_BUF_SHIFT=20
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_MODULE_SRCVERSION_ALL=y
-# CONFIG_BLK_DEV_BSG is not set
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_SGI_PARTITION=y
-CONFIG_MCKINLEY=y
-CONFIG_IA64_PAGE_SIZE_64KB=y
-CONFIG_IA64_CYCLONE=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=16
-CONFIG_HOTPLUG_CPU=y
-CONFIG_PERMIT_BSP_REMOVE=y
-CONFIG_FORCE_CPEI_RETARGET=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_KEXEC=y
-CONFIG_BINFMT_MISC=m
-CONFIG_ACPI_BUTTON=m
-CONFIG_ACPI_FAN=m
-CONFIG_ACPI_PROCESSOR=m
-CONFIG_HOTPLUG_PCI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_SYN_COOKIES=y
-# CONFIG_IPV6 is not set
-CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_ATA_PIIX=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=m
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
-CONFIG_MD_RAID0=m
-CONFIG_MD_RAID1=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_BLK_DEV_DM=m
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=y
-CONFIG_FUSION_CTL=y
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=m
-CONFIG_NETCONSOLE=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=m
-CONFIG_E100=m
-CONFIG_E1000=y
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_GAMEPORT=m
-CONFIG_SERIAL_NONSTANDARD=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=6
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_HPET=y
-CONFIG_AGP=m
-CONFIG_AGP_I460=m
-CONFIG_DRM=m
-CONFIG_DRM_TDFX=m
-CONFIG_DRM_R128=m
-CONFIG_DRM_RADEON=m
-CONFIG_DRM_MGA=m
-CONFIG_DRM_SIS=m
-CONFIG_USB=y
-CONFIG_USB_EHCI_HCD=m
-CONFIG_USB_OHCI_HCD=m
-CONFIG_USB_UHCI_HCD=y
-CONFIG_USB_STORAGE=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_XFS_FS=y
-CONFIG_AUTOFS_FS=y
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-CONFIG_VFAT_FS=y
-CONFIG_NTFS_FS=m
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=m
-CONFIG_NFS_V4=m
-CONFIG_NFSD=m
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=m
-CONFIG_NLS_CODEPAGE_775=m
-CONFIG_NLS_CODEPAGE_850=m
-CONFIG_NLS_CODEPAGE_852=m
-CONFIG_NLS_CODEPAGE_855=m
-CONFIG_NLS_CODEPAGE_857=m
-CONFIG_NLS_CODEPAGE_860=m
-CONFIG_NLS_CODEPAGE_861=m
-CONFIG_NLS_CODEPAGE_862=m
-CONFIG_NLS_CODEPAGE_863=m
-CONFIG_NLS_CODEPAGE_864=m
-CONFIG_NLS_CODEPAGE_865=m
-CONFIG_NLS_CODEPAGE_866=m
-CONFIG_NLS_CODEPAGE_869=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_CODEPAGE_950=m
-CONFIG_NLS_CODEPAGE_932=m
-CONFIG_NLS_CODEPAGE_949=m
-CONFIG_NLS_CODEPAGE_874=m
-CONFIG_NLS_ISO8859_8=m
-CONFIG_NLS_CODEPAGE_1250=m
-CONFIG_NLS_CODEPAGE_1251=m
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=m
-CONFIG_NLS_ISO8859_3=m
-CONFIG_NLS_ISO8859_4=m
-CONFIG_NLS_ISO8859_5=m
-CONFIG_NLS_ISO8859_6=m
-CONFIG_NLS_ISO8859_7=m
-CONFIG_NLS_ISO8859_9=m
-CONFIG_NLS_ISO8859_13=m
-CONFIG_NLS_ISO8859_14=m
-CONFIG_NLS_ISO8859_15=m
-CONFIG_NLS_KOI8_R=m
-CONFIG_NLS_KOI8_U=m
-CONFIG_NLS_UTF8=m
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_IA64_GRANULE_16MB=y
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_MD5=y
diff --git a/arch/ia64/configs/zx1_defconfig b/arch/ia64/configs/zx1_defconfig
deleted file mode 100644
index ed104550d0d5..000000000000
--- a/arch/ia64/configs/zx1_defconfig
+++ /dev/null
@@ -1,148 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_BSD_PROCESS_ACCT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_KPROBES=y
-CONFIG_MODULES=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_MCKINLEY=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=16
-CONFIG_HOTPLUG_CPU=y
-CONFIG_FLATMEM_MANUAL=y
-CONFIG_IA64_MCA_RECOVERY=y
-CONFIG_IA64_PALINFO=y
-CONFIG_CRASH_DUMP=y
-CONFIG_BINFMT_MISC=y
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_ACPI=y
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-# CONFIG_IPV6 is not set
-CONFIG_NETFILTER=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_ATA=y
-CONFIG_ATA_GENERIC=y
-CONFIG_PATA_CMD64X=y
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
-CONFIG_BLK_DEV_SR=y
-CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_LOGGING=y
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_QLOGIC_1280=y
-CONFIG_FUSION=y
-CONFIG_FUSION_SPI=y
-CONFIG_FUSION_FC=y
-CONFIG_FUSION_CTL=m
-CONFIG_NETDEVICES=y
-CONFIG_DUMMY=y
-CONFIG_TIGON3=y
-CONFIG_NET_TULIP=y
-CONFIG_TULIP=y
-CONFIG_TULIP_MWI=y
-CONFIG_TULIP_MMIO=y
-CONFIG_TULIP_NAPI=y
-CONFIG_TULIP_NAPI_HW_MITIGATION=y
-CONFIG_E100=y
-CONFIG_E1000=y
-CONFIG_INPUT_JOYDEV=y
-CONFIG_INPUT_EVDEV=y
-# CONFIG_INPUT_KEYBOARD is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_SERIO_I8042 is not set
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_8250_NR_UARTS=8
-CONFIG_SERIAL_8250_EXTENDED=y
-CONFIG_SERIAL_8250_SHARE_IRQ=y
-# CONFIG_HW_RANDOM is not set
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_EFI=y
-CONFIG_I2C_CHARDEV=y
-CONFIG_AGP=y
-CONFIG_AGP_HP_ZX1=y
-CONFIG_DRM=y
-CONFIG_DRM_RADEON=y
-CONFIG_FB_RADEON=y
-CONFIG_FB_RADEON_DEBUG=y
-CONFIG_LOGO=y
-# CONFIG_LOGO_LINUX_MONO is not set
-# CONFIG_LOGO_LINUX_VGA16 is not set
-CONFIG_SOUND=y
-CONFIG_SND=y
-CONFIG_SND_SEQUENCER=y
-CONFIG_SND_MIXER_OSS=y
-CONFIG_SND_PCM_OSS=y
-CONFIG_SND_SEQUENCER_OSS=y
-CONFIG_SND_FM801=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB=y
-CONFIG_USB_MON=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_UHCI_HCD=y
-CONFIG_USB_STORAGE=y
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT3_FS=y
-CONFIG_ISO9660_FS=y
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=y
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_HUGETLBFS=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V4=y
-CONFIG_NFSD=y
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_CODEPAGE_737=y
-CONFIG_NLS_CODEPAGE_775=y
-CONFIG_NLS_CODEPAGE_850=y
-CONFIG_NLS_CODEPAGE_852=y
-CONFIG_NLS_CODEPAGE_855=y
-CONFIG_NLS_CODEPAGE_857=y
-CONFIG_NLS_CODEPAGE_860=y
-CONFIG_NLS_CODEPAGE_861=y
-CONFIG_NLS_CODEPAGE_862=y
-CONFIG_NLS_CODEPAGE_863=y
-CONFIG_NLS_CODEPAGE_864=y
-CONFIG_NLS_CODEPAGE_865=y
-CONFIG_NLS_CODEPAGE_866=y
-CONFIG_NLS_CODEPAGE_869=y
-CONFIG_NLS_CODEPAGE_936=y
-CONFIG_NLS_CODEPAGE_950=y
-CONFIG_NLS_CODEPAGE_932=y
-CONFIG_NLS_CODEPAGE_949=y
-CONFIG_NLS_CODEPAGE_874=y
-CONFIG_NLS_ISO8859_8=y
-CONFIG_NLS_CODEPAGE_1251=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_ISO8859_2=y
-CONFIG_NLS_ISO8859_3=y
-CONFIG_NLS_ISO8859_4=y
-CONFIG_NLS_ISO8859_5=y
-CONFIG_NLS_ISO8859_6=y
-CONFIG_NLS_ISO8859_7=y
-CONFIG_NLS_ISO8859_9=y
-CONFIG_NLS_ISO8859_13=y
-CONFIG_NLS_ISO8859_14=y
-CONFIG_NLS_ISO8859_15=y
-CONFIG_NLS_KOI8_R=y
-CONFIG_NLS_KOI8_U=y
-CONFIG_NLS_UTF8=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_IA64_PRINT_HAZARDS=y
-CONFIG_CRYPTO_ECB=m
-CONFIG_CRYPTO_PCBC=m
diff --git a/arch/ia64/hp/common/Makefile b/arch/ia64/hp/common/Makefile
deleted file mode 100644
index 11a56ed38229..000000000000
--- a/arch/ia64/hp/common/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# ia64/platform/hp/common/Makefile
-#
-# Copyright (C) 2002 Hewlett Packard
-# Copyright (C) Alex Williamson (alex_williamson@hp.com)
-#
-
-obj-$(CONFIG_IA64_HP_SBA_IOMMU) += sba_iommu.o
-obj-$(CONFIG_IA64_HP_AML_NFW) += aml_nfw.o
diff --git a/arch/ia64/hp/common/aml_nfw.c b/arch/ia64/hp/common/aml_nfw.c
deleted file mode 100644
index 901df49461a0..000000000000
--- a/arch/ia64/hp/common/aml_nfw.c
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * OpRegion handler to allow AML to call native firmware
- *
- * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * This driver implements HP Open Source Review Board proposal 1842,
- * which was approved on 9/20/2006.
- *
- * For technical documentation, see the HP SPPA Firmware EAS, Appendix F.
- *
- * ACPI does not define a mechanism for AML methods to call native firmware
- * interfaces such as PAL or SAL.  This OpRegion handler adds such a mechanism.
- * After the handler is installed, an AML method can call native firmware by
- * storing the arguments and firmware entry point to specific offsets in the
- * OpRegion.  When AML reads the "return value" offset from the OpRegion, this
- * handler loads up the arguments, makes the firmware call, and returns the
- * result.
- */
-
-#include <linux/module.h>
-#include <linux/acpi.h>
-#include <asm/sal.h>
-
-MODULE_AUTHOR("Bjorn Helgaas <bjorn.helgaas@hp.com>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("ACPI opregion handler for native firmware calls");
-
-static bool force_register;
-module_param_named(force, force_register, bool, 0);
-MODULE_PARM_DESC(force, "Install opregion handler even without HPQ5001 device");
-
-#define AML_NFW_SPACE		0xA1
-
-struct ia64_pdesc {
-	void *ip;
-	void *gp;
-};
-
-/*
- * N.B.  The layout of this structure is defined in the HP SPPA FW EAS, and
- *	 the member offsets are embedded in AML methods.
- */
-struct ia64_nfw_context {
-	u64 arg[8];
-	struct ia64_sal_retval ret;
-	u64 ip;
-	u64 gp;
-	u64 pad[2];
-};
-
-static void *virt_map(u64 address)
-{
-	if (address & (1UL << 63))
-		return (void *) (__IA64_UNCACHED_OFFSET | address);
-
-	return __va(address);
-}
-
-static void aml_nfw_execute(struct ia64_nfw_context *c)
-{
-	struct ia64_pdesc virt_entry;
-	ia64_sal_handler entry;
-
-	virt_entry.ip = virt_map(c->ip);
-	virt_entry.gp = virt_map(c->gp);
-
-	entry = (ia64_sal_handler) &virt_entry;
-
-	IA64_FW_CALL(entry, c->ret,
-		     c->arg[0], c->arg[1], c->arg[2], c->arg[3],
-		     c->arg[4], c->arg[5], c->arg[6], c->arg[7]);
-}
-
-static void aml_nfw_read_arg(u8 *offset, u32 bit_width, u64 *value)
-{
-	switch (bit_width) {
-	case 8:
-		*value = *(u8 *)offset;
-		break;
-	case 16:
-		*value = *(u16 *)offset;
-		break;
-	case 32:
-		*value = *(u32 *)offset;
-		break;
-	case 64:
-		*value = *(u64 *)offset;
-		break;
-	}
-}
-
-static void aml_nfw_write_arg(u8 *offset, u32 bit_width, u64 *value)
-{
-	switch (bit_width) {
-	case 8:
-		*(u8 *) offset = *value;
-		break;
-	case 16:
-		*(u16 *) offset = *value;
-		break;
-	case 32:
-		*(u32 *) offset = *value;
-		break;
-	case 64:
-		*(u64 *) offset = *value;
-		break;
-	}
-}
-
-static acpi_status aml_nfw_handler(u32 function, acpi_physical_address address,
-	u32 bit_width, u64 *value, void *handler_context,
-	void *region_context)
-{
-	struct ia64_nfw_context *context = handler_context;
-	u8 *offset = (u8 *) context + address;
-
-	if (bit_width !=  8 && bit_width != 16 &&
-	    bit_width != 32 && bit_width != 64)
-		return AE_BAD_PARAMETER;
-
-	if (address + (bit_width >> 3) > sizeof(struct ia64_nfw_context))
-		return AE_BAD_PARAMETER;
-
-	switch (function) {
-	case ACPI_READ:
-		if (address == offsetof(struct ia64_nfw_context, ret))
-			aml_nfw_execute(context);
-		aml_nfw_read_arg(offset, bit_width, value);
-		break;
-	case ACPI_WRITE:
-		aml_nfw_write_arg(offset, bit_width, value);
-		break;
-	}
-
-	return AE_OK;
-}
-
-static struct ia64_nfw_context global_context;
-static int global_handler_registered;
-
-static int aml_nfw_add_global_handler(void)
-{
-	acpi_status status;
-
-	if (global_handler_registered)
-		return 0;
-
-	status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT,
-		AML_NFW_SPACE, aml_nfw_handler, NULL, &global_context);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	global_handler_registered = 1;
-	printk(KERN_INFO "Global 0x%02X opregion handler registered\n",
-		AML_NFW_SPACE);
-	return 0;
-}
-
-static int aml_nfw_remove_global_handler(void)
-{
-	acpi_status status;
-
-	if (!global_handler_registered)
-		return 0;
-
-	status = acpi_remove_address_space_handler(ACPI_ROOT_OBJECT,
-		AML_NFW_SPACE, aml_nfw_handler);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	global_handler_registered = 0;
-	printk(KERN_INFO "Global 0x%02X opregion handler removed\n",
-		AML_NFW_SPACE);
-	return 0;
-}
-
-static int aml_nfw_add(struct acpi_device *device)
-{
-	/*
-	 * We would normally allocate a new context structure and install
-	 * the address space handler for the specific device we found.
-	 * But the HP-UX implementation shares a single global context
-	 * and always puts the handler at the root, so we'll do the same.
-	 */
-	return aml_nfw_add_global_handler();
-}
-
-static void aml_nfw_remove(struct acpi_device *device)
-{
-	aml_nfw_remove_global_handler();
-}
-
-static const struct acpi_device_id aml_nfw_ids[] = {
-	{"HPQ5001", 0},
-	{"", 0}
-};
-
-static struct acpi_driver acpi_aml_nfw_driver = {
-	.name = "native firmware",
-	.ids = aml_nfw_ids,
-	.ops = {
-		.add = aml_nfw_add,
-		.remove = aml_nfw_remove,
-		},
-};
-
-static int __init aml_nfw_init(void)
-{
-	int result;
-
-	if (force_register)
-		aml_nfw_add_global_handler();
-
-	result = acpi_bus_register_driver(&acpi_aml_nfw_driver);
-	if (result < 0) {
-		aml_nfw_remove_global_handler();
-		return result;
-	}
-
-	return 0;
-}
-
-static void __exit aml_nfw_exit(void)
-{
-	acpi_bus_unregister_driver(&acpi_aml_nfw_driver);
-	aml_nfw_remove_global_handler();
-}
-
-module_init(aml_nfw_init);
-module_exit(aml_nfw_exit);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
deleted file mode 100644
index c4d477e8bcd4..000000000000
--- a/arch/ia64/hp/common/sba_iommu.c
+++ /dev/null
@@ -1,2155 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
-**  IA64 System Bus Adapter (SBA) I/O MMU manager
-**
-**	(c) Copyright 2002-2005 Alex Williamson
-**	(c) Copyright 2002-2003 Grant Grundler
-**	(c) Copyright 2002-2005 Hewlett-Packard Company
-**
-**	Portions (c) 2000 Grant Grundler (from parisc I/O MMU code)
-**	Portions (c) 1999 Dave S. Miller (from sparc64 I/O MMU code)
-**
-**
-**
-** This module initializes the IOC (I/O Controller) found on HP
-** McKinley machines and their successors.
-**
-*/
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/nodemask.h>
-#include <linux/bitops.h>         /* hweight64() */
-#include <linux/crash_dump.h>
-#include <linux/iommu-helper.h>
-#include <linux/dma-map-ops.h>
-#include <linux/prefetch.h>
-#include <linux/swiotlb.h>
-
-#include <asm/delay.h>		/* ia64_get_itc() */
-#include <asm/io.h>
-#include <asm/page.h>		/* PAGE_OFFSET */
-#include <asm/dma.h>
-
-#include <asm/acpi-ext.h>
-
-#define PFX "IOC: "
-
-/*
-** Enabling timing search of the pdir resource map.  Output in /proc.
-** Disabled by default to optimize performance.
-*/
-#undef PDIR_SEARCH_TIMING
-
-/*
-** This option allows cards capable of 64bit DMA to bypass the IOMMU.  If
-** not defined, all DMA will be 32bit and go through the TLB.
-** There's potentially a conflict in the bio merge code with us
-** advertising an iommu, but then bypassing it.  Since I/O MMU bypassing
-** appears to give more performance than bio-level virtual merging, we'll
-** do the former for now.  NOTE: BYPASS_SG also needs to be undef'd to
-** completely restrict DMA to the IOMMU.
-*/
-#define ALLOW_IOV_BYPASS
-
-/*
-** This option specifically allows/disallows bypassing scatterlists with
-** multiple entries.  Coalescing these entries can allow better DMA streaming
-** and in some cases shows better performance than entirely bypassing the
-** IOMMU.  Performance increase on the order of 1-2% sequential output/input
-** using bonnie++ on a RAID0 MD device (sym2 & mpt).
-*/
-#undef ALLOW_IOV_BYPASS_SG
-
-/*
-** If a device prefetches beyond the end of a valid pdir entry, it will cause
-** a hard failure, ie. MCA.  Version 3.0 and later of the zx1 LBA should
-** disconnect on 4k boundaries and prevent such issues.  If the device is
-** particularly aggressive, this option will keep the entire pdir valid such
-** that prefetching will hit a valid address.  This could severely impact
-** error containment, and is therefore off by default.  The page that is
-** used for spill-over is poisoned, so that should help debugging somewhat.
-*/
-#undef FULL_VALID_PDIR
-
-#define ENABLE_MARK_CLEAN
-
-/*
-** The number of debug flags is a clue - this code is fragile.  NOTE: since
-** tightening the use of res_lock the resource bitmap and actual pdir are no
-** longer guaranteed to stay in sync.  The sanity checking code isn't going to
-** like that.
-*/
-#undef DEBUG_SBA_INIT
-#undef DEBUG_SBA_RUN
-#undef DEBUG_SBA_RUN_SG
-#undef DEBUG_SBA_RESOURCE
-#undef ASSERT_PDIR_SANITY
-#undef DEBUG_LARGE_SG_ENTRIES
-#undef DEBUG_BYPASS
-
-#if defined(FULL_VALID_PDIR) && defined(ASSERT_PDIR_SANITY)
-#error FULL_VALID_PDIR and ASSERT_PDIR_SANITY are mutually exclusive
-#endif
-
-#define SBA_INLINE	__inline__
-/* #define SBA_INLINE */
-
-#ifdef DEBUG_SBA_INIT
-#define DBG_INIT(x...)	printk(x)
-#else
-#define DBG_INIT(x...)
-#endif
-
-#ifdef DEBUG_SBA_RUN
-#define DBG_RUN(x...)	printk(x)
-#else
-#define DBG_RUN(x...)
-#endif
-
-#ifdef DEBUG_SBA_RUN_SG
-#define DBG_RUN_SG(x...)	printk(x)
-#else
-#define DBG_RUN_SG(x...)
-#endif
-
-
-#ifdef DEBUG_SBA_RESOURCE
-#define DBG_RES(x...)	printk(x)
-#else
-#define DBG_RES(x...)
-#endif
-
-#ifdef DEBUG_BYPASS
-#define DBG_BYPASS(x...)	printk(x)
-#else
-#define DBG_BYPASS(x...)
-#endif
-
-#ifdef ASSERT_PDIR_SANITY
-#define ASSERT(expr) \
-        if(!(expr)) { \
-                printk( "\n" __FILE__ ":%d: Assertion " #expr " failed!\n",__LINE__); \
-                panic(#expr); \
-        }
-#else
-#define ASSERT(expr)
-#endif
-
-/*
-** The number of pdir entries to "free" before issuing
-** a read to PCOM register to flush out PCOM writes.
-** Interacts with allocation granularity (ie 4 or 8 entries
-** allocated and free'd/purged at a time might make this
-** less interesting).
-*/
-#define DELAYED_RESOURCE_CNT	64
-
-#define PCI_DEVICE_ID_HP_SX2000_IOC	0x12ec
-
-#define ZX1_IOC_ID	((PCI_DEVICE_ID_HP_ZX1_IOC << 16) | PCI_VENDOR_ID_HP)
-#define ZX2_IOC_ID	((PCI_DEVICE_ID_HP_ZX2_IOC << 16) | PCI_VENDOR_ID_HP)
-#define REO_IOC_ID	((PCI_DEVICE_ID_HP_REO_IOC << 16) | PCI_VENDOR_ID_HP)
-#define SX1000_IOC_ID	((PCI_DEVICE_ID_HP_SX1000_IOC << 16) | PCI_VENDOR_ID_HP)
-#define SX2000_IOC_ID	((PCI_DEVICE_ID_HP_SX2000_IOC << 16) | PCI_VENDOR_ID_HP)
-
-#define ZX1_IOC_OFFSET	0x1000	/* ACPI reports SBA, we want IOC */
-
-#define IOC_FUNC_ID	0x000
-#define IOC_FCLASS	0x008	/* function class, bist, header, rev... */
-#define IOC_IBASE	0x300	/* IO TLB */
-#define IOC_IMASK	0x308
-#define IOC_PCOM	0x310
-#define IOC_TCNFG	0x318
-#define IOC_PDIR_BASE	0x320
-
-#define IOC_ROPE0_CFG	0x500
-#define   IOC_ROPE_AO	  0x10	/* Allow "Relaxed Ordering" */
-
-
-/* AGP GART driver looks for this */
-#define ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL
-
-/*
-** The zx1 IOC supports 4/8/16/64KB page sizes (see TCNFG register)
-**
-** Some IOCs (sx1000) can run at the above pages sizes, but are
-** really only supported using the IOC at a 4k page size.
-**
-** iovp_size could only be greater than PAGE_SIZE if we are
-** confident the drivers really only touch the next physical
-** page iff that driver instance owns it.
-*/
-static unsigned long iovp_size;
-static unsigned long iovp_shift;
-static unsigned long iovp_mask;
-
-struct ioc {
-	void __iomem	*ioc_hpa;	/* I/O MMU base address */
-	char		*res_map;	/* resource map, bit == pdir entry */
-	u64		*pdir_base;	/* physical base address */
-	unsigned long	ibase;		/* pdir IOV Space base */
-	unsigned long	imask;		/* pdir IOV Space mask */
-
-	unsigned long	*res_hint;	/* next avail IOVP - circular search */
-	unsigned long	dma_mask;
-	spinlock_t	res_lock;	/* protects the resource bitmap, but must be held when */
-					/* clearing pdir to prevent races with allocations. */
-	unsigned int	res_bitshift;	/* from the RIGHT! */
-	unsigned int	res_size;	/* size of resource map in bytes */
-#ifdef CONFIG_NUMA
-	unsigned int	node;		/* node where this IOC lives */
-#endif
-#if DELAYED_RESOURCE_CNT > 0
-	spinlock_t	saved_lock;	/* may want to try to get this on a separate cacheline */
-					/* than res_lock for bigger systems. */
-	int		saved_cnt;
-	struct sba_dma_pair {
-		dma_addr_t	iova;
-		size_t		size;
-	} saved[DELAYED_RESOURCE_CNT];
-#endif
-
-#ifdef PDIR_SEARCH_TIMING
-#define SBA_SEARCH_SAMPLE	0x100
-	unsigned long avg_search[SBA_SEARCH_SAMPLE];
-	unsigned long avg_idx;	/* current index into avg_search */
-#endif
-
-	/* Stuff we don't need in performance path */
-	struct ioc	*next;		/* list of IOC's in system */
-	acpi_handle	handle;		/* for multiple IOC's */
-	const char 	*name;
-	unsigned int	func_id;
-	unsigned int	rev;		/* HW revision of chip */
-	u32		iov_size;
-	unsigned int	pdir_size;	/* in bytes, determined by IOV Space size */
-	struct pci_dev	*sac_only_dev;
-};
-
-static struct ioc *ioc_list, *ioc_found;
-static int reserve_sba_gart = 1;
-
-static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
-static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);
-
-#define sba_sg_address(sg)	sg_virt((sg))
-
-#ifdef FULL_VALID_PDIR
-static u64 prefetch_spill_page;
-#endif
-
-#define GET_IOC(dev)	((dev_is_pci(dev))						\
-			 ? ((struct ioc *) PCI_CONTROLLER(to_pci_dev(dev))->iommu) : NULL)
-
-/*
-** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
-** (or rather not merge) DMAs into manageable chunks.
-** On parisc, this is more of the software/tuning constraint
-** rather than the HW. I/O MMU allocation algorithms can be
-** faster with smaller sizes (to some degree).
-*/
-#define DMA_CHUNK_SIZE  (BITS_PER_LONG*iovp_size)
-
-#define ROUNDUP(x,y) ((x + ((y)-1)) & ~((y)-1))
-
-/************************************
-** SBA register read and write support
-**
-** BE WARNED: register writes are posted.
-**  (ie follow writes which must reach HW with a read)
-**
-*/
-#define READ_REG(addr)       __raw_readq(addr)
-#define WRITE_REG(val, addr) __raw_writeq(val, addr)
-
-#ifdef DEBUG_SBA_INIT
-
-/**
- * sba_dump_tlb - debugging only - print IOMMU operating parameters
- * @hpa: base address of the IOMMU
- *
- * Print the size/location of the IO MMU PDIR.
- */
-static void
-sba_dump_tlb(char *hpa)
-{
-	DBG_INIT("IO TLB at 0x%p\n", (void *)hpa);
-	DBG_INIT("IOC_IBASE    : %016lx\n", READ_REG(hpa+IOC_IBASE));
-	DBG_INIT("IOC_IMASK    : %016lx\n", READ_REG(hpa+IOC_IMASK));
-	DBG_INIT("IOC_TCNFG    : %016lx\n", READ_REG(hpa+IOC_TCNFG));
-	DBG_INIT("IOC_PDIR_BASE: %016lx\n", READ_REG(hpa+IOC_PDIR_BASE));
-	DBG_INIT("\n");
-}
-#endif
-
-
-#ifdef ASSERT_PDIR_SANITY
-
-/**
- * sba_dump_pdir_entry - debugging only - print one IOMMU PDIR entry
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @msg: text to print ont the output line.
- * @pide: pdir index.
- *
- * Print one entry of the IO MMU PDIR in human readable form.
- */
-static void
-sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide)
-{
-	/* start printing from lowest pde in rval */
-	u64 *ptr = &ioc->pdir_base[pide  & ~(BITS_PER_LONG - 1)];
-	unsigned long *rptr = (unsigned long *) &ioc->res_map[(pide >>3) & -sizeof(unsigned long)];
-	uint rcnt;
-
-	printk(KERN_DEBUG "SBA: %s rp %p bit %d rval 0x%lx\n",
-		 msg, rptr, pide & (BITS_PER_LONG - 1), *rptr);
-
-	rcnt = 0;
-	while (rcnt < BITS_PER_LONG) {
-		printk(KERN_DEBUG "%s %2d %p %016Lx\n",
-		       (rcnt == (pide & (BITS_PER_LONG - 1)))
-		       ? "    -->" : "       ",
-		       rcnt, ptr, (unsigned long long) *ptr );
-		rcnt++;
-		ptr++;
-	}
-	printk(KERN_DEBUG "%s", msg);
-}
-
-
-/**
- * sba_check_pdir - debugging only - consistency checker
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @msg: text to print ont the output line.
- *
- * Verify the resource map and pdir state is consistent
- */
-static int
-sba_check_pdir(struct ioc *ioc, char *msg)
-{
-	u64 *rptr_end = (u64 *) &(ioc->res_map[ioc->res_size]);
-	u64 *rptr = (u64 *) ioc->res_map;	/* resource map ptr */
-	u64 *pptr = ioc->pdir_base;	/* pdir ptr */
-	uint pide = 0;
-
-	while (rptr < rptr_end) {
-		u64 rval;
-		int rcnt; /* number of bits we might check */
-
-		rval = *rptr;
-		rcnt = 64;
-
-		while (rcnt) {
-			/* Get last byte and highest bit from that */
-			u32 pde = ((u32)((*pptr >> (63)) & 0x1));
-			if ((rval & 0x1) ^ pde)
-			{
-				/*
-				** BUMMER!  -- res_map != pdir --
-				** Dump rval and matching pdir entries
-				*/
-				sba_dump_pdir_entry(ioc, msg, pide);
-				return(1);
-			}
-			rcnt--;
-			rval >>= 1;	/* try the next bit */
-			pptr++;
-			pide++;
-		}
-		rptr++;	/* look at next word of res_map */
-	}
-	/* It'd be nice if we always got here :^) */
-	return 0;
-}
-
-
-/**
- * sba_dump_sg - debugging only - print Scatter-Gather list
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @startsg: head of the SG list
- * @nents: number of entries in SG list
- *
- * print the SG list so we can verify it's correct by hand.
- */
-static void
-sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
-{
-	while (nents-- > 0) {
-		printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
-		       startsg->dma_address, startsg->dma_length,
-		       sba_sg_address(startsg));
-		startsg = sg_next(startsg);
-	}
-}
-
-static void
-sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
-{
-	struct scatterlist *the_sg = startsg;
-	int the_nents = nents;
-
-	while (the_nents-- > 0) {
-		if (sba_sg_address(the_sg) == 0x0UL)
-			sba_dump_sg(NULL, startsg, nents);
-		the_sg = sg_next(the_sg);
-	}
-}
-
-#endif /* ASSERT_PDIR_SANITY */
-
-
-
-
-/**************************************************************
-*
-*   I/O Pdir Resource Management
-*
-*   Bits set in the resource map are in use.
-*   Each bit can represent a number of pages.
-*   LSbs represent lower addresses (IOVA's).
-*
-***************************************************************/
-#define PAGES_PER_RANGE 1	/* could increase this to 4 or 8 if needed */
-
-/* Convert from IOVP to IOVA and vice versa. */
-#define SBA_IOVA(ioc,iovp,offset) ((ioc->ibase) | (iovp) | (offset))
-#define SBA_IOVP(ioc,iova) ((iova) & ~(ioc->ibase))
-
-#define PDIR_ENTRY_SIZE	sizeof(u64)
-
-#define PDIR_INDEX(iovp)   ((iovp)>>iovp_shift)
-
-#define RESMAP_MASK(n)    ~(~0UL << (n))
-#define RESMAP_IDX_MASK   (sizeof(unsigned long) - 1)
-
-
-/**
- * For most cases the normal get_order is sufficient, however it limits us
- * to PAGE_SIZE being the minimum mapping alignment and TC flush granularity.
- * It only incurs about 1 clock cycle to use this one with the static variable
- * and makes the code more intuitive.
- */
-static SBA_INLINE int
-get_iovp_order (unsigned long size)
-{
-	long double d = size - 1;
-	long order;
-
-	order = ia64_getf_exp(d);
-	order = order - iovp_shift - 0xffff + 1;
-	if (order < 0)
-		order = 0;
-	return order;
-}
-
-static unsigned long ptr_to_pide(struct ioc *ioc, unsigned long *res_ptr,
-				 unsigned int bitshiftcnt)
-{
-	return (((unsigned long)res_ptr - (unsigned long)ioc->res_map) << 3)
-		+ bitshiftcnt;
-}
-
-/**
- * sba_search_bitmap - find free space in IO PDIR resource bitmap
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @bits_wanted: number of entries we need.
- * @use_hint: use res_hint to indicate where to start looking
- *
- * Find consecutive free bits in resource bitmap.
- * Each bit represents one entry in the IO Pdir.
- * Cool perf optimization: search for log2(size) bits at a time.
- */
-static SBA_INLINE unsigned long
-sba_search_bitmap(struct ioc *ioc, struct device *dev,
-		  unsigned long bits_wanted, int use_hint)
-{
-	unsigned long *res_ptr;
-	unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]);
-	unsigned long flags, pide = ~0UL, tpide;
-	unsigned long boundary_size;
-	unsigned long shift;
-	int ret;
-
-	ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0);
-	ASSERT(res_ptr < res_end);
-
-	boundary_size = dma_get_seg_boundary_nr_pages(dev, iovp_shift);
-
-	BUG_ON(ioc->ibase & ~iovp_mask);
-	shift = ioc->ibase >> iovp_shift;
-
-	spin_lock_irqsave(&ioc->res_lock, flags);
-
-	/* Allow caller to force a search through the entire resource space */
-	if (likely(use_hint)) {
-		res_ptr = ioc->res_hint;
-	} else {
-		res_ptr = (ulong *)ioc->res_map;
-		ioc->res_bitshift = 0;
-	}
-
-	/*
-	 * N.B.  REO/Grande defect AR2305 can cause TLB fetch timeouts
-	 * if a TLB entry is purged while in use.  sba_mark_invalid()
-	 * purges IOTLB entries in power-of-two sizes, so we also
-	 * allocate IOVA space in power-of-two sizes.
-	 */
-	bits_wanted = 1UL << get_iovp_order(bits_wanted << iovp_shift);
-
-	if (likely(bits_wanted == 1)) {
-		unsigned int bitshiftcnt;
-		for(; res_ptr < res_end ; res_ptr++) {
-			if (likely(*res_ptr != ~0UL)) {
-				bitshiftcnt = ffz(*res_ptr);
-				*res_ptr |= (1UL << bitshiftcnt);
-				pide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
-				ioc->res_bitshift = bitshiftcnt + bits_wanted;
-				goto found_it;
-			}
-		}
-		goto not_found;
-
-	}
-	
-	if (likely(bits_wanted <= BITS_PER_LONG/2)) {
-		/*
-		** Search the resource bit map on well-aligned values.
-		** "o" is the alignment.
-		** We need the alignment to invalidate I/O TLB using
-		** SBA HW features in the unmap path.
-		*/
-		unsigned long o = 1 << get_iovp_order(bits_wanted << iovp_shift);
-		uint bitshiftcnt = ROUNDUP(ioc->res_bitshift, o);
-		unsigned long mask, base_mask;
-
-		base_mask = RESMAP_MASK(bits_wanted);
-		mask = base_mask << bitshiftcnt;
-
-		DBG_RES("%s() o %ld %p", __func__, o, res_ptr);
-		for(; res_ptr < res_end ; res_ptr++)
-		{ 
-			DBG_RES("    %p %lx %lx\n", res_ptr, mask, *res_ptr);
-			ASSERT(0 != mask);
-			for (; mask ; mask <<= o, bitshiftcnt += o) {
-				tpide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
-				ret = iommu_is_span_boundary(tpide, bits_wanted,
-							     shift,
-							     boundary_size);
-				if ((0 == ((*res_ptr) & mask)) && !ret) {
-					*res_ptr |= mask;     /* mark resources busy! */
-					pide = tpide;
-					ioc->res_bitshift = bitshiftcnt + bits_wanted;
-					goto found_it;
-				}
-			}
-
-			bitshiftcnt = 0;
-			mask = base_mask;
-
-		}
-
-	} else {
-		int qwords, bits, i;
-		unsigned long *end;
-
-		qwords = bits_wanted >> 6; /* /64 */
-		bits = bits_wanted - (qwords * BITS_PER_LONG);
-
-		end = res_end - qwords;
-
-		for (; res_ptr < end; res_ptr++) {
-			tpide = ptr_to_pide(ioc, res_ptr, 0);
-			ret = iommu_is_span_boundary(tpide, bits_wanted,
-						     shift, boundary_size);
-			if (ret)
-				goto next_ptr;
-			for (i = 0 ; i < qwords ; i++) {
-				if (res_ptr[i] != 0)
-					goto next_ptr;
-			}
-			if (bits && res_ptr[i] && (__ffs(res_ptr[i]) < bits))
-				continue;
-
-			/* Found it, mark it */
-			for (i = 0 ; i < qwords ; i++)
-				res_ptr[i] = ~0UL;
-			res_ptr[i] |= RESMAP_MASK(bits);
-
-			pide = tpide;
-			res_ptr += qwords;
-			ioc->res_bitshift = bits;
-			goto found_it;
-next_ptr:
-			;
-		}
-	}
-
-not_found:
-	prefetch(ioc->res_map);
-	ioc->res_hint = (unsigned long *) ioc->res_map;
-	ioc->res_bitshift = 0;
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-	return (pide);
-
-found_it:
-	ioc->res_hint = res_ptr;
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-	return (pide);
-}
-
-
-/**
- * sba_alloc_range - find free bits and mark them in IO PDIR resource bitmap
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @size: number of bytes to create a mapping for
- *
- * Given a size, find consecutive unmarked and then mark those bits in the
- * resource bit map.
- */
-static int
-sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size)
-{
-	unsigned int pages_needed = size >> iovp_shift;
-#ifdef PDIR_SEARCH_TIMING
-	unsigned long itc_start;
-#endif
-	unsigned long pide;
-
-	ASSERT(pages_needed);
-	ASSERT(0 == (size & ~iovp_mask));
-
-#ifdef PDIR_SEARCH_TIMING
-	itc_start = ia64_get_itc();
-#endif
-	/*
-	** "seek and ye shall find"...praying never hurts either...
-	*/
-	pide = sba_search_bitmap(ioc, dev, pages_needed, 1);
-	if (unlikely(pide >= (ioc->res_size << 3))) {
-		pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
-		if (unlikely(pide >= (ioc->res_size << 3))) {
-#if DELAYED_RESOURCE_CNT > 0
-			unsigned long flags;
-
-			/*
-			** With delayed resource freeing, we can give this one more shot.  We're
-			** getting close to being in trouble here, so do what we can to make this
-			** one count.
-			*/
-			spin_lock_irqsave(&ioc->saved_lock, flags);
-			if (ioc->saved_cnt > 0) {
-				struct sba_dma_pair *d;
-				int cnt = ioc->saved_cnt;
-
-				d = &(ioc->saved[ioc->saved_cnt - 1]);
-
-				spin_lock(&ioc->res_lock);
-				while (cnt--) {
-					sba_mark_invalid(ioc, d->iova, d->size);
-					sba_free_range(ioc, d->iova, d->size);
-					d--;
-				}
-				ioc->saved_cnt = 0;
-				READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
-				spin_unlock(&ioc->res_lock);
-			}
-			spin_unlock_irqrestore(&ioc->saved_lock, flags);
-
-			pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
-			if (unlikely(pide >= (ioc->res_size << 3))) {
-				printk(KERN_WARNING "%s: I/O MMU @ %p is"
-				       "out of mapping resources, %u %u %lx\n",
-				       __func__, ioc->ioc_hpa, ioc->res_size,
-				       pages_needed, dma_get_seg_boundary(dev));
-				return -1;
-			}
-#else
-			printk(KERN_WARNING "%s: I/O MMU @ %p is"
-			       "out of mapping resources, %u %u %lx\n",
-			       __func__, ioc->ioc_hpa, ioc->res_size,
-			       pages_needed, dma_get_seg_boundary(dev));
-			return -1;
-#endif
-		}
-	}
-
-#ifdef PDIR_SEARCH_TIMING
-	ioc->avg_search[ioc->avg_idx++] = (ia64_get_itc() - itc_start) / pages_needed;
-	ioc->avg_idx &= SBA_SEARCH_SAMPLE - 1;
-#endif
-
-	prefetchw(&(ioc->pdir_base[pide]));
-
-#ifdef ASSERT_PDIR_SANITY
-	/* verify the first enable bit is clear */
-	if(0x00 != ((u8 *) ioc->pdir_base)[pide*PDIR_ENTRY_SIZE + 7]) {
-		sba_dump_pdir_entry(ioc, "sba_search_bitmap() botched it?", pide);
-	}
-#endif
-
-	DBG_RES("%s(%x) %d -> %lx hint %x/%x\n",
-		__func__, size, pages_needed, pide,
-		(uint) ((unsigned long) ioc->res_hint - (unsigned long) ioc->res_map),
-		ioc->res_bitshift );
-
-	return (pide);
-}
-
-
-/**
- * sba_free_range - unmark bits in IO PDIR resource bitmap
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @iova: IO virtual address which was previously allocated.
- * @size: number of bytes to create a mapping for
- *
- * clear bits in the ioc's resource map
- */
-static SBA_INLINE void
-sba_free_range(struct ioc *ioc, dma_addr_t iova, size_t size)
-{
-	unsigned long iovp = SBA_IOVP(ioc, iova);
-	unsigned int pide = PDIR_INDEX(iovp);
-	unsigned int ridx = pide >> 3;	/* convert bit to byte address */
-	unsigned long *res_ptr = (unsigned long *) &((ioc)->res_map[ridx & ~RESMAP_IDX_MASK]);
-	int bits_not_wanted = size >> iovp_shift;
-	unsigned long m;
-
-	/* Round up to power-of-two size: see AR2305 note above */
-	bits_not_wanted = 1UL << get_iovp_order(bits_not_wanted << iovp_shift);
-	for (; bits_not_wanted > 0 ; res_ptr++) {
-		
-		if (unlikely(bits_not_wanted > BITS_PER_LONG)) {
-
-			/* these mappings start 64bit aligned */
-			*res_ptr = 0UL;
-			bits_not_wanted -= BITS_PER_LONG;
-			pide += BITS_PER_LONG;
-
-		} else {
-
-			/* 3-bits "bit" address plus 2 (or 3) bits for "byte" == bit in word */
-			m = RESMAP_MASK(bits_not_wanted) << (pide & (BITS_PER_LONG - 1));
-			bits_not_wanted = 0;
-
-			DBG_RES("%s( ,%x,%x) %x/%lx %x %p %lx\n", __func__, (uint) iova, size,
-			        bits_not_wanted, m, pide, res_ptr, *res_ptr);
-
-			ASSERT(m != 0);
-			ASSERT(bits_not_wanted);
-			ASSERT((*res_ptr & m) == m); /* verify same bits are set */
-			*res_ptr &= ~m;
-		}
-	}
-}
-
-
-/**************************************************************
-*
-*   "Dynamic DMA Mapping" support (aka "Coherent I/O")
-*
-***************************************************************/
-
-/**
- * sba_io_pdir_entry - fill in one IO PDIR entry
- * @pdir_ptr:  pointer to IO PDIR entry
- * @vba: Virtual CPU address of buffer to map
- *
- * SBA Mapping Routine
- *
- * Given a virtual address (vba, arg1) sba_io_pdir_entry()
- * loads the I/O PDIR entry pointed to by pdir_ptr (arg0).
- * Each IO Pdir entry consists of 8 bytes as shown below
- * (LSB == bit 0):
- *
- *  63                    40                                 11    7        0
- * +-+---------------------+----------------------------------+----+--------+
- * |V|        U            |            PPN[39:12]            | U  |   FF   |
- * +-+---------------------+----------------------------------+----+--------+
- *
- *  V  == Valid Bit
- *  U  == Unused
- * PPN == Physical Page Number
- *
- * The physical address fields are filled with the results of virt_to_phys()
- * on the vba.
- */
-
-#if 1
-#define sba_io_pdir_entry(pdir_ptr, vba) *pdir_ptr = ((vba & ~0xE000000000000FFFULL)	\
-						      | 0x8000000000000000ULL)
-#else
-void SBA_INLINE
-sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba)
-{
-	*pdir_ptr = ((vba & ~0xE000000000000FFFULL) | 0x80000000000000FFULL);
-}
-#endif
-
-#ifdef ENABLE_MARK_CLEAN
-/*
- * Since DMA is i-cache coherent, any (complete) pages that were written via
- * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
- * flush them when they get mapped into an executable vm-area.
- */
-static void mark_clean(void *addr, size_t size)
-{
-	struct folio *folio = virt_to_folio(addr);
-	ssize_t left = size;
-	size_t offset = offset_in_folio(folio, addr);
-
-	if (offset) {
-		left -= folio_size(folio) - offset;
-		if (left <= 0)
-			return;
-		folio = folio_next(folio);
-	}
-
-	while (left >= folio_size(folio)) {
-		left -= folio_size(folio);
-		set_bit(PG_arch_1, &folio->flags);
-		if (!left)
-			break;
-		folio = folio_next(folio);
-	}
-}
-#endif
-
-/**
- * sba_mark_invalid - invalidate one or more IO PDIR entries
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @iova:  IO Virtual Address mapped earlier
- * @byte_cnt:  number of bytes this mapping covers.
- *
- * Marking the IO PDIR entry(ies) as Invalid and invalidate
- * corresponding IO TLB entry. The PCOM (Purge Command Register)
- * is to purge stale entries in the IO TLB when unmapping entries.
- *
- * The PCOM register supports purging of multiple pages, with a minium
- * of 1 page and a maximum of 2GB. Hardware requires the address be
- * aligned to the size of the range being purged. The size of the range
- * must be a power of 2. The "Cool perf optimization" in the
- * allocation routine helps keep that true.
- */
-static SBA_INLINE void
-sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
-{
-	u32 iovp = (u32) SBA_IOVP(ioc,iova);
-
-	int off = PDIR_INDEX(iovp);
-
-	/* Must be non-zero and rounded up */
-	ASSERT(byte_cnt > 0);
-	ASSERT(0 == (byte_cnt & ~iovp_mask));
-
-#ifdef ASSERT_PDIR_SANITY
-	/* Assert first pdir entry is set */
-	if (!(ioc->pdir_base[off] >> 60)) {
-		sba_dump_pdir_entry(ioc,"sba_mark_invalid()", PDIR_INDEX(iovp));
-	}
-#endif
-
-	if (byte_cnt <= iovp_size)
-	{
-		ASSERT(off < ioc->pdir_size);
-
-		iovp |= iovp_shift;     /* set "size" field for PCOM */
-
-#ifndef FULL_VALID_PDIR
-		/*
-		** clear I/O PDIR entry "valid" bit
-		** Do NOT clear the rest - save it for debugging.
-		** We should only clear bits that have previously
-		** been enabled.
-		*/
-		ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
-#else
-		/*
-  		** If we want to maintain the PDIR as valid, put in
-		** the spill page so devices prefetching won't
-		** cause a hard fail.
-		*/
-		ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
-#endif
-	} else {
-		u32 t = get_iovp_order(byte_cnt) + iovp_shift;
-
-		iovp |= t;
-		ASSERT(t <= 31);   /* 2GB! Max value of "size" field */
-
-		do {
-			/* verify this pdir entry is enabled */
-			ASSERT(ioc->pdir_base[off]  >> 63);
-#ifndef FULL_VALID_PDIR
-			/* clear I/O Pdir entry "valid" bit first */
-			ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
-#else
-			ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
-#endif
-			off++;
-			byte_cnt -= iovp_size;
-		} while (byte_cnt > 0);
-	}
-
-	WRITE_REG(iovp | ioc->ibase, ioc->ioc_hpa+IOC_PCOM);
-}
-
-/**
- * sba_map_page - map one buffer and return IOVA for DMA
- * @dev: instance of PCI owned by the driver that's asking.
- * @page: page to map
- * @poff: offset into page
- * @size: number of bytes to map
- * @dir: dma direction
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static dma_addr_t sba_map_page(struct device *dev, struct page *page,
-			       unsigned long poff, size_t size,
-			       enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-	struct ioc *ioc;
-	void *addr = page_address(page) + poff;
-	dma_addr_t iovp;
-	dma_addr_t offset;
-	u64 *pdir_start;
-	int pide;
-#ifdef ASSERT_PDIR_SANITY
-	unsigned long flags;
-#endif
-#ifdef ALLOW_IOV_BYPASS
-	unsigned long pci_addr = virt_to_phys(addr);
-#endif
-
-#ifdef ALLOW_IOV_BYPASS
-	ASSERT(to_pci_dev(dev)->dma_mask);
-	/*
- 	** Check if the PCI device can DMA to ptr... if so, just return ptr
- 	*/
-	if (likely((pci_addr & ~to_pci_dev(dev)->dma_mask) == 0)) {
-		/*
- 		** Device is bit capable of DMA'ing to the buffer...
-		** just return the PCI address of ptr
- 		*/
-		DBG_BYPASS("sba_map_page() bypass mask/addr: "
-			   "0x%lx/0x%lx\n",
-		           to_pci_dev(dev)->dma_mask, pci_addr);
-		return pci_addr;
-	}
-#endif
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-	prefetch(ioc->res_hint);
-
-	ASSERT(size > 0);
-	ASSERT(size <= DMA_CHUNK_SIZE);
-
-	/* save offset bits */
-	offset = ((dma_addr_t) (long) addr) & ~iovp_mask;
-
-	/* round up to nearest iovp_size */
-	size = (size + offset + ~iovp_mask) & iovp_mask;
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	if (sba_check_pdir(ioc,"Check before sba_map_page()"))
-		panic("Sanity check failed");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	pide = sba_alloc_range(ioc, dev, size);
-	if (pide < 0)
-		return DMA_MAPPING_ERROR;
-
-	iovp = (dma_addr_t) pide << iovp_shift;
-
-	DBG_RUN("%s() 0x%p -> 0x%lx\n", __func__, addr, (long) iovp | offset);
-
-	pdir_start = &(ioc->pdir_base[pide]);
-
-	while (size > 0) {
-		ASSERT(((u8 *)pdir_start)[7] == 0); /* verify availability */
-		sba_io_pdir_entry(pdir_start, (unsigned long) addr);
-
-		DBG_RUN("     pdir 0x%p %lx\n", pdir_start, *pdir_start);
-
-		addr += iovp_size;
-		size -= iovp_size;
-		pdir_start++;
-	}
-	/* force pdir update */
-	wmb();
-
-	/* form complete address */
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_check_pdir(ioc,"Check after sba_map_page()");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-	return SBA_IOVA(ioc, iovp, offset);
-}
-
-#ifdef ENABLE_MARK_CLEAN
-static SBA_INLINE void
-sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size)
-{
-	u32	iovp = (u32) SBA_IOVP(ioc,iova);
-	int	off = PDIR_INDEX(iovp);
-	void	*addr;
-
-	if (size <= iovp_size) {
-		addr = phys_to_virt(ioc->pdir_base[off] &
-		                    ~0xE000000000000FFFULL);
-		mark_clean(addr, size);
-	} else {
-		do {
-			addr = phys_to_virt(ioc->pdir_base[off] &
-			                    ~0xE000000000000FFFULL);
-			mark_clean(addr, min(size, iovp_size));
-			off++;
-			size -= iovp_size;
-		} while (size > 0);
-	}
-}
-#endif
-
-/**
- * sba_unmap_page - unmap one IOVA and free resources
- * @dev: instance of PCI owned by the driver that's asking.
- * @iova:  IOVA of driver buffer previously mapped.
- * @size:  number of bytes mapped in driver buffer.
- * @dir:  R/W or both.
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
-			   enum dma_data_direction dir, unsigned long attrs)
-{
-	struct ioc *ioc;
-#if DELAYED_RESOURCE_CNT > 0
-	struct sba_dma_pair *d;
-#endif
-	unsigned long flags;
-	dma_addr_t offset;
-
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-#ifdef ALLOW_IOV_BYPASS
-	if (likely((iova & ioc->imask) != ioc->ibase)) {
-		/*
-		** Address does not fall w/in IOVA, must be bypassing
-		*/
-		DBG_BYPASS("sba_unmap_page() bypass addr: 0x%lx\n",
-			   iova);
-
-#ifdef ENABLE_MARK_CLEAN
-		if (dir == DMA_FROM_DEVICE) {
-			mark_clean(phys_to_virt(iova), size);
-		}
-#endif
-		return;
-	}
-#endif
-	offset = iova & ~iovp_mask;
-
-	DBG_RUN("%s() iovp 0x%lx/%x\n", __func__, (long) iova, size);
-
-	iova ^= offset;        /* clear offset bits */
-	size += offset;
-	size = ROUNDUP(size, iovp_size);
-
-#ifdef ENABLE_MARK_CLEAN
-	if (dir == DMA_FROM_DEVICE)
-		sba_mark_clean(ioc, iova, size);
-#endif
-
-#if DELAYED_RESOURCE_CNT > 0
-	spin_lock_irqsave(&ioc->saved_lock, flags);
-	d = &(ioc->saved[ioc->saved_cnt]);
-	d->iova = iova;
-	d->size = size;
-	if (unlikely(++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT)) {
-		int cnt = ioc->saved_cnt;
-		spin_lock(&ioc->res_lock);
-		while (cnt--) {
-			sba_mark_invalid(ioc, d->iova, d->size);
-			sba_free_range(ioc, d->iova, d->size);
-			d--;
-		}
-		ioc->saved_cnt = 0;
-		READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
-		spin_unlock(&ioc->res_lock);
-	}
-	spin_unlock_irqrestore(&ioc->saved_lock, flags);
-#else /* DELAYED_RESOURCE_CNT == 0 */
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_mark_invalid(ioc, iova, size);
-	sba_free_range(ioc, iova, size);
-	READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif /* DELAYED_RESOURCE_CNT == 0 */
-}
-
-/**
- * sba_alloc_coherent - allocate/map shared mem for DMA
- * @dev: instance of PCI owned by the driver that's asking.
- * @size:  number of bytes mapped in driver buffer.
- * @dma_handle:  IOVA of new buffer.
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void *
-sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   gfp_t flags, unsigned long attrs)
-{
-	struct page *page;
-	struct ioc *ioc;
-	int node = -1;
-	void *addr;
-
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-#ifdef CONFIG_NUMA
-	node = ioc->node;
-#endif
-
-	page = alloc_pages_node(node, flags, get_order(size));
-	if (unlikely(!page))
-		return NULL;
-
-	addr = page_address(page);
-	memset(addr, 0, size);
-	*dma_handle = page_to_phys(page);
-
-#ifdef ALLOW_IOV_BYPASS
-	ASSERT(dev->coherent_dma_mask);
-	/*
- 	** Check if the PCI device can DMA to ptr... if so, just return ptr
- 	*/
-	if (likely((*dma_handle & ~dev->coherent_dma_mask) == 0)) {
-		DBG_BYPASS("sba_alloc_coherent() bypass mask/addr: 0x%lx/0x%lx\n",
-		           dev->coherent_dma_mask, *dma_handle);
-
-		return addr;
-	}
-#endif
-
-	/*
-	 * If device can't bypass or bypass is disabled, pass the 32bit fake
-	 * device to map single to get an iova mapping.
-	 */
-	*dma_handle = sba_map_page(&ioc->sac_only_dev->dev, page, 0, size,
-			DMA_BIDIRECTIONAL, 0);
-	if (dma_mapping_error(dev, *dma_handle))
-		return NULL;
-	return addr;
-}
-
-
-/**
- * sba_free_coherent - free/unmap shared mem for DMA
- * @dev: instance of PCI owned by the driver that's asking.
- * @size:  number of bytes mapped in driver buffer.
- * @vaddr:  virtual address IOVA of "consistent" buffer.
- * @dma_handler:  IO virtual address of "consistent" buffer.
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
-			      dma_addr_t dma_handle, unsigned long attrs)
-{
-	sba_unmap_page(dev, dma_handle, size, 0, 0);
-	free_pages((unsigned long) vaddr, get_order(size));
-}
-
-
-/*
-** Since 0 is a valid pdir_base index value, can't use that
-** to determine if a value is valid or not. Use a flag to indicate
-** the SG list entry contains a valid pdir index.
-*/
-#define PIDE_FLAG 0x1UL
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-int dump_run_sg = 0;
-#endif
-
-
-/**
- * sba_fill_pdir - write allocated SG entries into IO PDIR
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @startsg:  list of IOVA/size pairs
- * @nents: number of entries in startsg list
- *
- * Take preprocessed SG list and write corresponding entries
- * in the IO PDIR.
- */
-
-static SBA_INLINE int
-sba_fill_pdir(
-	struct ioc *ioc,
-	struct scatterlist *startsg,
-	int nents)
-{
-	struct scatterlist *dma_sg = startsg;	/* pointer to current DMA */
-	int n_mappings = 0;
-	u64 *pdirp = NULL;
-	unsigned long dma_offset = 0;
-
-	while (nents-- > 0) {
-		int     cnt = startsg->dma_length;
-		startsg->dma_length = 0;
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-		if (dump_run_sg)
-			printk(" %2d : %08lx/%05x %p\n",
-				nents, startsg->dma_address, cnt,
-				sba_sg_address(startsg));
-#else
-		DBG_RUN_SG(" %d : %08lx/%05x %p\n",
-				nents, startsg->dma_address, cnt,
-				sba_sg_address(startsg));
-#endif
-		/*
-		** Look for the start of a new DMA stream
-		*/
-		if (startsg->dma_address & PIDE_FLAG) {
-			u32 pide = startsg->dma_address & ~PIDE_FLAG;
-			dma_offset = (unsigned long) pide & ~iovp_mask;
-			startsg->dma_address = 0;
-			if (n_mappings)
-				dma_sg = sg_next(dma_sg);
-			dma_sg->dma_address = pide | ioc->ibase;
-			pdirp = &(ioc->pdir_base[pide >> iovp_shift]);
-			n_mappings++;
-		}
-
-		/*
-		** Look for a VCONTIG chunk
-		*/
-		if (cnt) {
-			unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
-			ASSERT(pdirp);
-
-			/* Since multiple Vcontig blocks could make up
-			** one DMA stream, *add* cnt to dma_len.
-			*/
-			dma_sg->dma_length += cnt;
-			cnt += dma_offset;
-			dma_offset=0;	/* only want offset on first chunk */
-			cnt = ROUNDUP(cnt, iovp_size);
-			do {
-				sba_io_pdir_entry(pdirp, vaddr);
-				vaddr += iovp_size;
-				cnt -= iovp_size;
-				pdirp++;
-			} while (cnt > 0);
-		}
-		startsg = sg_next(startsg);
-	}
-	/* force pdir update */
-	wmb();
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-	dump_run_sg = 0;
-#endif
-	return(n_mappings);
-}
-
-
-/*
-** Two address ranges are DMA contiguous *iff* "end of prev" and
-** "start of next" are both on an IOV page boundary.
-**
-** (shift left is a quick trick to mask off upper bits)
-*/
-#define DMA_CONTIG(__X, __Y) \
-	(((((unsigned long) __X) | ((unsigned long) __Y)) << (BITS_PER_LONG - iovp_shift)) == 0UL)
-
-
-/**
- * sba_coalesce_chunks - preprocess the SG list
- * @ioc: IO MMU structure which owns the pdir we are interested in.
- * @startsg:  list of IOVA/size pairs
- * @nents: number of entries in startsg list
- *
- * First pass is to walk the SG list and determine where the breaks are
- * in the DMA stream. Allocates PDIR entries but does not fill them.
- * Returns the number of DMA chunks.
- *
- * Doing the fill separate from the coalescing/allocation keeps the
- * code simpler. Future enhancement could make one pass through
- * the sglist do both.
- */
-static SBA_INLINE int
-sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
-	struct scatterlist *startsg,
-	int nents)
-{
-	struct scatterlist *vcontig_sg;    /* VCONTIG chunk head */
-	unsigned long vcontig_len;         /* len of VCONTIG chunk */
-	unsigned long vcontig_end;
-	struct scatterlist *dma_sg;        /* next DMA stream head */
-	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
-	int n_mappings = 0;
-	unsigned int max_seg_size = dma_get_max_seg_size(dev);
-	int idx;
-
-	while (nents > 0) {
-		unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
-
-		/*
-		** Prepare for first/next DMA stream
-		*/
-		dma_sg = vcontig_sg = startsg;
-		dma_len = vcontig_len = vcontig_end = startsg->length;
-		vcontig_end +=  vaddr;
-		dma_offset = vaddr & ~iovp_mask;
-
-		/* PARANOID: clear entries */
-		startsg->dma_address = startsg->dma_length = 0;
-
-		/*
-		** This loop terminates one iteration "early" since
-		** it's always looking one "ahead".
-		*/
-		while (--nents > 0) {
-			unsigned long vaddr;	/* tmp */
-
-			startsg = sg_next(startsg);
-
-			/* PARANOID */
-			startsg->dma_address = startsg->dma_length = 0;
-
-			/* catch brokenness in SCSI layer */
-			ASSERT(startsg->length <= DMA_CHUNK_SIZE);
-
-			/*
-			** First make sure current dma stream won't
-			** exceed DMA_CHUNK_SIZE if we coalesce the
-			** next entry.
-			*/
-			if (((dma_len + dma_offset + startsg->length + ~iovp_mask) & iovp_mask)
-			    > DMA_CHUNK_SIZE)
-				break;
-
-			if (dma_len + startsg->length > max_seg_size)
-				break;
-
-			/*
-			** Then look for virtually contiguous blocks.
-			**
-			** append the next transaction?
-			*/
-			vaddr = (unsigned long) sba_sg_address(startsg);
-			if  (vcontig_end == vaddr)
-			{
-				vcontig_len += startsg->length;
-				vcontig_end += startsg->length;
-				dma_len     += startsg->length;
-				continue;
-			}
-
-#ifdef DEBUG_LARGE_SG_ENTRIES
-			dump_run_sg = (vcontig_len > iovp_size);
-#endif
-
-			/*
-			** Not virtually contiguous.
-			** Terminate prev chunk.
-			** Start a new chunk.
-			**
-			** Once we start a new VCONTIG chunk, dma_offset
-			** can't change. And we need the offset from the first
-			** chunk - not the last one. Ergo Successive chunks
-			** must start on page boundaries and dove tail
-			** with it's predecessor.
-			*/
-			vcontig_sg->dma_length = vcontig_len;
-
-			vcontig_sg = startsg;
-			vcontig_len = startsg->length;
-
-			/*
-			** 3) do the entries end/start on page boundaries?
-			**    Don't update vcontig_end until we've checked.
-			*/
-			if (DMA_CONTIG(vcontig_end, vaddr))
-			{
-				vcontig_end = vcontig_len + vaddr;
-				dma_len += vcontig_len;
-				continue;
-			} else {
-				break;
-			}
-		}
-
-		/*
-		** End of DMA Stream
-		** Terminate last VCONTIG block.
-		** Allocate space for DMA stream.
-		*/
-		vcontig_sg->dma_length = vcontig_len;
-		dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
-		ASSERT(dma_len <= DMA_CHUNK_SIZE);
-		idx = sba_alloc_range(ioc, dev, dma_len);
-		if (idx < 0) {
-			dma_sg->dma_length = 0;
-			return -1;
-		}
-		dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
-						   | dma_offset);
-		n_mappings++;
-	}
-
-	return n_mappings;
-}
-
-static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
-			       int nents, enum dma_data_direction dir,
-			       unsigned long attrs);
-/**
- * sba_map_sg - map Scatter/Gather list
- * @dev: instance of PCI owned by the driver that's asking.
- * @sglist:  array of buffer/length pairs
- * @nents:  number of entries in list
- * @dir:  R/W or both.
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
-			    int nents, enum dma_data_direction dir,
-			    unsigned long attrs)
-{
-	struct ioc *ioc;
-	int coalesced, filled = 0;
-#ifdef ASSERT_PDIR_SANITY
-	unsigned long flags;
-#endif
-#ifdef ALLOW_IOV_BYPASS_SG
-	struct scatterlist *sg;
-#endif
-
-	DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-#ifdef ALLOW_IOV_BYPASS_SG
-	ASSERT(to_pci_dev(dev)->dma_mask);
-	if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) {
-		for_each_sg(sglist, sg, nents, filled) {
-			sg->dma_length = sg->length;
-			sg->dma_address = virt_to_phys(sba_sg_address(sg));
-		}
-		return filled;
-	}
-#endif
-	/* Fast path single entry scatterlists. */
-	if (nents == 1) {
-		sglist->dma_length = sglist->length;
-		sglist->dma_address = sba_map_page(dev, sg_page(sglist),
-				sglist->offset, sglist->length, dir, attrs);
-		if (dma_mapping_error(dev, sglist->dma_address))
-			return -EIO;
-		return 1;
-	}
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	if (sba_check_pdir(ioc,"Check before sba_map_sg_attrs()"))
-	{
-		sba_dump_sg(ioc, sglist, nents);
-		panic("Check before sba_map_sg_attrs()");
-	}
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	prefetch(ioc->res_hint);
-
-	/*
-	** First coalesce the chunks and allocate I/O pdir space
-	**
-	** If this is one DMA stream, we can properly map using the
-	** correct virtual address associated with each DMA page.
-	** w/o this association, we wouldn't have coherent DMA!
-	** Access to the virtual address is what forces a two pass algorithm.
-	*/
-	coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
-	if (coalesced < 0) {
-		sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
-		return -ENOMEM;
-	}
-
-	/*
-	** Program the I/O Pdir
-	**
-	** map the virtual addresses to the I/O Pdir
-	** o dma_address will contain the pdir index
-	** o dma_len will contain the number of bytes to map
-	** o address contains the virtual address.
-	*/
-	filled = sba_fill_pdir(ioc, sglist, nents);
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	if (sba_check_pdir(ioc,"Check after sba_map_sg_attrs()"))
-	{
-		sba_dump_sg(ioc, sglist, nents);
-		panic("Check after sba_map_sg_attrs()\n");
-	}
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	ASSERT(coalesced == filled);
-	DBG_RUN_SG("%s() DONE %d mappings\n", __func__, filled);
-
-	return filled;
-}
-
-/**
- * sba_unmap_sg_attrs - unmap Scatter/Gather list
- * @dev: instance of PCI owned by the driver that's asking.
- * @sglist:  array of buffer/length pairs
- * @nents:  number of entries in list
- * @dir:  R/W or both.
- * @attrs: optional dma attributes
- *
- * See Documentation/core-api/dma-api-howto.rst
- */
-static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
-			       int nents, enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-#ifdef ASSERT_PDIR_SANITY
-	struct ioc *ioc;
-	unsigned long flags;
-#endif
-
-	DBG_RUN_SG("%s() START %d entries,  %p,%x\n",
-		   __func__, nents, sba_sg_address(sglist), sglist->length);
-
-#ifdef ASSERT_PDIR_SANITY
-	ioc = GET_IOC(dev);
-	ASSERT(ioc);
-
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_check_pdir(ioc,"Check before sba_unmap_sg_attrs()");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-	while (nents && sglist->dma_length) {
-
-		sba_unmap_page(dev, sglist->dma_address, sglist->dma_length,
-			       dir, attrs);
-		sglist = sg_next(sglist);
-		nents--;
-	}
-
-	DBG_RUN_SG("%s() DONE (nents %d)\n", __func__,  nents);
-
-#ifdef ASSERT_PDIR_SANITY
-	spin_lock_irqsave(&ioc->res_lock, flags);
-	sba_check_pdir(ioc,"Check after sba_unmap_sg_attrs()");
-	spin_unlock_irqrestore(&ioc->res_lock, flags);
-#endif
-
-}
-
-/**************************************************************
-*
-*   Initialization and claim
-*
-***************************************************************/
-
-static void
-ioc_iova_init(struct ioc *ioc)
-{
-	int tcnfg;
-	int agp_found = 0;
-	struct pci_dev *device = NULL;
-#ifdef FULL_VALID_PDIR
-	unsigned long index;
-#endif
-
-	/*
-	** Firmware programs the base and size of a "safe IOVA space"
-	** (one that doesn't overlap memory or LMMIO space) in the
-	** IBASE and IMASK registers.
-	*/
-	ioc->ibase = READ_REG(ioc->ioc_hpa + IOC_IBASE) & ~0x1UL;
-	ioc->imask = READ_REG(ioc->ioc_hpa + IOC_IMASK) | 0xFFFFFFFF00000000UL;
-
-	ioc->iov_size = ~ioc->imask + 1;
-
-	DBG_INIT("%s() hpa %p IOV base 0x%lx mask 0x%lx (%dMB)\n",
-		__func__, ioc->ioc_hpa, ioc->ibase, ioc->imask,
-		ioc->iov_size >> 20);
-
-	switch (iovp_size) {
-		case  4*1024: tcnfg = 0; break;
-		case  8*1024: tcnfg = 1; break;
-		case 16*1024: tcnfg = 2; break;
-		case 64*1024: tcnfg = 3; break;
-		default:
-			panic(PFX "Unsupported IOTLB page size %ldK",
-				iovp_size >> 10);
-			break;
-	}
-	WRITE_REG(tcnfg, ioc->ioc_hpa + IOC_TCNFG);
-
-	ioc->pdir_size = (ioc->iov_size / iovp_size) * PDIR_ENTRY_SIZE;
-	ioc->pdir_base = (void *) __get_free_pages(GFP_KERNEL,
-						   get_order(ioc->pdir_size));
-	if (!ioc->pdir_base)
-		panic(PFX "Couldn't allocate I/O Page Table\n");
-
-	memset(ioc->pdir_base, 0, ioc->pdir_size);
-
-	DBG_INIT("%s() IOV page size %ldK pdir %p size %x\n", __func__,
-		iovp_size >> 10, ioc->pdir_base, ioc->pdir_size);
-
-	ASSERT(ALIGN((unsigned long) ioc->pdir_base, 4*1024) == (unsigned long) ioc->pdir_base);
-	WRITE_REG(virt_to_phys(ioc->pdir_base), ioc->ioc_hpa + IOC_PDIR_BASE);
-
-	/*
-	** If an AGP device is present, only use half of the IOV space
-	** for PCI DMA.  Unfortunately we can't know ahead of time
-	** whether GART support will actually be used, for now we
-	** can just key on an AGP device found in the system.
-	** We program the next pdir index after we stop w/ a key for
-	** the GART code to handshake on.
-	*/
-	for_each_pci_dev(device)	
-		agp_found |= pci_find_capability(device, PCI_CAP_ID_AGP);
-
-	if (agp_found && reserve_sba_gart) {
-		printk(KERN_INFO PFX "reserving %dMb of IOVA space at 0x%lx for agpgart\n",
-		      ioc->iov_size/2 >> 20, ioc->ibase + ioc->iov_size/2);
-		ioc->pdir_size /= 2;
-		((u64 *)ioc->pdir_base)[PDIR_INDEX(ioc->iov_size/2)] = ZX1_SBA_IOMMU_COOKIE;
-	}
-#ifdef FULL_VALID_PDIR
-	/*
-  	** Check to see if the spill page has been allocated, we don't need more than
-	** one across multiple SBAs.
-	*/
-	if (!prefetch_spill_page) {
-		char *spill_poison = "SBAIOMMU POISON";
-		int poison_size = 16;
-		void *poison_addr, *addr;
-
-		addr = (void *)__get_free_pages(GFP_KERNEL, get_order(iovp_size));
-		if (!addr)
-			panic(PFX "Couldn't allocate PDIR spill page\n");
-
-		poison_addr = addr;
-		for ( ; (u64) poison_addr < addr + iovp_size; poison_addr += poison_size)
-			memcpy(poison_addr, spill_poison, poison_size);
-
-		prefetch_spill_page = virt_to_phys(addr);
-
-		DBG_INIT("%s() prefetch spill addr: 0x%lx\n", __func__, prefetch_spill_page);
-	}
-	/*
-  	** Set all the PDIR entries valid w/ the spill page as the target
-	*/
-	for (index = 0 ; index < (ioc->pdir_size / PDIR_ENTRY_SIZE) ; index++)
-		((u64 *)ioc->pdir_base)[index] = (0x80000000000000FF | prefetch_spill_page);
-#endif
-
-	/* Clear I/O TLB of any possible entries */
-	WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM);
-	READ_REG(ioc->ioc_hpa + IOC_PCOM);
-
-	/* Enable IOVA translation */
-	WRITE_REG(ioc->ibase | 1, ioc->ioc_hpa + IOC_IBASE);
-	READ_REG(ioc->ioc_hpa + IOC_IBASE);
-}
-
-static void __init
-ioc_resource_init(struct ioc *ioc)
-{
-	spin_lock_init(&ioc->res_lock);
-#if DELAYED_RESOURCE_CNT > 0
-	spin_lock_init(&ioc->saved_lock);
-#endif
-
-	/* resource map size dictated by pdir_size */
-	ioc->res_size = ioc->pdir_size / PDIR_ENTRY_SIZE; /* entries */
-	ioc->res_size >>= 3;  /* convert bit count to byte count */
-	DBG_INIT("%s() res_size 0x%x\n", __func__, ioc->res_size);
-
-	ioc->res_map = (char *) __get_free_pages(GFP_KERNEL,
-						 get_order(ioc->res_size));
-	if (!ioc->res_map)
-		panic(PFX "Couldn't allocate resource map\n");
-
-	memset(ioc->res_map, 0, ioc->res_size);
-	/* next available IOVP - circular search */
-	ioc->res_hint = (unsigned long *) ioc->res_map;
-
-#ifdef ASSERT_PDIR_SANITY
-	/* Mark first bit busy - ie no IOVA 0 */
-	ioc->res_map[0] = 0x1;
-	ioc->pdir_base[0] = 0x8000000000000000ULL | ZX1_SBA_IOMMU_COOKIE;
-#endif
-#ifdef FULL_VALID_PDIR
-	/* Mark the last resource used so we don't prefetch beyond IOVA space */
-	ioc->res_map[ioc->res_size - 1] |= 0x80UL; /* res_map is chars */
-	ioc->pdir_base[(ioc->pdir_size / PDIR_ENTRY_SIZE) - 1] = (0x80000000000000FF
-							      | prefetch_spill_page);
-#endif
-
-	DBG_INIT("%s() res_map %x %p\n", __func__,
-		 ioc->res_size, (void *) ioc->res_map);
-}
-
-static void __init
-ioc_sac_init(struct ioc *ioc)
-{
-	struct pci_dev *sac = NULL;
-	struct pci_controller *controller = NULL;
-
-	/*
-	 * pci_alloc_coherent() must return a DMA address which is
-	 * SAC (single address cycle) addressable, so allocate a
-	 * pseudo-device to enforce that.
-	 */
-	sac = kzalloc(sizeof(*sac), GFP_KERNEL);
-	if (!sac)
-		panic(PFX "Couldn't allocate struct pci_dev");
-
-	controller = kzalloc(sizeof(*controller), GFP_KERNEL);
-	if (!controller)
-		panic(PFX "Couldn't allocate struct pci_controller");
-
-	controller->iommu = ioc;
-	sac->sysdata = controller;
-	sac->dma_mask = 0xFFFFFFFFUL;
-	sac->dev.bus = &pci_bus_type;
-	ioc->sac_only_dev = sac;
-}
-
-static void __init
-ioc_zx1_init(struct ioc *ioc)
-{
-	unsigned long rope_config;
-	unsigned int i;
-
-	if (ioc->rev < 0x20)
-		panic(PFX "IOC 2.0 or later required for IOMMU support\n");
-
-	/* 38 bit memory controller + extra bit for range displaced by MMIO */
-	ioc->dma_mask = (0x1UL << 39) - 1;
-
-	/*
-	** Clear ROPE(N)_CONFIG AO bit.
-	** Disables "NT Ordering" (~= !"Relaxed Ordering")
-	** Overrides bit 1 in DMA Hint Sets.
-	** Improves netperf UDP_STREAM by ~10% for tg3 on bcm5701.
-	*/
-	for (i=0; i<(8*8); i+=8) {
-		rope_config = READ_REG(ioc->ioc_hpa + IOC_ROPE0_CFG + i);
-		rope_config &= ~IOC_ROPE_AO;
-		WRITE_REG(rope_config, ioc->ioc_hpa + IOC_ROPE0_CFG + i);
-	}
-}
-
-typedef void (initfunc)(struct ioc *);
-
-struct ioc_iommu {
-	u32 func_id;
-	char *name;
-	initfunc *init;
-};
-
-static struct ioc_iommu ioc_iommu_info[] __initdata = {
-	{ ZX1_IOC_ID, "zx1", ioc_zx1_init },
-	{ ZX2_IOC_ID, "zx2", NULL },
-	{ SX1000_IOC_ID, "sx1000", NULL },
-	{ SX2000_IOC_ID, "sx2000", NULL },
-};
-
-static void __init ioc_init(unsigned long hpa, struct ioc *ioc)
-{
-	struct ioc_iommu *info;
-
-	ioc->next = ioc_list;
-	ioc_list = ioc;
-
-	ioc->ioc_hpa = ioremap(hpa, 0x1000);
-
-	ioc->func_id = READ_REG(ioc->ioc_hpa + IOC_FUNC_ID);
-	ioc->rev = READ_REG(ioc->ioc_hpa + IOC_FCLASS) & 0xFFUL;
-	ioc->dma_mask = 0xFFFFFFFFFFFFFFFFUL;	/* conservative */
-
-	for (info = ioc_iommu_info; info < ioc_iommu_info + ARRAY_SIZE(ioc_iommu_info); info++) {
-		if (ioc->func_id == info->func_id) {
-			ioc->name = info->name;
-			if (info->init)
-				(info->init)(ioc);
-		}
-	}
-
-	iovp_size = (1 << iovp_shift);
-	iovp_mask = ~(iovp_size - 1);
-
-	DBG_INIT("%s: PAGE_SIZE %ldK, iovp_size %ldK\n", __func__,
-		PAGE_SIZE >> 10, iovp_size >> 10);
-
-	if (!ioc->name) {
-		ioc->name = kmalloc(24, GFP_KERNEL);
-		if (ioc->name)
-			sprintf((char *) ioc->name, "Unknown (%04x:%04x)",
-				ioc->func_id & 0xFFFF, (ioc->func_id >> 16) & 0xFFFF);
-		else
-			ioc->name = "Unknown";
-	}
-
-	ioc_iova_init(ioc);
-	ioc_resource_init(ioc);
-	ioc_sac_init(ioc);
-
-	printk(KERN_INFO PFX
-		"%s %d.%d HPA 0x%lx IOVA space %dMb at 0x%lx\n",
-		ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
-		hpa, ioc->iov_size >> 20, ioc->ibase);
-}
-
-
-
-/**************************************************************************
-**
-**   SBA initialization code (HW and SW)
-**
-**   o identify SBA chip itself
-**   o FIXME: initialize DMA hints for reasonable defaults
-**
-**************************************************************************/
-
-#ifdef CONFIG_PROC_FS
-static void *
-ioc_start(struct seq_file *s, loff_t *pos)
-{
-	struct ioc *ioc;
-	loff_t n = *pos;
-
-	for (ioc = ioc_list; ioc; ioc = ioc->next)
-		if (!n--)
-			return ioc;
-
-	return NULL;
-}
-
-static void *
-ioc_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct ioc *ioc = v;
-
-	++*pos;
-	return ioc->next;
-}
-
-static void
-ioc_stop(struct seq_file *s, void *v)
-{
-}
-
-static int
-ioc_show(struct seq_file *s, void *v)
-{
-	struct ioc *ioc = v;
-	unsigned long *res_ptr = (unsigned long *)ioc->res_map;
-	int i, used = 0;
-
-	seq_printf(s, "Hewlett Packard %s IOC rev %d.%d\n",
-		ioc->name, ((ioc->rev >> 4) & 0xF), (ioc->rev & 0xF));
-#ifdef CONFIG_NUMA
-	if (ioc->node != NUMA_NO_NODE)
-		seq_printf(s, "NUMA node       : %d\n", ioc->node);
-#endif
-	seq_printf(s, "IOVA size       : %ld MB\n", ((ioc->pdir_size >> 3) * iovp_size)/(1024*1024));
-	seq_printf(s, "IOVA page size  : %ld kb\n", iovp_size/1024);
-
-	for (i = 0; i < (ioc->res_size / sizeof(unsigned long)); ++i, ++res_ptr)
-		used += hweight64(*res_ptr);
-
-	seq_printf(s, "PDIR size       : %d entries\n", ioc->pdir_size >> 3);
-	seq_printf(s, "PDIR used       : %d entries\n", used);
-
-#ifdef PDIR_SEARCH_TIMING
-	{
-		unsigned long i = 0, avg = 0, min, max;
-		min = max = ioc->avg_search[0];
-		for (i = 0; i < SBA_SEARCH_SAMPLE; i++) {
-			avg += ioc->avg_search[i];
-			if (ioc->avg_search[i] > max) max = ioc->avg_search[i];
-			if (ioc->avg_search[i] < min) min = ioc->avg_search[i];
-		}
-		avg /= SBA_SEARCH_SAMPLE;
-		seq_printf(s, "Bitmap search   : %ld/%ld/%ld (min/avg/max CPU Cycles/IOVA page)\n",
-		           min, avg, max);
-	}
-#endif
-#ifndef ALLOW_IOV_BYPASS
-	 seq_printf(s, "IOVA bypass disabled\n");
-#endif
-	return 0;
-}
-
-static const struct seq_operations ioc_seq_ops = {
-	.start = ioc_start,
-	.next  = ioc_next,
-	.stop  = ioc_stop,
-	.show  = ioc_show
-};
-
-static void __init
-ioc_proc_init(void)
-{
-	struct proc_dir_entry *dir;
-
-	dir = proc_mkdir("bus/mckinley", NULL);
-	if (!dir)
-		return;
-
-	proc_create_seq(ioc_list->name, 0, dir, &ioc_seq_ops);
-}
-#endif
-
-static void
-sba_connect_bus(struct pci_bus *bus)
-{
-	acpi_handle handle, parent;
-	acpi_status status;
-	struct ioc *ioc;
-
-	if (!PCI_CONTROLLER(bus))
-		panic(PFX "no sysdata on bus %d!\n", bus->number);
-
-	if (PCI_CONTROLLER(bus)->iommu)
-		return;
-
-	handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
-	if (!handle)
-		return;
-
-	/*
-	 * The IOC scope encloses PCI root bridges in the ACPI
-	 * namespace, so work our way out until we find an IOC we
-	 * claimed previously.
-	 */
-	do {
-		for (ioc = ioc_list; ioc; ioc = ioc->next)
-			if (ioc->handle == handle) {
-				PCI_CONTROLLER(bus)->iommu = ioc;
-				return;
-			}
-
-		status = acpi_get_parent(handle, &parent);
-		handle = parent;
-	} while (ACPI_SUCCESS(status));
-
-	printk(KERN_WARNING "No IOC for PCI Bus %04x:%02x in ACPI\n", pci_domain_nr(bus), bus->number);
-}
-
-static void __init
-sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
-{
-#ifdef CONFIG_NUMA
-	unsigned int node;
-
-	node = acpi_get_node(handle);
-	if (node != NUMA_NO_NODE && !node_online(node))
-		node = NUMA_NO_NODE;
-
-	ioc->node = node;
-#endif
-}
-
-static void __init acpi_sba_ioc_add(struct ioc *ioc)
-{
-	acpi_handle handle = ioc->handle;
-	acpi_status status;
-	u64 hpa, length;
-	struct acpi_device_info *adi;
-
-	ioc_found = ioc->next;
-	status = hp_acpi_csr_space(handle, &hpa, &length);
-	if (ACPI_FAILURE(status))
-		goto err;
-
-	status = acpi_get_object_info(handle, &adi);
-	if (ACPI_FAILURE(status))
-		goto err;
-
-	/*
-	 * For HWP0001, only SBA appears in ACPI namespace.  It encloses the PCI
-	 * root bridges, and its CSR space includes the IOC function.
-	 */
-	if (strncmp("HWP0001", adi->hardware_id.string, 7) == 0) {
-		hpa += ZX1_IOC_OFFSET;
-		/* zx1 based systems default to kernel page size iommu pages */
-		if (!iovp_shift)
-			iovp_shift = min(PAGE_SHIFT, 16);
-	}
-	kfree(adi);
-
-	/*
-	 * default anything not caught above or specified on cmdline to 4k
-	 * iommu page size
-	 */
-	if (!iovp_shift)
-		iovp_shift = 12;
-
-	ioc_init(hpa, ioc);
-	/* setup NUMA node association */
-	sba_map_ioc_to_node(ioc, handle);
-	return;
-
- err:
-	kfree(ioc);
-}
-
-static const struct acpi_device_id hp_ioc_iommu_device_ids[] = {
-	{"HWP0001", 0},
-	{"HWP0004", 0},
-	{"", 0},
-};
-
-static int acpi_sba_ioc_attach(struct acpi_device *device,
-			       const struct acpi_device_id *not_used)
-{
-	struct ioc *ioc;
-
-	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
-	if (!ioc)
-		return -ENOMEM;
-
-	ioc->next = ioc_found;
-	ioc_found = ioc;
-	ioc->handle = device->handle;
-	return 1;
-}
-
-
-static struct acpi_scan_handler acpi_sba_ioc_handler = {
-	.ids	= hp_ioc_iommu_device_ids,
-	.attach	= acpi_sba_ioc_attach,
-};
-
-static int __init acpi_sba_ioc_init_acpi(void)
-{
-	return acpi_scan_add_handler(&acpi_sba_ioc_handler);
-}
-/* This has to run before acpi_scan_init(). */
-arch_initcall(acpi_sba_ioc_init_acpi);
-
-static int sba_dma_supported (struct device *dev, u64 mask)
-{
-	/* make sure it's at least 32bit capable */
-	return ((mask & 0xFFFFFFFFUL) == 0xFFFFFFFFUL);
-}
-
-static const struct dma_map_ops sba_dma_ops = {
-	.alloc			= sba_alloc_coherent,
-	.free			= sba_free_coherent,
-	.map_page		= sba_map_page,
-	.unmap_page		= sba_unmap_page,
-	.map_sg			= sba_map_sg_attrs,
-	.unmap_sg		= sba_unmap_sg_attrs,
-	.dma_supported		= sba_dma_supported,
-	.mmap			= dma_common_mmap,
-	.get_sgtable		= dma_common_get_sgtable,
-	.alloc_pages		= dma_common_alloc_pages,
-	.free_pages		= dma_common_free_pages,
-};
-
-static int __init
-sba_init(void)
-{
-	/*
-	 * If we are booting a kdump kernel, the sba_iommu will cause devices
-	 * that were not shutdown properly to MCA as soon as they are turned
-	 * back on.  Our only option for a successful kdump kernel boot is to
-	 * use swiotlb.
-	 */
-	if (is_kdump_kernel())
-		return 0;
-
-	/*
-	 * ioc_found should be populated by the acpi_sba_ioc_handler's .attach()
-	 * routine, but that only happens if acpi_scan_init() has already run.
-	 */
-	while (ioc_found)
-		acpi_sba_ioc_add(ioc_found);
-
-	if (!ioc_list)
-		return 0;
-
-	{
-		struct pci_bus *b = NULL;
-		while ((b = pci_find_next_bus(b)) != NULL)
-			sba_connect_bus(b);
-	}
-
-	/* no need for swiotlb with the iommu */
-	swiotlb_exit();
-	dma_ops = &sba_dma_ops;
-
-#ifdef CONFIG_PROC_FS
-	ioc_proc_init();
-#endif
-	return 0;
-}
-
-subsys_initcall(sba_init); /* must be initialized after ACPI etc., but before any drivers... */
-
-static int __init
-nosbagart(char *str)
-{
-	reserve_sba_gart = 0;
-	return 1;
-}
-
-__setup("nosbagart", nosbagart);
-
-static int __init
-sba_page_override(char *str)
-{
-	unsigned long page_size;
-
-	page_size = memparse(str, &str);
-	switch (page_size) {
-		case 4096:
-		case 8192:
-		case 16384:
-		case 65536:
-			iovp_shift = ffs(page_size) - 1;
-			break;
-		default:
-			printk("%s: unknown/unsupported iommu page size %ld\n",
-			       __func__, page_size);
-	}
-
-	return 1;
-}
-
-__setup("sbapagesize=",sba_page_override);
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
deleted file mode 100644
index aefae2efde9f..000000000000
--- a/arch/ia64/include/asm/Kbuild
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generated-y += syscall_table.h
-generic-y += agp.h
-generic-y += kvm_para.h
-generic-y += mcs_spinlock.h
-generic-y += vtime.h
diff --git a/arch/ia64/include/asm/acenv.h b/arch/ia64/include/asm/acenv.h
deleted file mode 100644
index 9d673cd4c2ad..000000000000
--- a/arch/ia64/include/asm/acenv.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * IA64 specific ACPICA environments and implementation
- *
- * Copyright (C) 2014, Intel Corporation
- *   Author: Lv Zheng <lv.zheng@intel.com>
- */
-
-#ifndef _ASM_IA64_ACENV_H
-#define _ASM_IA64_ACENV_H
-
-#include <asm/intrinsics.h>
-
-#define COMPILER_DEPENDENT_INT64	long
-#define COMPILER_DEPENDENT_UINT64	unsigned long
-
-/* Asm macros */
-
-static inline int
-ia64_acpi_acquire_global_lock(unsigned int *lock)
-{
-	unsigned int old, new, val;
-	do {
-		old = *lock;
-		new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
-		val = ia64_cmpxchg4_acq(lock, new, old);
-	} while (unlikely (val != old));
-	return (new < 3) ? -1 : 0;
-}
-
-static inline int
-ia64_acpi_release_global_lock(unsigned int *lock)
-{
-	unsigned int old, new, val;
-	do {
-		old = *lock;
-		new = old & ~0x3;
-		val = ia64_cmpxchg4_acq(lock, new, old);
-	} while (unlikely (val != old));
-	return old & 0x1;
-}
-
-#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq)				\
-	((Acq) = ia64_acpi_acquire_global_lock(&facs->global_lock))
-
-#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq)				\
-	((Acq) = ia64_acpi_release_global_lock(&facs->global_lock))
-
-#endif /* _ASM_IA64_ACENV_H */
diff --git a/arch/ia64/include/asm/acpi-ext.h b/arch/ia64/include/asm/acpi-ext.h
deleted file mode 100644
index eaa57583d151..000000000000
--- a/arch/ia64/include/asm/acpi-ext.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P.
- *	Alex Williamson <alex.williamson@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * Vendor specific extensions to ACPI.
- */
-
-#ifndef _ASM_IA64_ACPI_EXT_H
-#define _ASM_IA64_ACPI_EXT_H
-
-#include <linux/types.h>
-
-extern acpi_status hp_acpi_csr_space (acpi_handle, u64 *base, u64 *length);
-
-#endif /* _ASM_IA64_ACPI_EXT_H */
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
deleted file mode 100644
index 58500a964238..000000000000
--- a/arch/ia64/include/asm/acpi.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- *  Copyright (C) 1999 VA Linux Systems
- *  Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- *  Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
- *  Copyright (C) 2001,2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- */
-
-#ifndef _ASM_ACPI_H
-#define _ASM_ACPI_H
-
-#ifdef __KERNEL__
-
-#include <acpi/proc_cap_intel.h>
-
-#include <linux/init.h>
-#include <linux/numa.h>
-#include <asm/numa.h>
-
-
-extern int acpi_lapic;
-#define acpi_disabled 0	/* ACPI always enabled on IA64 */
-#define acpi_noirq 0	/* ACPI always enabled on IA64 */
-#define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */
-#define acpi_strict 1	/* no ACPI spec workarounds on IA64 */
-
-static inline bool acpi_has_cpu_in_madt(void)
-{
-	return !!acpi_lapic;
-}
-
-#define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */
-static inline void disable_acpi(void) { }
-
-int acpi_request_vector (u32 int_type);
-int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
-
-/* Low-level suspend routine. */
-extern int acpi_suspend_lowlevel(void);
-
-static inline unsigned long acpi_get_wakeup_address(void)
-{
-	return 0;
-}
-
-/*
- * Record the cpei override flag and current logical cpu. This is
- * useful for CPU removal.
- */
-extern unsigned int can_cpei_retarget(void);
-extern unsigned int is_cpu_cpei_target(unsigned int cpu);
-extern void set_cpei_target_cpu(unsigned int cpu);
-extern unsigned int get_cpei_target_cpu(void);
-extern void prefill_possible_map(void);
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-extern int additional_cpus;
-#else
-#define additional_cpus 0
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-#if MAX_NUMNODES > 256
-#define MAX_PXM_DOMAINS MAX_NUMNODES
-#else
-#define MAX_PXM_DOMAINS (256)
-#endif
-extern int pxm_to_nid_map[MAX_PXM_DOMAINS];
-extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
-#endif
-
-static inline bool arch_has_acpi_pdc(void) { return true; }
-static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
-{
-	*cap |= ACPI_PROC_CAP_EST_CAPABILITY_SMP;
-}
-
-#ifdef CONFIG_ACPI_NUMA
-extern cpumask_t early_cpu_possible_map;
-#define for_each_possible_early_cpu(cpu)  \
-	for_each_cpu((cpu), &early_cpu_possible_map)
-
-static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus)
-{
-	int low_cpu, high_cpu;
-	int cpu;
-	int next_nid = 0;
-
-	low_cpu = cpumask_weight(&early_cpu_possible_map);
-
-	high_cpu = max(low_cpu, min_cpus);
-	high_cpu = min(high_cpu + reserve_cpus, NR_CPUS);
-
-	for (cpu = low_cpu; cpu < high_cpu; cpu++) {
-		cpumask_set_cpu(cpu, &early_cpu_possible_map);
-		if (node_cpuid[cpu].nid == NUMA_NO_NODE) {
-			node_cpuid[cpu].nid = next_nid;
-			next_nid++;
-			if (next_nid >= num_online_nodes())
-				next_nid = 0;
-		}
-	}
-}
-
-extern void acpi_numa_fixup(void);
-
-#endif /* CONFIG_ACPI_NUMA */
-
-#endif /*__KERNEL__*/
-
-#endif /*_ASM_ACPI_H*/
diff --git a/arch/ia64/include/asm/asm-offsets.h b/arch/ia64/include/asm/asm-offsets.h
deleted file mode 100644
index d370ee36a182..000000000000
--- a/arch/ia64/include/asm/asm-offsets.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <generated/asm-offsets.h>
diff --git a/arch/ia64/include/asm/asm-prototypes.h b/arch/ia64/include/asm/asm-prototypes.h
deleted file mode 100644
index a96689447a74..000000000000
--- a/arch/ia64/include/asm/asm-prototypes.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ASM_PROTOTYPES_H
-#define _ASM_IA64_ASM_PROTOTYPES_H
-
-#include <asm/cacheflush.h>
-#include <asm/checksum.h>
-#include <asm/esi.h>
-#include <asm/ftrace.h>
-#include <asm/page.h>
-#include <asm/pal.h>
-#include <asm/string.h>
-#include <linux/uaccess.h>
-#include <asm/unwind.h>
-#include <asm/xor.h>
-
-extern const char ia64_ivt[];
-
-signed int __divsi3(signed int, unsigned int);
-signed int __modsi3(signed int, unsigned int);
-
-signed long long __divdi3(signed long long, unsigned long long);
-signed long long __moddi3(signed long long, unsigned long long);
-
-unsigned int __udivsi3(unsigned int, unsigned int);
-unsigned int __umodsi3(unsigned int, unsigned int);
-
-unsigned long long __udivdi3(unsigned long long, unsigned long long);
-unsigned long long __umoddi3(unsigned long long, unsigned long long);
-
-#endif /* _ASM_IA64_ASM_PROTOTYPES_H */
diff --git a/arch/ia64/include/asm/asmmacro.h b/arch/ia64/include/asm/asmmacro.h
deleted file mode 100644
index 52619c517f09..000000000000
--- a/arch/ia64/include/asm/asmmacro.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ASMMACRO_H
-#define _ASM_IA64_ASMMACRO_H
-
-/*
- * Copyright (C) 2000-2001, 2003-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#define ENTRY(name)				\
-	.align 32;				\
-	.proc name;				\
-name:
-
-#define ENTRY_MIN_ALIGN(name)			\
-	.align 16;				\
-	.proc name;				\
-name:
-
-#define GLOBAL_ENTRY(name)			\
-	.global name;				\
-	ENTRY(name)
-
-#define END(name)				\
-	.endp name
-
-/*
- * Helper macros to make unwind directives more readable:
- */
-
-/* prologue_gr: */
-#define ASM_UNW_PRLG_RP			0x8
-#define ASM_UNW_PRLG_PFS		0x4
-#define ASM_UNW_PRLG_PSP		0x2
-#define ASM_UNW_PRLG_PR			0x1
-#define ASM_UNW_PRLG_GRSAVE(ninputs)	(32+(ninputs))
-
-/*
- * Helper macros for accessing user memory.
- *
- * When adding any new .section/.previous entries here, make sure to
- * also add it to the DISCARD section in arch/ia64/kernel/gate.lds.S or
- * unpleasant things will happen.
- */
-
-	.section "__ex_table", "a"		// declare section & section attributes
-	.previous
-
-# define EX(y,x...)				\
-	.xdata4 "__ex_table", 99f-., y-.;	\
-  [99:]	x
-# define EXCLR(y,x...)				\
-	.xdata4 "__ex_table", 99f-., y-.+4;	\
-  [99:]	x
-
-/*
- * Tag MCA recoverable instruction ranges.
- */
-
-	.section "__mca_table", "a"		// declare section & section attributes
-	.previous
-
-# define MCA_RECOVER_RANGE(y)			\
-	.xdata4 "__mca_table", y-., 99f-.;	\
-  [99:]
-
-/*
- * Mark instructions that need a load of a virtual address patched to be
- * a load of a physical address.  We use this either in critical performance
- * path (ivt.S - TLB miss processing) or in places where it might not be
- * safe to use a "tpa" instruction (mca_asm.S - error recovery).
- */
-	.section ".data..patch.vtop", "a"	// declare section & section attributes
-	.previous
-
-#define	LOAD_PHYSICAL(pr, reg, obj)		\
-[1:](pr)movl reg = obj;				\
-	.xdata4 ".data..patch.vtop", 1b-.
-
-/*
- * For now, we always put in the McKinley E9 workaround.  On CPUs that don't need it,
- * we'll patch out the work-around bundles with NOPs, so their impact is minimal.
- */
-#define DO_MCKINLEY_E9_WORKAROUND
-
-#ifdef DO_MCKINLEY_E9_WORKAROUND
-	.section ".data..patch.mckinley_e9", "a"
-	.previous
-/* workaround for Itanium 2 Errata 9: */
-# define FSYS_RETURN					\
-	.xdata4 ".data..patch.mckinley_e9", 1f-.;	\
-1:{ .mib;						\
-	nop.m 0;					\
-	mov r16=ar.pfs;					\
-	br.call.sptk.many b7=2f;;			\
-  };							\
-2:{ .mib;						\
-	nop.m 0;					\
-	mov ar.pfs=r16;					\
-	br.ret.sptk.many b6;;				\
-  }
-#else
-# define FSYS_RETURN	br.ret.sptk.many b6
-#endif
-
-/*
- * If physical stack register size is different from DEF_NUM_STACK_REG,
- * dynamically patch the kernel for correct size.
- */
-	.section ".data..patch.phys_stack_reg", "a"
-	.previous
-#define LOAD_PHYS_STACK_REG_SIZE(reg)			\
-[1:]	adds reg=IA64_NUM_PHYS_STACK_REG*8+8,r0;	\
-	.xdata4 ".data..patch.phys_stack_reg", 1b-.
-
-/*
- * Up until early 2004, use of .align within a function caused bad unwind info.
- * TEXT_ALIGN(n) expands into ".align n" if a fixed GAS is available or into nothing
- * otherwise.
- */
-#ifdef HAVE_WORKING_TEXT_ALIGN
-# define TEXT_ALIGN(n)	.align n
-#else
-# define TEXT_ALIGN(n)
-#endif
-
-#ifdef HAVE_SERIALIZE_DIRECTIVE
-# define dv_serialize_data		.serialize.data
-# define dv_serialize_instruction	.serialize.instruction
-#else
-# define dv_serialize_data
-# define dv_serialize_instruction
-#endif
-
-#endif /* _ASM_IA64_ASMMACRO_H */
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
deleted file mode 100644
index 6540a628d257..000000000000
--- a/arch/ia64/include/asm/atomic.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ATOMIC_H
-#define _ASM_IA64_ATOMIC_H
-
-/*
- * Atomic operations that C can't guarantee us.  Useful for
- * resource counting etc..
- *
- * NOTE: don't mess with the types below!  The "unsigned long" and
- * "int" types were carefully placed so as to ensure proper operation
- * of the macros.
- *
- * Copyright (C) 1998, 1999, 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/types.h>
-
-#include <asm/intrinsics.h>
-#include <asm/barrier.h>
-
-
-#define ATOMIC64_INIT(i)	{ (i) }
-
-#define arch_atomic_read(v)	READ_ONCE((v)->counter)
-#define arch_atomic64_read(v)	READ_ONCE((v)->counter)
-
-#define arch_atomic_set(v,i)	WRITE_ONCE(((v)->counter), (i))
-#define arch_atomic64_set(v,i)	WRITE_ONCE(((v)->counter), (i))
-
-#define ATOMIC_OP(op, c_op)						\
-static __inline__ int							\
-ia64_atomic_##op (int i, atomic_t *v)					\
-{									\
-	__s32 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old); \
-	return new;							\
-}
-
-#define ATOMIC_FETCH_OP(op, c_op)					\
-static __inline__ int							\
-ia64_atomic_fetch_##op (int i, atomic_t *v)				\
-{									\
-	__s32 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic_t)) != old); \
-	return old;							\
-}
-
-#define ATOMIC_OPS(op, c_op)						\
-	ATOMIC_OP(op, c_op)						\
-	ATOMIC_FETCH_OP(op, c_op)
-
-ATOMIC_OPS(add, +)
-ATOMIC_OPS(sub, -)
-
-#ifdef __OPTIMIZE__
-#define __ia64_atomic_const(i)						\
-	static const int __ia64_atomic_p = __builtin_constant_p(i) ?	\
-		((i) == 1 || (i) == 4 || (i) == 8 || (i) == 16 ||	\
-		 (i) == -1 || (i) == -4 || (i) == -8 || (i) == -16) : 0;\
-	__ia64_atomic_p
-#else
-#define __ia64_atomic_const(i)	0
-#endif
-
-#define arch_atomic_add_return(i,v)					\
-({									\
-	int __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
-		: ia64_atomic_add(__ia64_aar_i, v);			\
-})
-
-#define arch_atomic_sub_return(i,v)					\
-({									\
-	int __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
-		: ia64_atomic_sub(__ia64_asr_i, v);			\
-})
-
-#define arch_atomic_fetch_add(i,v)					\
-({									\
-	int __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq)	\
-		: ia64_atomic_fetch_add(__ia64_aar_i, v);		\
-})
-
-#define arch_atomic_fetch_sub(i,v)					\
-({									\
-	int __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq)	\
-		: ia64_atomic_fetch_sub(__ia64_asr_i, v);		\
-})
-
-ATOMIC_FETCH_OP(and, &)
-ATOMIC_FETCH_OP(or, |)
-ATOMIC_FETCH_OP(xor, ^)
-
-#define arch_atomic_and(i,v)	(void)ia64_atomic_fetch_and(i,v)
-#define arch_atomic_or(i,v)	(void)ia64_atomic_fetch_or(i,v)
-#define arch_atomic_xor(i,v)	(void)ia64_atomic_fetch_xor(i,v)
-
-#define arch_atomic_fetch_and(i,v)	ia64_atomic_fetch_and(i,v)
-#define arch_atomic_fetch_or(i,v)	ia64_atomic_fetch_or(i,v)
-#define arch_atomic_fetch_xor(i,v)	ia64_atomic_fetch_xor(i,v)
-
-#undef ATOMIC_OPS
-#undef ATOMIC_FETCH_OP
-#undef ATOMIC_OP
-
-#define ATOMIC64_OP(op, c_op)						\
-static __inline__ s64							\
-ia64_atomic64_##op (s64 i, atomic64_t *v)				\
-{									\
-	s64 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic64_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old); \
-	return new;							\
-}
-
-#define ATOMIC64_FETCH_OP(op, c_op)					\
-static __inline__ s64							\
-ia64_atomic64_fetch_##op (s64 i, atomic64_t *v)				\
-{									\
-	s64 old, new;							\
-	CMPXCHG_BUGCHECK_DECL						\
-									\
-	do {								\
-		CMPXCHG_BUGCHECK(v);					\
-		old = arch_atomic64_read(v);				\
-		new = old c_op i;					\
-	} while (ia64_cmpxchg(acq, v, old, new, sizeof(atomic64_t)) != old); \
-	return old;							\
-}
-
-#define ATOMIC64_OPS(op, c_op)						\
-	ATOMIC64_OP(op, c_op)						\
-	ATOMIC64_FETCH_OP(op, c_op)
-
-ATOMIC64_OPS(add, +)
-ATOMIC64_OPS(sub, -)
-
-#define arch_atomic64_add_return(i,v)					\
-({									\
-	s64 __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter)	\
-		: ia64_atomic64_add(__ia64_aar_i, v);			\
-})
-
-#define arch_atomic64_sub_return(i,v)					\
-({									\
-	s64 __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter)	\
-		: ia64_atomic64_sub(__ia64_asr_i, v);			\
-})
-
-#define arch_atomic64_fetch_add(i,v)					\
-({									\
-	s64 __ia64_aar_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq)	\
-		: ia64_atomic64_fetch_add(__ia64_aar_i, v);		\
-})
-
-#define arch_atomic64_fetch_sub(i,v)					\
-({									\
-	s64 __ia64_asr_i = (i);						\
-	__ia64_atomic_const(i)						\
-		? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq)	\
-		: ia64_atomic64_fetch_sub(__ia64_asr_i, v);		\
-})
-
-ATOMIC64_FETCH_OP(and, &)
-ATOMIC64_FETCH_OP(or, |)
-ATOMIC64_FETCH_OP(xor, ^)
-
-#define arch_atomic64_and(i,v)	(void)ia64_atomic64_fetch_and(i,v)
-#define arch_atomic64_or(i,v)	(void)ia64_atomic64_fetch_or(i,v)
-#define arch_atomic64_xor(i,v)	(void)ia64_atomic64_fetch_xor(i,v)
-
-#define arch_atomic64_fetch_and(i,v)	ia64_atomic64_fetch_and(i,v)
-#define arch_atomic64_fetch_or(i,v)	ia64_atomic64_fetch_or(i,v)
-#define arch_atomic64_fetch_xor(i,v)	ia64_atomic64_fetch_xor(i,v)
-
-#undef ATOMIC64_OPS
-#undef ATOMIC64_FETCH_OP
-#undef ATOMIC64_OP
-
-#define arch_atomic_add(i,v)		(void)arch_atomic_add_return((i), (v))
-#define arch_atomic_sub(i,v)		(void)arch_atomic_sub_return((i), (v))
-
-#define arch_atomic64_add(i,v)		(void)arch_atomic64_add_return((i), (v))
-#define arch_atomic64_sub(i,v)		(void)arch_atomic64_sub_return((i), (v))
-
-#endif /* _ASM_IA64_ATOMIC_H */
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
deleted file mode 100644
index 751cdd353446..000000000000
--- a/arch/ia64/include/asm/barrier.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Memory barrier definitions.  This is based on information published
- * in the Processor Abstraction Layer and the System Abstraction Layer
- * manual.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-#ifndef _ASM_IA64_BARRIER_H
-#define _ASM_IA64_BARRIER_H
-
-#include <linux/compiler.h>
-
-/*
- * Macros to force memory ordering.  In these descriptions, "previous"
- * and "subsequent" refer to program order; "visible" means that all
- * architecturally visible effects of a memory access have occurred
- * (at a minimum, this means the memory has been read or written).
- *
- *   wmb():	Guarantees that all preceding stores to memory-
- *		like regions are visible before any subsequent
- *		stores and that all following stores will be
- *		visible only after all previous stores.
- *   rmb():	Like wmb(), but for reads.
- *   mb():	wmb()/rmb() combo, i.e., all previous memory
- *		accesses are visible before all subsequent
- *		accesses and vice versa.  This is also known as
- *		a "fence."
- *
- * Note: "mb()" and its variants cannot be used as a fence to order
- * accesses to memory mapped I/O registers.  For that, mf.a needs to
- * be used.  However, we don't want to always use mf.a because (a)
- * it's (presumably) much slower than mf and (b) mf.a is supported for
- * sequential memory pages only.
- */
-#define mb()		ia64_mf()
-#define rmb()		mb()
-#define wmb()		mb()
-
-#define dma_rmb()	mb()
-#define dma_wmb()	mb()
-
-# define __smp_mb()	mb()
-
-#define __smp_mb__before_atomic()	barrier()
-#define __smp_mb__after_atomic()	barrier()
-
-/*
- * IA64 GCC turns volatile stores into st.rel and volatile loads into ld.acq no
- * need for asm trickery!
- */
-
-#define __smp_store_release(p, v)						\
-do {									\
-	compiletime_assert_atomic_type(*p);				\
-	barrier();							\
-	WRITE_ONCE(*p, v);						\
-} while (0)
-
-#define __smp_load_acquire(p)						\
-({									\
-	typeof(*p) ___p1 = READ_ONCE(*p);				\
-	compiletime_assert_atomic_type(*p);				\
-	barrier();							\
-	___p1;								\
-})
-
-/*
- * The group barrier in front of the rsm & ssm are necessary to ensure
- * that none of the previous instructions in the same group are
- * affected by the rsm/ssm.
- */
-
-#include <asm-generic/barrier.h>
-
-#endif /* _ASM_IA64_BARRIER_H */
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
deleted file mode 100644
index 1accb7842f58..000000000000
--- a/arch/ia64/include/asm/bitops.h
+++ /dev/null
@@ -1,453 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_BITOPS_H
-#define _ASM_IA64_BITOPS_H
-
-/*
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64
- * O(1) scheduler patch
- */
-
-#ifndef _LINUX_BITOPS_H
-#error only <linux/bitops.h> can be included directly
-#endif
-
-#include <linux/compiler.h>
-#include <linux/types.h>
-#include <asm/intrinsics.h>
-#include <asm/barrier.h>
-
-/**
- * set_bit - Atomically set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * This function is atomic and may not be reordered.  See __set_bit()
- * if you do not require the atomic guarantees.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- *
- * The address must be (at least) "long" aligned.
- * Note that there are driver (e.g., eepro100) which use these operations to
- * operate on hw-defined data-structures, so we can't easily change these
- * operations to force a bigger alignment.
- *
- * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
- */
-static __inline__ void
-set_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = 1 << (nr & 31);
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old | bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-}
-
-/**
- * arch___set_bit - Set a bit in memory
- * @nr: the bit to set
- * @addr: the address to start counting from
- *
- * Unlike set_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___set_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	*((__u32 *) addr + (nr >> 5)) |= (1 << (nr & 31));
-}
-
-/**
- * clear_bit - Clears a bit in memory
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit() is atomic and may not be reordered.  However, it does
- * not contain a memory barrier, so if it is used for locking purposes,
- * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
- * in order to ensure changes are visible on other processors.
- */
-static __inline__ void
-clear_bit (int nr, volatile void *addr)
-{
-	__u32 mask, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old & mask;
-	} while (cmpxchg_acq(m, old, new) != old);
-}
-
-/**
- * clear_bit_unlock - Clears a bit in memory with release
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * clear_bit_unlock() is atomic and may not be reordered.  It does
- * contain a memory barrier suitable for unlock type operations.
- */
-static __inline__ void
-clear_bit_unlock (int nr, volatile void *addr)
-{
-	__u32 mask, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old & mask;
-	} while (cmpxchg_rel(m, old, new) != old);
-}
-
-/**
- * __clear_bit_unlock - Non-atomically clears a bit in memory with release
- * @nr: Bit to clear
- * @addr: Address to start counting from
- *
- * Similarly to clear_bit_unlock, the implementation uses a store
- * with release semantics. See also arch_spin_unlock().
- */
-static __inline__ void
-__clear_bit_unlock(int nr, void *addr)
-{
-	__u32 * const m = (__u32 *) addr + (nr >> 5);
-	__u32 const new = *m & ~(1 << (nr & 31));
-
-	ia64_st4_rel_nta(m, new);
-}
-
-/**
- * arch___clear_bit - Clears a bit in memory (non-atomic version)
- * @nr: the bit to clear
- * @addr: the address to start counting from
- *
- * Unlike clear_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	*((__u32 *) addr + (nr >> 5)) &= ~(1 << (nr & 31));
-}
-
-/**
- * change_bit - Toggle a bit in memory
- * @nr: Bit to toggle
- * @addr: Address to start counting from
- *
- * change_bit() is atomic and may not be reordered.
- * Note that @nr may be almost arbitrarily large; this function is not
- * restricted to acting on a single-word quantity.
- */
-static __inline__ void
-change_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = (1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old ^ bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-}
-
-/**
- * arch___change_bit - Toggle a bit in memory
- * @nr: the bit to toggle
- * @addr: the address to start counting from
- *
- * Unlike change_bit(), this function is non-atomic and may be reordered.
- * If it's called on the same region of memory simultaneously, the effect
- * may be that only one operation succeeds.
- */
-static __always_inline void
-arch___change_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	*((__u32 *) addr + (nr >> 5)) ^= (1 << (nr & 31));
-}
-
-/**
- * test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies the acquisition side of the memory barrier.
- */
-static __inline__ int
-test_and_set_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = 1 << (nr & 31);
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old | bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-	return (old & bit) != 0;
-}
-
-/**
- * test_and_set_bit_lock - Set a bit and return its old value for lock
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This is the same as test_and_set_bit on ia64
- */
-#define test_and_set_bit_lock test_and_set_bit
-
-/**
- * arch___test_and_set_bit - Set a bit and return its old value
- * @nr: Bit to set
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static __always_inline bool
-arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	__u32 *p = (__u32 *) addr + (nr >> 5);
-	__u32 m = 1 << (nr & 31);
-	int oldbitset = (*p & m) != 0;
-
-	*p |= m;
-	return oldbitset;
-}
-
-/**
- * test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies the acquisition side of the memory barrier.
- */
-static __inline__ int
-test_and_clear_bit (int nr, volatile void *addr)
-{
-	__u32 mask, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	mask = ~(1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old & mask;
-	} while (cmpxchg_acq(m, old, new) != old);
-	return (old & ~mask) != 0;
-}
-
-/**
- * arch___test_and_clear_bit - Clear a bit and return its old value
- * @nr: Bit to clear
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.  
- * If two examples of this operation race, one can appear to succeed
- * but actually fail.  You must protect multiple accesses with a lock.
- */
-static __always_inline bool
-arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	__u32 *p = (__u32 *) addr + (nr >> 5);
-	__u32 m = 1 << (nr & 31);
-	int oldbitset = (*p & m) != 0;
-
-	*p &= ~m;
-	return oldbitset;
-}
-
-/**
- * test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is atomic and cannot be reordered.  
- * It also implies the acquisition side of the memory barrier.
- */
-static __inline__ int
-test_and_change_bit (int nr, volatile void *addr)
-{
-	__u32 bit, old, new;
-	volatile __u32 *m;
-	CMPXCHG_BUGCHECK_DECL
-
-	m = (volatile __u32 *) addr + (nr >> 5);
-	bit = (1 << (nr & 31));
-	do {
-		CMPXCHG_BUGCHECK(m);
-		old = *m;
-		new = old ^ bit;
-	} while (cmpxchg_acq(m, old, new) != old);
-	return (old & bit) != 0;
-}
-
-/**
- * arch___test_and_change_bit - Change a bit and return its old value
- * @nr: Bit to change
- * @addr: Address to count from
- *
- * This operation is non-atomic and can be reordered.
- */
-static __always_inline bool
-arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
-{
-	__u32 old, bit = (1 << (nr & 31));
-	__u32 *m = (__u32 *) addr + (nr >> 5);
-
-	old = *m;
-	*m = old ^ bit;
-	return (old & bit) != 0;
-}
-
-#define arch_test_bit generic_test_bit
-#define arch_test_bit_acquire generic_test_bit_acquire
-
-/**
- * ffz - find the first zero bit in a long word
- * @x: The long word to find the bit in
- *
- * Returns the bit-number (0..63) of the first (least significant) zero bit.
- * Undefined if no zero exists, so code should check against ~0UL first...
- */
-static inline unsigned long
-ffz (unsigned long x)
-{
-	unsigned long result;
-
-	result = ia64_popcnt(x & (~x - 1));
-	return result;
-}
-
-/**
- * __ffs - find first bit in word.
- * @x: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static __inline__ unsigned long
-__ffs (unsigned long x)
-{
-	unsigned long result;
-
-	result = ia64_popcnt((x-1) & ~x);
-	return result;
-}
-
-#ifdef __KERNEL__
-
-/*
- * Return bit number of last (most-significant) bit set.  Undefined
- * for x==0.  Bits are numbered from 0..63 (e.g., ia64_fls(9) == 3).
- */
-static inline unsigned long
-ia64_fls (unsigned long x)
-{
-	long double d = x;
-	long exp;
-
-	exp = ia64_getf_exp(d);
-	return exp - 0xffff;
-}
-
-/*
- * Find the last (most significant) bit set.  Returns 0 for x==0 and
- * bits are numbered from 1..32 (e.g., fls(9) == 4).
- */
-static inline int fls(unsigned int t)
-{
-	unsigned long x = t & 0xffffffffu;
-
-	if (!x)
-		return 0;
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	return ia64_popcnt(x);
-}
-
-/*
- * Find the last (most significant) bit set.  Undefined for x==0.
- * Bits are numbered from 0..63 (e.g., __fls(9) == 3).
- */
-static inline unsigned long
-__fls (unsigned long x)
-{
-	x |= x >> 1;
-	x |= x >> 2;
-	x |= x >> 4;
-	x |= x >> 8;
-	x |= x >> 16;
-	x |= x >> 32;
-	return ia64_popcnt(x) - 1;
-}
-
-#include <asm-generic/bitops/fls64.h>
-
-#include <asm-generic/bitops/builtin-ffs.h>
-
-/*
- * hweightN: returns the hamming weight (i.e. the number
- * of bits set) of a N-bit word
- */
-static __inline__ unsigned long __arch_hweight64(unsigned long x)
-{
-	unsigned long result;
-	result = ia64_popcnt(x);
-	return result;
-}
-
-#define __arch_hweight32(x) ((unsigned int) __arch_hweight64((x) & 0xfffffffful))
-#define __arch_hweight16(x) ((unsigned int) __arch_hweight64((x) & 0xfffful))
-#define __arch_hweight8(x)  ((unsigned int) __arch_hweight64((x) & 0xfful))
-
-#include <asm-generic/bitops/const_hweight.h>
-
-#endif /* __KERNEL__ */
-
-#ifdef __KERNEL__
-
-#include <asm-generic/bitops/non-instrumented-non-atomic.h>
-
-#include <asm-generic/bitops/le.h>
-
-#include <asm-generic/bitops/ext2-atomic-setbit.h>
-
-#include <asm-generic/bitops/sched.h>
-
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_IA64_BITOPS_H */
diff --git a/arch/ia64/include/asm/bug.h b/arch/ia64/include/asm/bug.h
deleted file mode 100644
index 66b37a532765..000000000000
--- a/arch/ia64/include/asm/bug.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_BUG_H
-#define _ASM_IA64_BUG_H
-
-#ifdef CONFIG_BUG
-#define ia64_abort()	__builtin_trap()
-#define BUG() do {						\
-	printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__);	\
-	barrier_before_unreachable();				\
-	ia64_abort();						\
-} while (0)
-
-/* should this BUG be made generic? */
-#define HAVE_ARCH_BUG
-#endif
-
-#include <asm-generic/bug.h>
-
-#endif
diff --git a/arch/ia64/include/asm/cache.h b/arch/ia64/include/asm/cache.h
deleted file mode 100644
index 2f1c70647068..000000000000
--- a/arch/ia64/include/asm/cache.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CACHE_H
-#define _ASM_IA64_CACHE_H
-
-
-/*
- * Copyright (C) 1998-2000 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/* Bytes per L1 (data) cache line.  */
-#define L1_CACHE_SHIFT		CONFIG_IA64_L1_CACHE_SHIFT
-#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
-
-#ifdef CONFIG_SMP
-# define SMP_CACHE_SHIFT	L1_CACHE_SHIFT
-# define SMP_CACHE_BYTES	L1_CACHE_BYTES
-#else
-  /*
-   * The "aligned" directive can only _increase_ alignment, so this is
-   * safe and provides an easy way to avoid wasting space on a
-   * uni-processor:
-   */
-# define SMP_CACHE_SHIFT	3
-# define SMP_CACHE_BYTES	(1 << 3)
-#endif
-
-#define __read_mostly __section(".data..read_mostly")
-
-#endif /* _ASM_IA64_CACHE_H */
diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h
deleted file mode 100644
index eac493fa9e0d..000000000000
--- a/arch/ia64/include/asm/cacheflush.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CACHEFLUSH_H
-#define _ASM_IA64_CACHEFLUSH_H
-
-/*
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/page-flags.h>
-#include <linux/bitops.h>
-
-#include <asm/page.h>
-
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-static inline void flush_dcache_folio(struct folio *folio)
-{
-	clear_bit(PG_arch_1, &folio->flags);
-}
-#define flush_dcache_folio flush_dcache_folio
-
-static inline void flush_dcache_page(struct page *page)
-{
-	flush_dcache_folio(page_folio(page));
-}
-
-extern void flush_icache_range(unsigned long start, unsigned long end);
-#define flush_icache_range flush_icache_range
-extern void clflush_cache_range(void *addr, int size);
-
-#define flush_icache_user_page(vma, page, user_addr, len)					\
-do {												\
-	unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK);	\
-	flush_icache_range(_addr, _addr + (len));						\
-} while (0)
-
-#include <asm-generic/cacheflush.h>
-
-#endif /* _ASM_IA64_CACHEFLUSH_H */
diff --git a/arch/ia64/include/asm/checksum.h b/arch/ia64/include/asm/checksum.h
deleted file mode 100644
index f3026213aa32..000000000000
--- a/arch/ia64/include/asm/checksum.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CHECKSUM_H
-#define _ASM_IA64_CHECKSUM_H
-
-/*
- * Modified 1998, 1999
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-/*
- * This is a version of ip_compute_csum() optimized for IP headers,
- * which always checksum on 4 octet boundaries.
- */
-extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
-
-/*
- * Computes the checksum of the TCP/UDP pseudo-header returns a 16-bit
- * checksum, already complemented
- */
-extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
-				 __u32 len, __u8 proto, __wsum sum);
-
-extern __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
-				 __u32 len, __u8 proto, __wsum sum);
-
-/*
- * Computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-extern __wsum csum_partial(const void *buff, int len, __wsum sum);
-
-/*
- * This routine is used for miscellaneous IP-like checksums, mainly in
- * icmp.c
- */
-extern __sum16 ip_compute_csum(const void *buff, int len);
-
-/*
- * Fold a partial checksum without adding pseudo headers.
- */
-static inline __sum16 csum_fold(__wsum csum)
-{
-	u32 sum = (__force u32)csum;
-	sum = (sum & 0xffff) + (sum >> 16);
-	sum = (sum & 0xffff) + (sum >> 16);
-	return (__force __sum16)~sum;
-}
-
-#define _HAVE_ARCH_IPV6_CSUM	1
-struct in6_addr;
-extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
-			       const struct in6_addr *daddr,
-			       __u32 len, __u8 proto, __wsum csum);
-
-#endif /* _ASM_IA64_CHECKSUM_H */
diff --git a/arch/ia64/include/asm/clocksource.h b/arch/ia64/include/asm/clocksource.h
deleted file mode 100644
index 71a517751afa..000000000000
--- a/arch/ia64/include/asm/clocksource.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* IA64-specific clocksource additions */
-
-#ifndef _ASM_IA64_CLOCKSOURCE_H
-#define _ASM_IA64_CLOCKSOURCE_H
-
-struct arch_clocksource_data {
-	void *fsys_mmio;        /* used by fsyscall asm code */
-};
-
-#endif /* _ASM_IA64_CLOCKSOURCE_H */
diff --git a/arch/ia64/include/asm/cmpxchg.h b/arch/ia64/include/asm/cmpxchg.h
deleted file mode 100644
index d85ee1a0a227..000000000000
--- a/arch/ia64/include/asm/cmpxchg.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CMPXCHG_H
-#define _ASM_IA64_CMPXCHG_H
-
-#include <uapi/asm/cmpxchg.h>
-
-#define arch_xchg(ptr, x)	\
-({(__typeof__(*(ptr))) __arch_xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
-
-#define arch_cmpxchg(ptr, o, n)		cmpxchg_acq((ptr), (o), (n))
-#define arch_cmpxchg64(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))
-
-#define arch_cmpxchg_local		arch_cmpxchg
-#define arch_cmpxchg64_local		arch_cmpxchg64
-
-#ifdef CONFIG_IA64_DEBUG_CMPXCHG
-# define CMPXCHG_BUGCHECK_DECL	int _cmpxchg_bugcheck_count = 128;
-# define CMPXCHG_BUGCHECK(v)						\
-do {									\
-	if (_cmpxchg_bugcheck_count-- <= 0) {				\
-		void *ip;						\
-		extern int _printk(const char *fmt, ...);		\
-		ip = (void *) ia64_getreg(_IA64_REG_IP);		\
-		_printk("CMPXCHG_BUGCHECK: stuck at %p on word %p\n", ip, (v));\
-		break;							\
-	}								\
-} while (0)
-#else /* !CONFIG_IA64_DEBUG_CMPXCHG */
-# define CMPXCHG_BUGCHECK_DECL
-# define CMPXCHG_BUGCHECK(v)
-#endif /* !CONFIG_IA64_DEBUG_CMPXCHG */
-
-#endif /* _ASM_IA64_CMPXCHG_H */
diff --git a/arch/ia64/include/asm/cpu.h b/arch/ia64/include/asm/cpu.h
deleted file mode 100644
index db125df9e088..000000000000
--- a/arch/ia64/include/asm/cpu.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CPU_H_
-#define _ASM_IA64_CPU_H_
-
-#include <linux/device.h>
-#include <linux/cpu.h>
-#include <linux/topology.h>
-#include <linux/percpu.h>
-
-struct ia64_cpu {
-	struct cpu cpu;
-};
-
-DECLARE_PER_CPU(struct ia64_cpu, cpu_devices);
-
-DECLARE_PER_CPU(int, cpu_state);
-
-#ifdef CONFIG_HOTPLUG_CPU
-extern int arch_register_cpu(int num);
-extern void arch_unregister_cpu(int);
-#endif
-
-#endif /* _ASM_IA64_CPU_H_ */
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
deleted file mode 100644
index 7f28c3564d5d..000000000000
--- a/arch/ia64/include/asm/cputime.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Definitions for measuring cputime on ia64 machines.
- *
- * Based on <asm-powerpc/cputime.h>.
- *
- * Copyright (C) 2007 FUJITSU LIMITED
- * Copyright (C) 2007 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- *
- * If we have CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, we measure cpu time in nsec.
- * Otherwise we measure cpu time in jiffies using the generic definitions.
- */
-
-#ifndef __IA64_CPUTIME_H
-#define __IA64_CPUTIME_H
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-extern void arch_vtime_task_switch(struct task_struct *tsk);
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-#endif /* __IA64_CPUTIME_H */
diff --git a/arch/ia64/include/asm/current.h b/arch/ia64/include/asm/current.h
deleted file mode 100644
index 86fbcc88dff2..000000000000
--- a/arch/ia64/include/asm/current.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_CURRENT_H
-#define _ASM_IA64_CURRENT_H
-
-/*
- * Modified 1998-2000
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-#include <asm/intrinsics.h>
-
-/*
- * In kernel mode, thread pointer (r13) is used to point to the current task
- * structure.
- */
-#define current	((struct task_struct *) ia64_getreg(_IA64_REG_TP))
-
-#endif /* _ASM_IA64_CURRENT_H */
diff --git a/arch/ia64/include/asm/cyclone.h b/arch/ia64/include/asm/cyclone.h
deleted file mode 100644
index a481393647e9..000000000000
--- a/arch/ia64/include/asm/cyclone.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_IA64_CYCLONE_H
-#define ASM_IA64_CYCLONE_H
-
-#ifdef	CONFIG_IA64_CYCLONE
-extern int use_cyclone;
-extern void __init cyclone_setup(void);
-#else	/* CONFIG_IA64_CYCLONE */
-#define use_cyclone 0
-static inline void cyclone_setup(void)
-{
-	printk(KERN_ERR "Cyclone Counter: System not configured"
-					" w/ CONFIG_IA64_CYCLONE.\n");
-}
-#endif	/* CONFIG_IA64_CYCLONE */
-#endif	/* !ASM_IA64_CYCLONE_H */
diff --git a/arch/ia64/include/asm/delay.h b/arch/ia64/include/asm/delay.h
deleted file mode 100644
index 0227ac586107..000000000000
--- a/arch/ia64/include/asm/delay.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_DELAY_H
-#define _ASM_IA64_DELAY_H
-
-/*
- * Delay routines using a pre-computed "cycles/usec" value.
- *
- * Copyright (C) 1998, 1999 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/compiler.h>
-
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-
-static __inline__ void
-ia64_set_itm (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_ITM, val);
-	ia64_srlz_d();
-}
-
-static __inline__ unsigned long
-ia64_get_itm (void)
-{
-	unsigned long result;
-
-	result = ia64_getreg(_IA64_REG_CR_ITM);
-	ia64_srlz_d();
-	return result;
-}
-
-static __inline__ void
-ia64_set_itv (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_ITV, val);
-	ia64_srlz_d();
-}
-
-static __inline__ unsigned long
-ia64_get_itv (void)
-{
-	return ia64_getreg(_IA64_REG_CR_ITV);
-}
-
-static __inline__ void
-ia64_set_itc (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_AR_ITC, val);
-	ia64_srlz_d();
-}
-
-static __inline__ unsigned long
-ia64_get_itc (void)
-{
-	unsigned long result;
-
-	result = ia64_getreg(_IA64_REG_AR_ITC);
-	ia64_barrier();
-#ifdef CONFIG_ITANIUM
-	while (unlikely((__s32) result == -1)) {
-		result = ia64_getreg(_IA64_REG_AR_ITC);
-		ia64_barrier();
-	}
-#endif
-	return result;
-}
-
-extern void ia64_delay_loop (unsigned long loops);
-
-static __inline__ void
-__delay (unsigned long loops)
-{
-	if (unlikely(loops < 1))
-		return;
-
-	ia64_delay_loop (loops - 1);
-}
-
-extern void udelay (unsigned long usecs);
-
-#endif /* _ASM_IA64_DELAY_H */
diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h
deleted file mode 100644
index 918b198cd5bb..000000000000
--- a/arch/ia64/include/asm/device.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Arch specific extensions to struct device
- */
-#ifndef _ASM_IA64_DEVICE_H
-#define _ASM_IA64_DEVICE_H
-
-struct dev_archdata {
-};
-
-struct pdev_archdata {
-};
-
-#endif /* _ASM_IA64_DEVICE_H */
diff --git a/arch/ia64/include/asm/div64.h b/arch/ia64/include/asm/div64.h
deleted file mode 100644
index 6cd978cefb28..000000000000
--- a/arch/ia64/include/asm/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
deleted file mode 100644
index af6fa8e1597c..000000000000
--- a/arch/ia64/include/asm/dma-mapping.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_DMA_MAPPING_H
-#define _ASM_IA64_DMA_MAPPING_H
-
-/*
- * Copyright (C) 2003-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-extern const struct dma_map_ops *dma_ops;
-
-static inline const struct dma_map_ops *get_arch_dma_ops(void)
-{
-	return dma_ops;
-}
-
-#endif /* _ASM_IA64_DMA_MAPPING_H */
diff --git a/arch/ia64/include/asm/dma.h b/arch/ia64/include/asm/dma.h
deleted file mode 100644
index eaed2626ffda..000000000000
--- a/arch/ia64/include/asm/dma.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_DMA_H
-#define _ASM_IA64_DMA_H
-
-/*
- * Copyright (C) 1998-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/io.h>		/* need byte IO */
-
-extern unsigned long MAX_DMA_ADDRESS;
-
-#define free_dma(x)
-
-#endif /* _ASM_IA64_DMA_H */
diff --git a/arch/ia64/include/asm/dmi.h b/arch/ia64/include/asm/dmi.h
deleted file mode 100644
index ecd9e0a0f5f9..000000000000
--- a/arch/ia64/include/asm/dmi.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_DMI_H
-#define _ASM_DMI_H 1
-
-#include <linux/slab.h>
-#include <asm/io.h>
-
-/* Use normal IO mappings for DMI */
-#define dmi_early_remap		ioremap
-#define dmi_early_unmap(x, l)	iounmap(x)
-#define dmi_remap		ioremap
-#define dmi_unmap		iounmap
-#define dmi_alloc(l)		kzalloc(l, GFP_ATOMIC)
-
-#endif
diff --git a/arch/ia64/include/asm/early_ioremap.h b/arch/ia64/include/asm/early_ioremap.h
deleted file mode 100644
index 934191b1e2e3..000000000000
--- a/arch/ia64/include/asm/early_ioremap.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_EARLY_IOREMAP_H
-#define _ASM_IA64_EARLY_IOREMAP_H
-
-extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size);
-#define early_memremap(phys_addr, size)        early_ioremap(phys_addr, size)
-
-extern void early_iounmap (volatile void __iomem *addr, unsigned long size);
-#define early_memunmap(addr, size)             early_iounmap(addr, size)
-
-#endif
diff --git a/arch/ia64/include/asm/efi.h b/arch/ia64/include/asm/efi.h
deleted file mode 100644
index 6a4a50d8f19a..000000000000
--- a/arch/ia64/include/asm/efi.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_EFI_H
-#define _ASM_EFI_H
-
-typedef int (*efi_freemem_callback_t) (u64 start, u64 end, void *arg);
-
-void *efi_get_pal_addr(void);
-void efi_map_pal_code(void);
-void efi_memmap_walk(efi_freemem_callback_t, void *);
-void efi_memmap_walk_uc(efi_freemem_callback_t, void *);
-void efi_gettimeofday(struct timespec64 *ts);
-
-#endif
diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h
deleted file mode 100644
index 2ef5f9966ad1..000000000000
--- a/arch/ia64/include/asm/elf.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_ELF_H
-#define _ASM_IA64_ELF_H
-
-/*
- * ELF-specific definitions.
- *
- * Copyright (C) 1998-1999, 2002-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/fpu.h>
-#include <asm/page.h>
-#include <asm/auxvec.h>
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x) ((x)->e_machine == EM_IA_64)
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS	ELFCLASS64
-#define ELF_DATA	ELFDATA2LSB
-#define ELF_ARCH	EM_IA_64
-
-#define CORE_DUMP_USE_REGSET
-
-/* Least-significant four bits of ELF header's e_flags are OS-specific.  The bits are
-   interpreted as follows by Linux: */
-#define EF_IA_64_LINUX_EXECUTABLE_STACK	0x1	/* is stack (& heap) executable by default? */
-
-#define ELF_EXEC_PAGESIZE	PAGE_SIZE
-
-/*
- * This is the location that an ET_DYN program is loaded if exec'ed.
- * Typical use of this is to invoke "./ld.so someprog" to test out a
- * new version of the loader.  We need to make sure that it is out of
- * the way of the program that it will "exec", and that there is
- * sufficient room for the brk.
- */
-#define ELF_ET_DYN_BASE		(TASK_UNMAPPED_BASE + 0x800000000UL)
-
-#define PT_IA_64_UNWIND		0x70000001
-
-/* IA-64 relocations: */
-#define R_IA64_NONE		0x00	/* none */
-#define R_IA64_IMM14		0x21	/* symbol + addend, add imm14 */
-#define R_IA64_IMM22		0x22	/* symbol + addend, add imm22 */
-#define R_IA64_IMM64		0x23	/* symbol + addend, mov imm64 */
-#define R_IA64_DIR32MSB		0x24	/* symbol + addend, data4 MSB */
-#define R_IA64_DIR32LSB		0x25	/* symbol + addend, data4 LSB */
-#define R_IA64_DIR64MSB		0x26	/* symbol + addend, data8 MSB */
-#define R_IA64_DIR64LSB		0x27	/* symbol + addend, data8 LSB */
-#define R_IA64_GPREL22		0x2a	/* @gprel(sym+add), add imm22 */
-#define R_IA64_GPREL64I		0x2b	/* @gprel(sym+add), mov imm64 */
-#define R_IA64_GPREL32MSB	0x2c	/* @gprel(sym+add), data4 MSB */
-#define R_IA64_GPREL32LSB	0x2d	/* @gprel(sym+add), data4 LSB */
-#define R_IA64_GPREL64MSB	0x2e	/* @gprel(sym+add), data8 MSB */
-#define R_IA64_GPREL64LSB	0x2f	/* @gprel(sym+add), data8 LSB */
-#define R_IA64_LTOFF22		0x32	/* @ltoff(sym+add), add imm22 */
-#define R_IA64_LTOFF64I		0x33	/* @ltoff(sym+add), mov imm64 */
-#define R_IA64_PLTOFF22		0x3a	/* @pltoff(sym+add), add imm22 */
-#define R_IA64_PLTOFF64I	0x3b	/* @pltoff(sym+add), mov imm64 */
-#define R_IA64_PLTOFF64MSB	0x3e	/* @pltoff(sym+add), data8 MSB */
-#define R_IA64_PLTOFF64LSB	0x3f	/* @pltoff(sym+add), data8 LSB */
-#define R_IA64_FPTR64I		0x43	/* @fptr(sym+add), mov imm64 */
-#define R_IA64_FPTR32MSB	0x44	/* @fptr(sym+add), data4 MSB */
-#define R_IA64_FPTR32LSB	0x45	/* @fptr(sym+add), data4 LSB */
-#define R_IA64_FPTR64MSB	0x46	/* @fptr(sym+add), data8 MSB */
-#define R_IA64_FPTR64LSB	0x47	/* @fptr(sym+add), data8 LSB */
-#define R_IA64_PCREL60B		0x48	/* @pcrel(sym+add), brl */
-#define R_IA64_PCREL21B		0x49	/* @pcrel(sym+add), ptb, call */
-#define R_IA64_PCREL21M		0x4a	/* @pcrel(sym+add), chk.s */
-#define R_IA64_PCREL21F		0x4b	/* @pcrel(sym+add), fchkf */
-#define R_IA64_PCREL32MSB	0x4c	/* @pcrel(sym+add), data4 MSB */
-#define R_IA64_PCREL32LSB	0x4d	/* @pcrel(sym+add), data4 LSB */
-#define R_IA64_PCREL64MSB	0x4e	/* @pcrel(sym+add), data8 MSB */
-#define R_IA64_PCREL64LSB	0x4f	/* @pcrel(sym+add), data8 LSB */
-#define R_IA64_LTOFF_FPTR22	0x52	/* @ltoff(@fptr(s+a)), imm22 */
-#define R_IA64_LTOFF_FPTR64I	0x53	/* @ltoff(@fptr(s+a)), imm64 */
-#define R_IA64_LTOFF_FPTR32MSB	0x54	/* @ltoff(@fptr(s+a)), 4 MSB */
-#define R_IA64_LTOFF_FPTR32LSB	0x55	/* @ltoff(@fptr(s+a)), 4 LSB */
-#define R_IA64_LTOFF_FPTR64MSB	0x56	/* @ltoff(@fptr(s+a)), 8 MSB */
-#define R_IA64_LTOFF_FPTR64LSB	0x57	/* @ltoff(@fptr(s+a)), 8 LSB */
-#define R_IA64_SEGREL32MSB	0x5c	/* @segrel(sym+add), data4 MSB */
-#define R_IA64_SEGREL32LSB	0x5d	/* @segrel(sym+add), data4 LSB */
-#define R_IA64_SEGREL64MSB	0x5e	/* @segrel(sym+add), data8 MSB */
-#define R_IA64_SEGREL64LSB	0x5f	/* @segrel(sym+add), data8 LSB */
-#define R_IA64_SECREL32MSB	0x64	/* @secrel(sym+add), data4 MSB */
-#define R_IA64_SECREL32LSB	0x65	/* @secrel(sym+add), data4 LSB */
-#define R_IA64_SECREL64MSB	0x66	/* @secrel(sym+add), data8 MSB */
-#define R_IA64_SECREL64LSB	0x67	/* @secrel(sym+add), data8 LSB */
-#define R_IA64_REL32MSB		0x6c	/* data 4 + REL */
-#define R_IA64_REL32LSB		0x6d	/* data 4 + REL */
-#define R_IA64_REL64MSB		0x6e	/* data 8 + REL */
-#define R_IA64_REL64LSB		0x6f	/* data 8 + REL */
-#define R_IA64_LTV32MSB		0x74	/* symbol + addend, data4 MSB */
-#define R_IA64_LTV32LSB		0x75	/* symbol + addend, data4 LSB */
-#define R_IA64_LTV64MSB		0x76	/* symbol + addend, data8 MSB */
-#define R_IA64_LTV64LSB		0x77	/* symbol + addend, data8 LSB */
-#define R_IA64_PCREL21BI	0x79	/* @pcrel(sym+add), ptb, call */
-#define R_IA64_PCREL22		0x7a	/* @pcrel(sym+add), imm22 */
-#define R_IA64_PCREL64I		0x7b	/* @pcrel(sym+add), imm64 */
-#define R_IA64_IPLTMSB		0x80	/* dynamic reloc, imported PLT, MSB */
-#define R_IA64_IPLTLSB		0x81	/* dynamic reloc, imported PLT, LSB */
-#define R_IA64_COPY		0x84	/* dynamic reloc, data copy */
-#define R_IA64_SUB		0x85	/* -symbol + addend, add imm22 */
-#define R_IA64_LTOFF22X		0x86	/* LTOFF22, relaxable.  */
-#define R_IA64_LDXMOV		0x87	/* Use of LTOFF22X.  */
-#define R_IA64_TPREL14		0x91	/* @tprel(sym+add), add imm14 */
-#define R_IA64_TPREL22		0x92	/* @tprel(sym+add), add imm22 */
-#define R_IA64_TPREL64I		0x93	/* @tprel(sym+add), add imm64 */
-#define R_IA64_TPREL64MSB	0x96	/* @tprel(sym+add), data8 MSB */
-#define R_IA64_TPREL64LSB	0x97	/* @tprel(sym+add), data8 LSB */
-#define R_IA64_LTOFF_TPREL22	0x9a	/* @ltoff(@tprel(s+a)), add imm22 */
-#define R_IA64_DTPMOD64MSB	0xa6	/* @dtpmod(sym+add), data8 MSB */
-#define R_IA64_DTPMOD64LSB	0xa7	/* @dtpmod(sym+add), data8 LSB */
-#define R_IA64_LTOFF_DTPMOD22	0xaa	/* @ltoff(@dtpmod(s+a)), imm22 */
-#define R_IA64_DTPREL14		0xb1	/* @dtprel(sym+add), imm14 */
-#define R_IA64_DTPREL22		0xb2	/* @dtprel(sym+add), imm22 */
-#define R_IA64_DTPREL64I	0xb3	/* @dtprel(sym+add), imm64 */
-#define R_IA64_DTPREL32MSB	0xb4	/* @dtprel(sym+add), data4 MSB */
-#define R_IA64_DTPREL32LSB	0xb5	/* @dtprel(sym+add), data4 LSB */
-#define R_IA64_DTPREL64MSB	0xb6	/* @dtprel(sym+add), data8 MSB */
-#define R_IA64_DTPREL64LSB	0xb7	/* @dtprel(sym+add), data8 LSB */
-#define R_IA64_LTOFF_DTPREL22	0xba	/* @ltoff(@dtprel(s+a)), imm22 */
-
-/* IA-64 specific section flags: */
-#define SHF_IA_64_SHORT		0x10000000	/* section near gp */
-
-/*
- * We use (abuse?) this macro to insert the (empty) vm_area that is
- * used to map the register backing store.  I don't see any better
- * place to do this, but we should discuss this with Linus once we can
- * talk to him...
- */
-extern void ia64_init_addr_space (void);
-#define ELF_PLAT_INIT(_r, load_addr)	ia64_init_addr_space()
-
-/* ELF register definitions.  This is needed for core dump support.  */
-
-/*
- * elf_gregset_t contains the application-level state in the following order:
- *	r0-r31
- *	NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
- *	predicate registers (p0-p63)
- *	b0-b7
- *	ip cfm psr
- *	ar.rsc ar.bsp ar.bspstore ar.rnat
- *	ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd
- */
-#define ELF_NGREG	128	/* we really need just 72 but let's leave some headroom... */
-#define ELF_NFPREG	128	/* f0 and f1 could be omitted, but so what... */
-
-/* elf_gregset_t register offsets */
-#define ELF_GR_0_OFFSET     0
-#define ELF_NAT_OFFSET     (32 * sizeof(elf_greg_t))
-#define ELF_PR_OFFSET      (33 * sizeof(elf_greg_t))
-#define ELF_BR_0_OFFSET    (34 * sizeof(elf_greg_t))
-#define ELF_CR_IIP_OFFSET  (42 * sizeof(elf_greg_t))
-#define ELF_CFM_OFFSET     (43 * sizeof(elf_greg_t))
-#define ELF_CR_IPSR_OFFSET (44 * sizeof(elf_greg_t))
-#define ELF_GR_OFFSET(i)   (ELF_GR_0_OFFSET + i * sizeof(elf_greg_t))
-#define ELF_BR_OFFSET(i)   (ELF_BR_0_OFFSET + i * sizeof(elf_greg_t))
-#define ELF_AR_RSC_OFFSET  (45 * sizeof(elf_greg_t))
-#define ELF_AR_BSP_OFFSET  (46 * sizeof(elf_greg_t))
-#define ELF_AR_BSPSTORE_OFFSET (47 * sizeof(elf_greg_t))
-#define ELF_AR_RNAT_OFFSET (48 * sizeof(elf_greg_t))
-#define ELF_AR_CCV_OFFSET  (49 * sizeof(elf_greg_t))
-#define ELF_AR_UNAT_OFFSET (50 * sizeof(elf_greg_t))
-#define ELF_AR_FPSR_OFFSET (51 * sizeof(elf_greg_t))
-#define ELF_AR_PFS_OFFSET  (52 * sizeof(elf_greg_t))
-#define ELF_AR_LC_OFFSET   (53 * sizeof(elf_greg_t))
-#define ELF_AR_EC_OFFSET   (54 * sizeof(elf_greg_t))
-#define ELF_AR_CSD_OFFSET  (55 * sizeof(elf_greg_t))
-#define ELF_AR_SSD_OFFSET  (56 * sizeof(elf_greg_t))
-#define ELF_AR_END_OFFSET  (57 * sizeof(elf_greg_t))
-
-typedef unsigned long elf_greg_t;
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct ia64_fpreg elf_fpreg_t;
-typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
-
-
-
-struct pt_regs;	/* forward declaration... */
-extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst);
-#define ELF_CORE_COPY_REGS(_dest,_regs)	ia64_elf_core_copy_regs(_regs, _dest);
-
-/* This macro yields a bitmask that programs can use to figure out
-   what instruction set this CPU supports.  */
-#define ELF_HWCAP 	0
-
-/* This macro yields a string that ld.so will use to load
-   implementation specific libraries for optimization.  Not terribly
-   relevant until we have real hardware to play with... */
-#define ELF_PLATFORM	NULL
-
-#define elf_read_implies_exec(ex, executable_stack)					\
-	((executable_stack!=EXSTACK_DISABLE_X) && ((ex).e_flags & EF_IA_64_LINUX_EXECUTABLE_STACK) != 0)
-
-struct task_struct;
-
-#define GATE_EHDR	((const struct elfhdr *) GATE_ADDR)
-
-/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
-#define ARCH_DLINFO								\
-do {										\
-	extern char __kernel_syscall_via_epc[];					\
-	NEW_AUX_ENT(AT_SYSINFO, (unsigned long) __kernel_syscall_via_epc);	\
-	NEW_AUX_ENT(AT_SYSINFO_EHDR, (unsigned long) GATE_EHDR);		\
-} while (0)
-
-/*
- * format for entries in the Global Offset Table
- */
-struct got_entry {
-	uint64_t val;
-};
-
-/*
- * Layout of the Function Descriptor
- */
-struct fdesc {
-	uint64_t addr;
-	uint64_t gp;
-};
-
-#endif /* _ASM_IA64_ELF_H */
diff --git a/arch/ia64/include/asm/emergency-restart.h b/arch/ia64/include/asm/emergency-restart.h
deleted file mode 100644
index 108d8c48e42e..000000000000
--- a/arch/ia64/include/asm/emergency-restart.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_EMERGENCY_RESTART_H
-#define _ASM_EMERGENCY_RESTART_H
-
-#include <asm-generic/emergency-restart.h>
-
-#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/arch/ia64/include/asm/esi.h b/arch/ia64/include/asm/esi.h
deleted file mode 100644
index 56d1310af06e..000000000000
--- a/arch/ia64/include/asm/esi.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * ESI service calls.
- *
- * Copyright (c) Copyright 2005-2006 Hewlett-Packard Development Company, L.P.
- * 	Alex Williamson <alex.williamson@hp.com>
- */
-#ifndef esi_h
-#define esi_h
-
-#include <linux/efi.h>
-
-#define ESI_QUERY			0x00000001
-#define ESI_OPEN_HANDLE			0x02000000
-#define ESI_CLOSE_HANDLE		0x02000001
-
-enum esi_proc_type {
-	ESI_PROC_SERIALIZED,	/* calls need to be serialized */
-	ESI_PROC_MP_SAFE,	/* MP-safe, but not reentrant */
-	ESI_PROC_REENTRANT	/* MP-safe and reentrant */
-};
-
-extern struct ia64_sal_retval esi_call_phys (void *, u64 *);
-extern int ia64_esi_call(efi_guid_t, struct ia64_sal_retval *,
-			 enum esi_proc_type,
-			 u64, u64, u64, u64, u64, u64, u64, u64);
-extern int ia64_esi_call_phys(efi_guid_t, struct ia64_sal_retval *, u64, u64,
-                              u64, u64, u64, u64, u64, u64);
-
-#endif /* esi_h */
diff --git a/arch/ia64/include/asm/exception.h b/arch/ia64/include/asm/exception.h
deleted file mode 100644
index 1d5df8116a31..000000000000
--- a/arch/ia64/include/asm/exception.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef __ASM_EXCEPTION_H
-#define __ASM_EXCEPTION_H
-
-struct pt_regs;
-struct exception_table_entry;
-
-extern void ia64_handle_exception(struct pt_regs *regs,
-				  const struct exception_table_entry *e);
-
-#define ia64_done_with_exception(regs)					  \
-({									  \
-	int __ex_ret = 0;						  \
-	const struct exception_table_entry *e;				  \
-	e = search_exception_tables((regs)->cr_iip + ia64_psr(regs)->ri); \
-	if (e) {							  \
-		ia64_handle_exception(regs, e);				  \
-		__ex_ret = 1;						  \
-	}								  \
-	__ex_ret;							  \
-})
-
-#endif	/* __ASM_EXCEPTION_H */
diff --git a/arch/ia64/include/asm/extable.h b/arch/ia64/include/asm/extable.h
deleted file mode 100644
index 83eac6aa0639..000000000000
--- a/arch/ia64/include/asm/extable.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_EXTABLE_H
-#define _ASM_IA64_EXTABLE_H
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-struct exception_table_entry {
-	int insn;	/* location-relative address of insn this fixup is for */
-	int fixup;	/* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
-#endif
diff --git a/arch/ia64/include/asm/fb.h b/arch/ia64/include/asm/fb.h
deleted file mode 100644
index 1717b26fd423..000000000000
--- a/arch/ia64/include/asm/fb.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <linux/compiler.h>
-#include <linux/efi.h>
-#include <linux/string.h>
-
-#include <asm/page.h>
-
-struct file;
-
-static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
-				unsigned long off)
-{
-	if (efi_range_is_wc(vma->vm_start, vma->vm_end - vma->vm_start))
-		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-	else
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-}
-#define fb_pgprotect fb_pgprotect
-
-static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
-{
-	memcpy(to, (void __force *)from, n);
-}
-#define fb_memcpy_fromio fb_memcpy_fromio
-
-static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
-{
-	memcpy((void __force *)to, from, n);
-}
-#define fb_memcpy_toio fb_memcpy_toio
-
-static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
-{
-	memset((void __force *)addr, c, n);
-}
-#define fb_memset fb_memset_io
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/ia64/include/asm/fpswa.h b/arch/ia64/include/asm/fpswa.h
deleted file mode 100644
index 2a0c23728b26..000000000000
--- a/arch/ia64/include/asm/fpswa.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_FPSWA_H
-#define _ASM_IA64_FPSWA_H
-
-/*
- * Floating-point Software Assist
- *
- * Copyright (C) 1999 Intel Corporation.
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Goutham Rao <goutham.rao@intel.com>
- */
-
-typedef struct {
-	/* 4 * 128 bits */
-	unsigned long fp_lp[4*2];
-} fp_state_low_preserved_t;
-
-typedef struct {
-	/* 10 * 128 bits */
-	unsigned long fp_lv[10 * 2];
-} fp_state_low_volatile_t;
-
-typedef	struct {
-	/* 16 * 128 bits */
-	unsigned long fp_hp[16 * 2];
-} fp_state_high_preserved_t;
-
-typedef struct {
-	/* 96 * 128 bits */
-	unsigned long fp_hv[96 * 2];
-} fp_state_high_volatile_t;
-
-/**
- * floating point state to be passed to the FP emulation library by
- * the trap/fault handler
- */
-typedef struct {
-	unsigned long			bitmask_low64;
-	unsigned long			bitmask_high64;
-	fp_state_low_preserved_t	*fp_state_low_preserved;
-	fp_state_low_volatile_t		*fp_state_low_volatile;
-	fp_state_high_preserved_t	*fp_state_high_preserved;
-	fp_state_high_volatile_t	*fp_state_high_volatile;
-} fp_state_t;
-
-typedef struct {
-	unsigned long status;
-	unsigned long err0;
-	unsigned long err1;
-	unsigned long err2;
-} fpswa_ret_t;
-
-/**
- * function header for the Floating Point software assist
- * library. This function is invoked by the Floating point software
- * assist trap/fault handler.
- */
-typedef fpswa_ret_t (*efi_fpswa_t) (unsigned long trap_type, void *bundle, unsigned long *ipsr,
-				    unsigned long *fsr, unsigned long *isr, unsigned long *preds,
-				    unsigned long *ifs, fp_state_t *fp_state);
-
-/**
- * This is the FPSWA library interface as defined by EFI.  We need to pass a 
- * pointer to the interface itself on a call to the assist library
- */
-typedef struct {
-	unsigned int	 revision;
-	unsigned int	 reserved;
-	efi_fpswa_t	 fpswa;
-} fpswa_interface_t;
-
-extern fpswa_interface_t *fpswa_interface;
-
-#endif /* _ASM_IA64_FPSWA_H */
diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h
deleted file mode 100644
index a07a8e575453..000000000000
--- a/arch/ia64/include/asm/ftrace.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_FTRACE_H
-#define _ASM_IA64_FTRACE_H
-
-#ifdef CONFIG_FUNCTION_TRACER
-#define MCOUNT_INSN_SIZE        32 /* sizeof mcount call */
-
-#ifndef __ASSEMBLY__
-extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0);
-#define mcount _mcount
-
-/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */
-#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip)
-#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip)
-
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
-	/* second bundle, insn 2 */
-	return addr - 0x12;
-}
-
-struct dyn_arch_ftrace {
-};
-#endif
-
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#endif /* _ASM_IA64_FTRACE_H */
diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h
deleted file mode 100644
index 1db26b432d8c..000000000000
--- a/arch/ia64/include/asm/futex.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FUTEX_H
-#define _ASM_FUTEX_H
-
-#include <linux/futex.h>
-#include <linux/uaccess.h>
-#include <asm/errno.h>
-
-#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
-do {									\
-	register unsigned long r8 __asm ("r8") = 0;			\
-	__asm__ __volatile__(						\
-		"	mf;;					\n"	\
-		"[1:] "	insn ";;				\n"	\
-		"	.xdata4 \"__ex_table\", 1b-., 2f-.	\n"	\
-		"[2:]"							\
-		: "+r" (r8), "=r" (oldval)				\
-		: "r" (uaddr), "r" (oparg)				\
-		: "memory");						\
-	ret = r8;							\
-} while (0)
-
-#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
-do {									\
-	register unsigned long r8 __asm ("r8") = 0;			\
-	int val, newval;						\
-	do {								\
-		__asm__ __volatile__(					\
-			"	mf;;				  \n"	\
-			"[1:]	ld4 %3=[%4];;			  \n"	\
-			"	mov %2=%3			  \n"	\
-				insn	";;			  \n"	\
-			"	mov ar.ccv=%2;;			  \n"	\
-			"[2:]	cmpxchg4.acq %1=[%4],%3,ar.ccv;;  \n"	\
-			"	.xdata4 \"__ex_table\", 1b-., 3f-.\n"	\
-			"	.xdata4 \"__ex_table\", 2b-., 3f-.\n"	\
-			"[3:]"						\
-			: "+r" (r8), "=r" (val), "=&r" (oldval),	\
-			   "=&r" (newval)				\
-			: "r" (uaddr), "r" (oparg)			\
-			: "memory");					\
-		if (unlikely (r8))					\
-			break;						\
-	} while (unlikely (val != oldval));				\
-	ret = r8;							\
-} while (0)
-
-static inline int
-arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
-{
-	int oldval = 0, ret;
-
-	if (!access_ok(uaddr, sizeof(u32)))
-		return -EFAULT;
-
-	switch (op) {
-	case FUTEX_OP_SET:
-		__futex_atomic_op1("xchg4 %1=[%2],%3", ret, oldval, uaddr,
-				   oparg);
-		break;
-	case FUTEX_OP_ADD:
-		__futex_atomic_op2("add %3=%3,%5", ret, oldval, uaddr, oparg);
-		break;
-	case FUTEX_OP_OR:
-		__futex_atomic_op2("or %3=%3,%5", ret, oldval, uaddr, oparg);
-		break;
-	case FUTEX_OP_ANDN:
-		__futex_atomic_op2("and %3=%3,%5", ret, oldval, uaddr,
-				   ~oparg);
-		break;
-	case FUTEX_OP_XOR:
-		__futex_atomic_op2("xor %3=%3,%5", ret, oldval, uaddr, oparg);
-		break;
-	default:
-		ret = -ENOSYS;
-	}
-
-	if (!ret)
-		*oval = oldval;
-
-	return ret;
-}
-
-static inline int
-futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
-			      u32 oldval, u32 newval)
-{
-	if (!access_ok(uaddr, sizeof(u32)))
-		return -EFAULT;
-
-	{
-		register unsigned long r8 __asm ("r8") = 0;
-		unsigned long prev;
-		__asm__ __volatile__(
-			"	mf;;					\n"
-			"	mov ar.ccv=%4;;				\n"
-			"[1:]	cmpxchg4.acq %1=[%2],%3,ar.ccv		\n"
-			"	.xdata4 \"__ex_table\", 1b-., 2f-.	\n"
-			"[2:]"
-			: "+r" (r8), "=&r" (prev)
-			: "r" (uaddr), "r" (newval),
-			  "rO" ((long) (unsigned) oldval)
-			: "memory");
-		*uval = prev;
-		return r8;
-	}
-}
-
-#endif /* _ASM_FUTEX_H */
diff --git a/arch/ia64/include/asm/gcc_intrin.h b/arch/ia64/include/asm/gcc_intrin.h
deleted file mode 100644
index 83f230b23867..000000000000
--- a/arch/ia64/include/asm/gcc_intrin.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com>
- * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
- */
-#ifndef _ASM_IA64_GCC_INTRIN_H
-#define _ASM_IA64_GCC_INTRIN_H
-
-#include <uapi/asm/gcc_intrin.h>
-
-register unsigned long ia64_r13 asm ("r13") __used;
-#endif /* _ASM_IA64_GCC_INTRIN_H */
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
deleted file mode 100644
index ccde7c2ba00f..000000000000
--- a/arch/ia64/include/asm/hardirq.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_HARDIRQ_H
-#define _ASM_IA64_HARDIRQ_H
-
-/*
- * Modified 1998-2002, 2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * No irq_cpustat_t for IA-64.  The data is held in the per-CPU data structure.
- */
-
-#define __ARCH_IRQ_STAT	1
-
-#define local_softirq_pending_ref	ia64_cpu_info.softirq_pending
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-#include <asm/processor.h>
-
-extern void __iomem *ipi_base_addr;
-
-void ack_bad_irq(unsigned int irq);
-
-#endif /* _ASM_IA64_HARDIRQ_H */
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
deleted file mode 100644
index 026ead47cd53..000000000000
--- a/arch/ia64/include/asm/hugetlb.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_HUGETLB_H
-#define _ASM_IA64_HUGETLB_H
-
-#include <asm/page.h>
-
-#define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
-			    unsigned long end, unsigned long floor,
-			    unsigned long ceiling);
-
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-int prepare_hugepage_range(struct file *file,
-			unsigned long addr, unsigned long len);
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
-					 unsigned long addr,
-					 unsigned long len)
-{
-	return (REGION_NUMBER(addr) == RGN_HPAGE ||
-		REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
-}
-#define is_hugepage_only_range is_hugepage_only_range
-
-#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
-static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
-					  unsigned long addr, pte_t *ptep)
-{
-	return *ptep;
-}
-
-#include <asm-generic/hugetlb.h>
-
-#endif /* _ASM_IA64_HUGETLB_H */
diff --git a/arch/ia64/include/asm/hw_irq.h b/arch/ia64/include/asm/hw_irq.h
deleted file mode 100644
index 5d267132f8cb..000000000000
--- a/arch/ia64/include/asm/hw_irq.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_HW_IRQ_H
-#define _ASM_IA64_HW_IRQ_H
-
-/*
- * Copyright (C) 2001-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/profile.h>
-
-#include <asm/ptrace.h>
-#include <asm/smp.h>
-
-typedef u8 ia64_vector;
-
-/*
- * 0 special
- *
- * 1,3-14 are reserved from firmware
- *
- * 16-255 (vectored external interrupts) are available
- *
- * 15 spurious interrupt (see IVR)
- *
- * 16 lowest priority, 255 highest priority
- *
- * 15 classes of 16 interrupts each.
- */
-#define IA64_MIN_VECTORED_IRQ		 16
-#define IA64_MAX_VECTORED_IRQ		255
-#define IA64_NUM_VECTORS		256
-
-#define AUTO_ASSIGN			-1
-
-#define IA64_SPURIOUS_INT_VECTOR	0x0f
-
-/*
- * Vectors 0x10-0x1f are used for low priority interrupts, e.g. CMCI.
- */
-#define IA64_CPEP_VECTOR		0x1c	/* corrected platform error polling vector */
-#define IA64_CMCP_VECTOR		0x1d	/* corrected machine-check polling vector */
-#define IA64_CPE_VECTOR			0x1e	/* corrected platform error interrupt vector */
-#define IA64_CMC_VECTOR			0x1f	/* corrected machine-check interrupt vector */
-/*
- * Vectors 0x20-0x2f are reserved for legacy ISA IRQs.
- * Use vectors 0x30-0xe7 as the default device vector range for ia64.
- * Platforms may choose to reduce this range in platform_irq_setup, but the
- * platform range must fall within
- *	[IA64_DEF_FIRST_DEVICE_VECTOR..IA64_DEF_LAST_DEVICE_VECTOR]
- */
-extern int ia64_first_device_vector;
-extern int ia64_last_device_vector;
-
-#ifdef CONFIG_SMP
-/* Reserve the lower priority vector than device vectors for "move IRQ" IPI */
-#define IA64_IRQ_MOVE_VECTOR		0x30	/* "move IRQ" IPI */
-#define IA64_DEF_FIRST_DEVICE_VECTOR	0x31
-#else
-#define IA64_DEF_FIRST_DEVICE_VECTOR	0x30
-#endif
-#define IA64_DEF_LAST_DEVICE_VECTOR	0xe7
-#define IA64_FIRST_DEVICE_VECTOR	ia64_first_device_vector
-#define IA64_LAST_DEVICE_VECTOR		ia64_last_device_vector
-#define IA64_MAX_DEVICE_VECTORS		(IA64_DEF_LAST_DEVICE_VECTOR - IA64_DEF_FIRST_DEVICE_VECTOR + 1)
-#define IA64_NUM_DEVICE_VECTORS		(IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1)
-
-#define IA64_MCA_RENDEZ_VECTOR		0xe8	/* MCA rendez interrupt */
-#define IA64_TIMER_VECTOR		0xef	/* use highest-prio group 15 interrupt for timer */
-#define	IA64_MCA_WAKEUP_VECTOR		0xf0	/* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */
-#define IA64_IPI_LOCAL_TLB_FLUSH	0xfc	/* SMP flush local TLB */
-#define IA64_IPI_RESCHEDULE		0xfd	/* SMP reschedule */
-#define IA64_IPI_VECTOR			0xfe	/* inter-processor interrupt vector */
-
-/* Used for encoding redirected irqs */
-
-#define IA64_IRQ_REDIRECTED		(1 << 31)
-
-/* IA64 inter-cpu interrupt related definitions */
-
-#define IA64_IPI_DEFAULT_BASE_ADDR	0xfee00000
-
-/* Delivery modes for inter-cpu interrupts */
-enum {
-        IA64_IPI_DM_INT =       0x0,    /* pend an external interrupt */
-        IA64_IPI_DM_PMI =       0x2,    /* pend a PMI */
-        IA64_IPI_DM_NMI =       0x4,    /* pend an NMI (vector 2) */
-        IA64_IPI_DM_INIT =      0x5,    /* pend an INIT interrupt */
-        IA64_IPI_DM_EXTINT =    0x7,    /* pend an 8259-compatible interrupt. */
-};
-
-extern __u8 isa_irq_to_vector_map[16];
-#define isa_irq_to_vector(x)	isa_irq_to_vector_map[(x)]
-
-struct irq_cfg {
-	ia64_vector vector;
-	cpumask_t domain;
-	cpumask_t old_domain;
-	unsigned move_cleanup_count;
-	u8 move_in_progress : 1;
-};
-extern spinlock_t vector_lock;
-extern struct irq_cfg irq_cfg[NR_IRQS];
-#define irq_to_domain(x)	irq_cfg[(x)].domain
-DECLARE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq);
-
-extern struct irq_chip irq_type_ia64_lsapic;	/* CPU-internal interrupt controller */
-
-#define ia64_register_ipi	ia64_native_register_ipi
-#define assign_irq_vector	ia64_native_assign_irq_vector
-#define free_irq_vector		ia64_native_free_irq_vector
-#define ia64_resend_irq		ia64_native_resend_irq
-
-extern void ia64_native_register_ipi(void);
-extern int bind_irq_vector(int irq, int vector, cpumask_t domain);
-extern int ia64_native_assign_irq_vector (int irq);	/* allocate a free vector */
-extern void ia64_native_free_irq_vector (int vector);
-extern int reserve_irq_vector (int vector);
-extern void __setup_vector_irq(int cpu);
-extern void ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect);
-extern void destroy_and_reserve_irq (unsigned int irq);
-
-#ifdef CONFIG_SMP
-extern int irq_prepare_move(int irq, int cpu);
-extern void irq_complete_move(unsigned int irq);
-#else
-static inline int irq_prepare_move(int irq, int cpu) { return 0; }
-static inline void irq_complete_move(unsigned int irq) {}
-#endif
-
-static inline void ia64_native_resend_irq(unsigned int vector)
-{
-	ia64_send_ipi(smp_processor_id(), vector, IA64_IPI_DM_INT, 0);
-}
-
-/*
- * Next follows the irq descriptor interface.  On IA-64, each CPU supports 256 interrupt
- * vectors.  On smaller systems, there is a one-to-one correspondence between interrupt
- * vectors and the Linux irq numbers.  However, larger systems may have multiple interrupt
- * domains meaning that the translation from vector number to irq number depends on the
- * interrupt domain that a CPU belongs to.  This API abstracts such platform-dependent
- * differences and provides a uniform means to translate between vector and irq numbers
- * and to obtain the irq descriptor for a given irq number.
- */
-
-/* Extract the IA-64 vector that corresponds to IRQ.  */
-static inline ia64_vector
-irq_to_vector (int irq)
-{
-	return irq_cfg[irq].vector;
-}
-
-/*
- * Convert the local IA-64 vector to the corresponding irq number.  This translation is
- * done in the context of the interrupt domain that the currently executing CPU belongs
- * to.
- */
-static inline unsigned int
-local_vector_to_irq (ia64_vector vec)
-{
-	return __this_cpu_read(vector_irq[vec]);
-}
-
-#endif /* _ASM_IA64_HW_IRQ_H */
diff --git a/arch/ia64/include/asm/idle.h b/arch/ia64/include/asm/idle.h
deleted file mode 100644
index 97c55b97e0ba..000000000000
--- a/arch/ia64/include/asm/idle.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IDLE_H
-#define _ASM_IA64_IDLE_H
-
-static inline void enter_idle(void) { }
-static inline void exit_idle(void) { }
-
-#endif /* _ASM_IA64_IDLE_H */
diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
deleted file mode 100644
index 035b17fe12ef..000000000000
--- a/arch/ia64/include/asm/intrinsics.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Compiler-dependent intrinsics.
- *
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _ASM_IA64_INTRINSICS_H
-#define _ASM_IA64_INTRINSICS_H
-
-#include <uapi/asm/intrinsics.h>
-
-#endif /* _ASM_IA64_INTRINSICS_H */
diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h
deleted file mode 100644
index eedc0afa8cad..000000000000
--- a/arch/ia64/include/asm/io.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IO_H
-#define _ASM_IA64_IO_H
-
-/*
- * This file contains the definitions for the emulated IO instructions
- * inb/inw/inl/outb/outw/outl and the "string versions" of the same
- * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
- * versions of the single-IO instructions (inb_p/inw_p/..).
- *
- * This file is not meant to be obfuscating: it's just complicated to
- * (a) handle it all in a way that makes gcc able to optimize it as
- * well as possible and (b) trying to avoid writing the same thing
- * over and over again with slight variations and possibly making a
- * mistake somewhere.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#include <asm/unaligned.h>
-#include <asm/early_ioremap.h>
-
-#define __IA64_UNCACHED_OFFSET	RGN_BASE(RGN_UNCACHED)
-
-/*
- * The legacy I/O space defined by the ia64 architecture supports only 65536 ports, but
- * large machines may have multiple other I/O spaces so we can't place any a priori limit
- * on IO_SPACE_LIMIT.  These additional spaces are described in ACPI.
- */
-#define IO_SPACE_LIMIT		0xffffffffffffffffUL
-
-#define MAX_IO_SPACES_BITS		8
-#define MAX_IO_SPACES			(1UL << MAX_IO_SPACES_BITS)
-#define IO_SPACE_BITS			24
-#define IO_SPACE_SIZE			(1UL << IO_SPACE_BITS)
-
-#define IO_SPACE_NR(port)		((port) >> IO_SPACE_BITS)
-#define IO_SPACE_BASE(space)		((space) << IO_SPACE_BITS)
-#define IO_SPACE_PORT(port)		((port) & (IO_SPACE_SIZE - 1))
-
-#define IO_SPACE_SPARSE_ENCODING(p)	((((p) >> 2) << 12) | ((p) & 0xfff))
-
-struct io_space {
-	unsigned long mmio_base;	/* base in MMIO space */
-	int sparse;
-};
-
-extern struct io_space io_space[];
-extern unsigned int num_io_spaces;
-
-# ifdef __KERNEL__
-
-/*
- * All MMIO iomem cookies are in region 6; anything less is a PIO cookie:
- *	0xCxxxxxxxxxxxxxxx	MMIO cookie (return from ioremap)
- *	0x000000001SPPPPPP	PIO cookie (S=space number, P..P=port)
- *
- * ioread/writeX() uses the leading 1 in PIO cookies (PIO_OFFSET) to catch
- * code that uses bare port numbers without the prerequisite pci_iomap().
- */
-#define PIO_OFFSET		(1UL << (MAX_IO_SPACES_BITS + IO_SPACE_BITS))
-#define PIO_MASK		(PIO_OFFSET - 1)
-#define PIO_RESERVED		__IA64_UNCACHED_OFFSET
-#define HAVE_ARCH_PIO_SIZE
-
-#include <asm/intrinsics.h>
-#include <asm/page.h>
-#include <asm-generic/iomap.h>
-
-/*
- * Change virtual addresses to physical addresses and vv.
- */
-static inline unsigned long
-virt_to_phys (volatile void *address)
-{
-	return (unsigned long) address - PAGE_OFFSET;
-}
-#define virt_to_phys virt_to_phys
-
-static inline void*
-phys_to_virt (unsigned long address)
-{
-	return (void *) (address + PAGE_OFFSET);
-}
-#define phys_to_virt phys_to_virt
-
-#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
-extern u64 kern_mem_attribute (unsigned long phys_addr, unsigned long size);
-extern int valid_phys_addr_range (phys_addr_t addr, size_t count); /* efi.c */
-extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count);
-
-# endif /* KERNEL */
-
-/*
- * Memory fence w/accept.  This should never be used in code that is
- * not IA-64 specific.
- */
-#define __ia64_mf_a()	ia64_mfa()
-
-static inline void*
-__ia64_mk_io_addr (unsigned long port)
-{
-	struct io_space *space;
-	unsigned long offset;
-
-	space = &io_space[IO_SPACE_NR(port)];
-	port = IO_SPACE_PORT(port);
-	if (space->sparse)
-		offset = IO_SPACE_SPARSE_ENCODING(port);
-	else
-		offset = port;
-
-	return (void *) (space->mmio_base | offset);
-}
-
-/*
- * For the in/out routines, we need to do "mf.a" _after_ doing the I/O access to ensure
- * that the access has completed before executing other I/O accesses.  Since we're doing
- * the accesses through an uncachable (UC) translation, the CPU will execute them in
- * program order.  However, we still need to tell the compiler not to shuffle them around
- * during optimization, which is why we use "volatile" pointers.
- */
-
-#define inb inb
-static inline unsigned int inb(unsigned long port)
-{
-	volatile unsigned char *addr = __ia64_mk_io_addr(port);
-	unsigned char ret;
-
-	ret = *addr;
-	__ia64_mf_a();
-	return ret;
-}
-
-#define inw inw
-static inline unsigned int inw(unsigned long port)
-{
-	volatile unsigned short *addr = __ia64_mk_io_addr(port);
-	unsigned short ret;
-
-	ret = *addr;
-	__ia64_mf_a();
-	return ret;
-}
-
-#define inl inl
-static inline unsigned int inl(unsigned long port)
-{
-	volatile unsigned int *addr = __ia64_mk_io_addr(port);
-	unsigned int ret;
-
-	ret = *addr;
-	__ia64_mf_a();
-	return ret;
-}
-
-#define outb outb
-static inline void outb(unsigned char val, unsigned long port)
-{
-	volatile unsigned char *addr = __ia64_mk_io_addr(port);
-
-	*addr = val;
-	__ia64_mf_a();
-}
-
-#define outw outw
-static inline void outw(unsigned short val, unsigned long port)
-{
-	volatile unsigned short *addr = __ia64_mk_io_addr(port);
-
-	*addr = val;
-	__ia64_mf_a();
-}
-
-#define outl outl
-static inline void outl(unsigned int val, unsigned long port)
-{
-	volatile unsigned int *addr = __ia64_mk_io_addr(port);
-
-	*addr = val;
-	__ia64_mf_a();
-}
-
-#define insb insb
-static inline void insb(unsigned long port, void *dst, unsigned long count)
-{
-	unsigned char *dp = dst;
-
-	while (count--)
-		*dp++ = inb(port);
-}
-
-#define insw insw
-static inline void insw(unsigned long port, void *dst, unsigned long count)
-{
-	unsigned short *dp = dst;
-
-	while (count--)
-		put_unaligned(inw(port), dp++);
-}
-
-#define insl insl
-static inline void insl(unsigned long port, void *dst, unsigned long count)
-{
-	unsigned int *dp = dst;
-
-	while (count--)
-		put_unaligned(inl(port), dp++);
-}
-
-#define outsb outsb
-static inline void outsb(unsigned long port, const void *src,
-		unsigned long count)
-{
-	const unsigned char *sp = src;
-
-	while (count--)
-		outb(*sp++, port);
-}
-
-#define outsw outsw
-static inline void outsw(unsigned long port, const void *src,
-		unsigned long count)
-{
-	const unsigned short *sp = src;
-
-	while (count--)
-		outw(get_unaligned(sp++), port);
-}
-
-#define outsl outsl
-static inline void outsl(unsigned long port, const void *src,
-		unsigned long count)
-{
-	const unsigned int *sp = src;
-
-	while (count--)
-		outl(get_unaligned(sp++), port);
-}
-
-# ifdef __KERNEL__
-
-#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL)
-
-extern void __iomem * ioremap_uc(unsigned long offset, unsigned long size);
-
-#define ioremap_prot ioremap_prot
-#define ioremap_cache ioremap
-#define ioremap_uc ioremap_uc
-#define iounmap iounmap
-
-/*
- * String version of IO memory access ops:
- */
-extern void memcpy_fromio(void *dst, const volatile void __iomem *src, long n);
-extern void memcpy_toio(volatile void __iomem *dst, const void *src, long n);
-extern void memset_io(volatile void __iomem *s, int c, long n);
-
-#define memcpy_fromio memcpy_fromio
-#define memcpy_toio memcpy_toio
-#define memset_io memset_io
-#define xlate_dev_mem_ptr xlate_dev_mem_ptr
-#include <asm-generic/io.h>
-#undef PCI_IOBASE
-
-# endif /* __KERNEL__ */
-
-#endif /* _ASM_IA64_IO_H */
diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h
deleted file mode 100644
index eb0db20c9d4c..000000000000
--- a/arch/ia64/include/asm/iommu.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IOMMU_H
-#define _ASM_IA64_IOMMU_H 1
-
-#include <linux/acpi.h>
-
-/* 10 seconds */
-#define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10)
-
-extern void no_iommu_init(void);
-#ifdef	CONFIG_INTEL_IOMMU
-extern int force_iommu, no_iommu;
-extern int iommu_detected;
-
-static inline int __init
-arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { return 0; }
-#else
-#define no_iommu		(1)
-#define iommu_detected		(0)
-#endif
-
-#endif
diff --git a/arch/ia64/include/asm/iosapic.h b/arch/ia64/include/asm/iosapic.h
deleted file mode 100644
index a91aeb413e17..000000000000
--- a/arch/ia64/include/asm/iosapic.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_IA64_IOSAPIC_H
-#define __ASM_IA64_IOSAPIC_H
-
-#define	IOSAPIC_REG_SELECT	0x0
-#define	IOSAPIC_WINDOW		0x10
-#define	IOSAPIC_EOI		0x40
-
-#define	IOSAPIC_VERSION		0x1
-
-/*
- * Redirection table entry
- */
-#define	IOSAPIC_RTE_LOW(i)	(0x10+i*2)
-#define	IOSAPIC_RTE_HIGH(i)	(0x11+i*2)
-
-#define	IOSAPIC_DEST_SHIFT		16
-
-/*
- * Delivery mode
- */
-#define	IOSAPIC_DELIVERY_SHIFT		8
-#define	IOSAPIC_FIXED			0x0
-#define	IOSAPIC_LOWEST_PRIORITY	0x1
-#define	IOSAPIC_PMI			0x2
-#define	IOSAPIC_NMI			0x4
-#define	IOSAPIC_INIT			0x5
-#define	IOSAPIC_EXTINT			0x7
-
-/*
- * Interrupt polarity
- */
-#define	IOSAPIC_POLARITY_SHIFT		13
-#define	IOSAPIC_POL_HIGH		0
-#define	IOSAPIC_POL_LOW		1
-
-/*
- * Trigger mode
- */
-#define	IOSAPIC_TRIGGER_SHIFT		15
-#define	IOSAPIC_EDGE			0
-#define	IOSAPIC_LEVEL			1
-
-/*
- * Mask bit
- */
-
-#define	IOSAPIC_MASK_SHIFT		16
-#define	IOSAPIC_MASK			(1<<IOSAPIC_MASK_SHIFT)
-
-#define IOSAPIC_VECTOR_MASK		0xffffff00
-
-#ifndef __ASSEMBLY__
-
-#define NR_IOSAPICS			256
-
-#define iosapic_pcat_compat_init	ia64_native_iosapic_pcat_compat_init
-#define __iosapic_read			__ia64_native_iosapic_read
-#define __iosapic_write			__ia64_native_iosapic_write
-#define iosapic_get_irq_chip		ia64_native_iosapic_get_irq_chip
-
-extern void __init ia64_native_iosapic_pcat_compat_init(void);
-extern struct irq_chip *ia64_native_iosapic_get_irq_chip(unsigned long trigger);
-
-static inline unsigned int
-__ia64_native_iosapic_read(char __iomem *iosapic, unsigned int reg)
-{
-	writel(reg, iosapic + IOSAPIC_REG_SELECT);
-	return readl(iosapic + IOSAPIC_WINDOW);
-}
-
-static inline void
-__ia64_native_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
-{
-	writel(reg, iosapic + IOSAPIC_REG_SELECT);
-	writel(val, iosapic + IOSAPIC_WINDOW);
-}
-
-static inline void iosapic_eoi(char __iomem *iosapic, u32 vector)
-{
-	writel(vector, iosapic + IOSAPIC_EOI);
-}
-
-extern void __init iosapic_system_init (int pcat_compat);
-extern int iosapic_init (unsigned long address, unsigned int gsi_base);
-extern int iosapic_remove (unsigned int gsi_base);
-extern int gsi_to_irq (unsigned int gsi);
-extern int iosapic_register_intr (unsigned int gsi, unsigned long polarity,
-				  unsigned long trigger);
-extern void iosapic_unregister_intr (unsigned int irq);
-extern void iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi,
-				      unsigned long polarity,
-				      unsigned long trigger);
-extern int __init iosapic_register_platform_intr (u32 int_type,
-					   unsigned int gsi,
-					   int pmi_vector,
-					   u16 eid, u16 id,
-					   unsigned long polarity,
-					   unsigned long trigger);
-
-#ifdef CONFIG_NUMA
-extern void map_iosapic_to_node (unsigned int, int);
-#endif
-
-# endif /* !__ASSEMBLY__ */
-#endif /* __ASM_IA64_IOSAPIC_H */
diff --git a/arch/ia64/include/asm/irq.h b/arch/ia64/include/asm/irq.h
deleted file mode 100644
index 0eccf33dfe8b..000000000000
--- a/arch/ia64/include/asm/irq.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_IRQ_H
-#define _ASM_IA64_IRQ_H
-
-/*
- * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 11/24/98	S.Eranian 	updated TIMER_IRQ and irq_canonicalize
- * 01/20/99	S.Eranian	added keyboard interrupt
- * 02/29/00     D.Mosberger	moved most things into hw_irq.h
- */
-
-#include <linux/types.h>
-#include <linux/cpumask.h>
-#include <asm/native/irq.h>
-
-#define NR_IRQS		IA64_NATIVE_NR_IRQS
-
-static __inline__ int
-irq_canonicalize (int irq)
-{
-	/*
-	 * We do the legacy thing here of pretending that irqs < 16
-	 * are 8259 irqs.  This really shouldn't be necessary at all,
-	 * but we keep it here as serial.c still uses it...
-	 */
-	return ((irq == 2) ? 9 : irq);
-}
-
-extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
-
-int create_irq(void);
-void destroy_irq(unsigned int irq);
-
-#endif /* _ASM_IA64_IRQ_H */
diff --git a/arch/ia64/include/asm/irq_regs.h b/arch/ia64/include/asm/irq_regs.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/ia64/include/asm/irq_regs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h
deleted file mode 100644
index 547a6e87018c..000000000000
--- a/arch/ia64/include/asm/irq_remapping.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __IA64_INTR_REMAPPING_H
-#define __IA64_INTR_REMAPPING_H
-#define irq_remapping_enabled 0
-#endif
diff --git a/arch/ia64/include/asm/irqflags.h b/arch/ia64/include/asm/irqflags.h
deleted file mode 100644
index 1dc30f12e545..000000000000
--- a/arch/ia64/include/asm/irqflags.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * IRQ flags defines.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#ifndef _ASM_IA64_IRQFLAGS_H
-#define _ASM_IA64_IRQFLAGS_H
-
-#include <asm/pal.h>
-#include <asm/kregs.h>
-
-#ifdef CONFIG_IA64_DEBUG_IRQ
-extern unsigned long last_cli_ip;
-static inline void arch_maybe_save_ip(unsigned long flags)
-{
-	if (flags & IA64_PSR_I)
-		last_cli_ip = ia64_getreg(_IA64_REG_IP);
-}
-#else
-#define arch_maybe_save_ip(flags) do {} while (0)
-#endif
-
-/*
- * - clearing psr.i is implicitly serialized (visible by next insn)
- * - setting psr.i requires data serialization
- * - we need a stop-bit before reading PSR because we sometimes
- *   write a floating-point register right before reading the PSR
- *   and that writes to PSR.mfl
- */
-
-static inline unsigned long arch_local_save_flags(void)
-{
-	ia64_stop();
-	return ia64_getreg(_IA64_REG_PSR);
-}
-
-static inline unsigned long arch_local_irq_save(void)
-{
-	unsigned long flags = arch_local_save_flags();
-
-	ia64_stop();
-	ia64_rsm(IA64_PSR_I);
-	arch_maybe_save_ip(flags);
-	return flags;
-}
-
-static inline void arch_local_irq_disable(void)
-{
-#ifdef CONFIG_IA64_DEBUG_IRQ
-	arch_local_irq_save();
-#else
-	ia64_stop();
-	ia64_rsm(IA64_PSR_I);
-#endif
-}
-
-static inline void arch_local_irq_enable(void)
-{
-	ia64_stop();
-	ia64_ssm(IA64_PSR_I);
-	ia64_srlz_d();
-}
-
-static inline void arch_local_irq_restore(unsigned long flags)
-{
-#ifdef CONFIG_IA64_DEBUG_IRQ
-	unsigned long old_psr = arch_local_save_flags();
-#endif
-	ia64_intrin_local_irq_restore(flags & IA64_PSR_I);
-	arch_maybe_save_ip(old_psr & ~flags);
-}
-
-static inline bool arch_irqs_disabled_flags(unsigned long flags)
-{
-	return (flags & IA64_PSR_I) == 0;
-}
-
-static inline bool arch_irqs_disabled(void)
-{
-	return arch_irqs_disabled_flags(arch_local_save_flags());
-}
-
-static inline void arch_safe_halt(void)
-{
-	arch_local_irq_enable();
-	ia64_pal_halt_light();	/* PAL_HALT_LIGHT */
-}
-
-
-#endif /* _ASM_IA64_IRQFLAGS_H */
diff --git a/arch/ia64/include/asm/kdebug.h b/arch/ia64/include/asm/kdebug.h
deleted file mode 100644
index 4f7e6dc974bc..000000000000
--- a/arch/ia64/include/asm/kdebug.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _IA64_KDEBUG_H
-#define _IA64_KDEBUG_H 1
-/*
- *
- * Copyright (C) Intel Corporation, 2005
- *
- * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
- *              <anil.s.keshavamurthy@intel.com> adopted from
- *              include/asm-x86_64/kdebug.h
- *
- * 2005-Oct	Keith Owens <kaos@sgi.com>.  Expand notify_die to cover more
- *		events.
- */
-
-enum die_val {
-	DIE_BREAK = 1,
-	DIE_FAULT,
-	DIE_OOPS,
-	DIE_MACHINE_HALT,
-	DIE_MACHINE_RESTART,
-	DIE_MCA_MONARCH_ENTER,
-	DIE_MCA_MONARCH_PROCESS,
-	DIE_MCA_MONARCH_LEAVE,
-	DIE_MCA_SLAVE_ENTER,
-	DIE_MCA_SLAVE_PROCESS,
-	DIE_MCA_SLAVE_LEAVE,
-	DIE_MCA_RENDZVOUS_ENTER,
-	DIE_MCA_RENDZVOUS_PROCESS,
-	DIE_MCA_RENDZVOUS_LEAVE,
-	DIE_MCA_NEW_TIMEOUT,
-	DIE_INIT_ENTER,
-	DIE_INIT_MONARCH_ENTER,
-	DIE_INIT_MONARCH_PROCESS,
-	DIE_INIT_MONARCH_LEAVE,
-	DIE_INIT_SLAVE_ENTER,
-	DIE_INIT_SLAVE_PROCESS,
-	DIE_INIT_SLAVE_LEAVE,
-	DIE_KDEBUG_ENTER,
-	DIE_KDEBUG_LEAVE,
-	DIE_KDUMP_ENTER,
-	DIE_KDUMP_LEAVE,
-};
-
-#endif
diff --git a/arch/ia64/include/asm/kexec.h b/arch/ia64/include/asm/kexec.h
deleted file mode 100644
index 294b1e1ebd2d..000000000000
--- a/arch/ia64/include/asm/kexec.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_KEXEC_H
-#define _ASM_IA64_KEXEC_H
-
-#include <asm/setup.h>
-
-/* Maximum physical address we can use pages from */
-#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
-/* Maximum address we can reach in physical address mode */
-#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
-/* Maximum address we can use for the control code buffer */
-#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
-
-#define KEXEC_CONTROL_PAGE_SIZE (8192 + 8192 + 4096)
-
-/* The native architecture */
-#define KEXEC_ARCH KEXEC_ARCH_IA_64
-
-#define kexec_flush_icache_page(page) do { \
-                unsigned long page_addr = (unsigned long)page_address(page); \
-                flush_icache_range(page_addr, page_addr + PAGE_SIZE); \
-        } while(0)
-
-extern struct kimage *ia64_kimage;
-extern const unsigned int relocate_new_kernel_size;
-extern void relocate_new_kernel(unsigned long, unsigned long,
-		struct ia64_boot_param *, unsigned long);
-static inline void
-crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
-{
-}
-extern struct resource efi_memmap_res;
-extern struct resource boot_param_res;
-extern void kdump_smp_send_stop(void);
-extern void kdump_smp_send_init(void);
-extern void kexec_disable_iosapic(void);
-extern void crash_save_this_cpu(void);
-struct rsvd_region;
-extern unsigned long kdump_find_rsvd_region(unsigned long size,
-		struct rsvd_region *rsvd_regions, int n);
-extern void kdump_cpu_freeze(struct unw_frame_info *info, void *arg);
-extern int kdump_status[];
-extern atomic_t kdump_cpu_freezed;
-extern atomic_t kdump_in_progress;
-
-#endif /* _ASM_IA64_KEXEC_H */
diff --git a/arch/ia64/include/asm/kprobes.h b/arch/ia64/include/asm/kprobes.h
deleted file mode 100644
index 9e956768946c..000000000000
--- a/arch/ia64/include/asm/kprobes.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_KPROBES_H
-#define _ASM_KPROBES_H
-/*
- *  Kernel Probes (KProbes)
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- * Copyright (C) Intel Corporation, 2005
- *
- * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
- *              <anil.s.keshavamurthy@intel.com> adapted from i386
- */
-#include <asm-generic/kprobes.h>
-#include <asm/break.h>
-
-#define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
-
-#ifdef CONFIG_KPROBES
-
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/percpu.h>
-
-#define __ARCH_WANT_KPROBES_INSN_SLOT
-#define MAX_INSN_SIZE   2	/* last half is for kprobe-booster */
-#define NOP_M_INST	(long)(1<<27)
-#define BRL_INST(i1, i2) ((long)((0xcL << 37) |	/* brl */ \
-				(0x1L << 12) |	/* many */ \
-				(((i1) & 1) << 36) | ((i2) << 13))) /* imm */
-
-typedef union cmp_inst {
-	struct {
-	unsigned long long qp : 6;
-	unsigned long long p1 : 6;
-	unsigned long long c  : 1;
-	unsigned long long r2 : 7;
-	unsigned long long r3 : 7;
-	unsigned long long p2 : 6;
-	unsigned long long ta : 1;
-	unsigned long long x2 : 2;
-	unsigned long long tb : 1;
-	unsigned long long opcode : 4;
-	unsigned long long reserved : 23;
-	}f;
-	unsigned long long l;
-} cmp_inst_t;
-
-struct kprobe;
-
-typedef struct _bundle {
-	struct {
-		unsigned long long template : 5;
-		unsigned long long slot0 : 41;
-		unsigned long long slot1_p0 : 64-46;
-	} quad0;
-	struct {
-		unsigned long long slot1_p1 : 41 - (64-46);
-		unsigned long long slot2 : 41;
-	} quad1;
-} __attribute__((__aligned__(16)))  bundle_t;
-
-struct prev_kprobe {
-	struct kprobe *kp;
-	unsigned long status;
-};
-
-#define	MAX_PARAM_RSE_SIZE	(0x60+0x60/0x3f)
-/* per-cpu kprobe control block */
-#define ARCH_PREV_KPROBE_SZ 2
-struct kprobe_ctlblk {
-	unsigned long kprobe_status;
-	unsigned long *bsp;
-	unsigned long cfm;
-	atomic_t prev_kprobe_index;
-	struct prev_kprobe prev_kprobe[ARCH_PREV_KPROBE_SZ];
-};
-
-#define kretprobe_blacklist_size 0
-
-#define SLOT0_OPCODE_SHIFT	(37)
-#define SLOT1_p1_OPCODE_SHIFT	(37 - (64-46))
-#define SLOT2_OPCODE_SHIFT 	(37)
-
-#define INDIRECT_CALL_OPCODE		(1)
-#define IP_RELATIVE_CALL_OPCODE		(5)
-#define IP_RELATIVE_BRANCH_OPCODE	(4)
-#define IP_RELATIVE_PREDICT_OPCODE	(7)
-#define LONG_BRANCH_OPCODE		(0xC)
-#define LONG_CALL_OPCODE		(0xD)
-#define flush_insn_slot(p)		do { } while (0)
-
-typedef struct kprobe_opcode {
-	bundle_t bundle;
-} kprobe_opcode_t;
-
-/* Architecture specific copy of original instruction*/
-struct arch_specific_insn {
-	/* copy of the instruction to be emulated */
-	kprobe_opcode_t *insn;
- #define INST_FLAG_FIX_RELATIVE_IP_ADDR		1
- #define INST_FLAG_FIX_BRANCH_REG		2
- #define INST_FLAG_BREAK_INST			4
- #define INST_FLAG_BOOSTABLE			8
- 	unsigned long inst_flag;
- 	unsigned short target_br_reg;
-	unsigned short slot;
-};
-
-extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
-extern int kprobe_exceptions_notify(struct notifier_block *self,
-				    unsigned long val, void *data);
-
-extern void arch_remove_kprobe(struct kprobe *p);
-
-#endif /* CONFIG_KPROBES */
-#endif /* _ASM_KPROBES_H */
diff --git a/arch/ia64/include/asm/kregs.h b/arch/ia64/include/asm/kregs.h
deleted file mode 100644
index 44113b75e4eb..000000000000
--- a/arch/ia64/include/asm/kregs.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_KREGS_H
-#define _ASM_IA64_KREGS_H
-
-/*
- * Copyright (C) 2001-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * This file defines the kernel register usage convention used by Linux/ia64.
- */
-
-/*
- * Kernel registers:
- */
-#define IA64_KR_IO_BASE		0	/* ar.k0: legacy I/O base address */
-#define IA64_KR_TSSD		1	/* ar.k1: IVE uses this as the TSSD */
-#define IA64_KR_PER_CPU_DATA	3	/* ar.k3: physical per-CPU base */
-#define IA64_KR_CURRENT_STACK	4	/* ar.k4: what's mapped in IA64_TR_CURRENT_STACK */
-#define IA64_KR_FPU_OWNER	5	/* ar.k5: fpu-owner (UP only, at the moment) */
-#define IA64_KR_CURRENT		6	/* ar.k6: "current" task pointer */
-#define IA64_KR_PT_BASE		7	/* ar.k7: page table base address (physical) */
-
-#define _IA64_KR_PASTE(x,y)	x##y
-#define _IA64_KR_PREFIX(n)	_IA64_KR_PASTE(ar.k, n)
-#define IA64_KR(n)		_IA64_KR_PREFIX(IA64_KR_##n)
-
-/*
- * Translation registers:
- */
-#define IA64_TR_KERNEL		0	/* itr0, dtr0: maps kernel image (code & data) */
-#define IA64_TR_PALCODE		1	/* itr1: maps PALcode as required by EFI */
-#define IA64_TR_CURRENT_STACK	1	/* dtr1: maps kernel's memory- & register-stacks */
-
-#define IA64_TR_ALLOC_BASE	2 	/* itr&dtr: Base of dynamic TR resource*/
-#define IA64_TR_ALLOC_MAX	64 	/* Max number for dynamic use*/
-
-/* Processor status register bits: */
-#define IA64_PSR_BE_BIT		1
-#define IA64_PSR_UP_BIT		2
-#define IA64_PSR_AC_BIT		3
-#define IA64_PSR_MFL_BIT	4
-#define IA64_PSR_MFH_BIT	5
-#define IA64_PSR_IC_BIT		13
-#define IA64_PSR_I_BIT		14
-#define IA64_PSR_PK_BIT		15
-#define IA64_PSR_DT_BIT		17
-#define IA64_PSR_DFL_BIT	18
-#define IA64_PSR_DFH_BIT	19
-#define IA64_PSR_SP_BIT		20
-#define IA64_PSR_PP_BIT		21
-#define IA64_PSR_DI_BIT		22
-#define IA64_PSR_SI_BIT		23
-#define IA64_PSR_DB_BIT		24
-#define IA64_PSR_LP_BIT		25
-#define IA64_PSR_TB_BIT		26
-#define IA64_PSR_RT_BIT		27
-/* The following are not affected by save_flags()/restore_flags(): */
-#define IA64_PSR_CPL0_BIT	32
-#define IA64_PSR_CPL1_BIT	33
-#define IA64_PSR_IS_BIT		34
-#define IA64_PSR_MC_BIT		35
-#define IA64_PSR_IT_BIT		36
-#define IA64_PSR_ID_BIT		37
-#define IA64_PSR_DA_BIT		38
-#define IA64_PSR_DD_BIT		39
-#define IA64_PSR_SS_BIT		40
-#define IA64_PSR_RI_BIT		41
-#define IA64_PSR_ED_BIT		43
-#define IA64_PSR_BN_BIT		44
-#define IA64_PSR_IA_BIT		45
-
-/* A mask of PSR bits that we generally don't want to inherit across a clone2() or an
-   execve().  Only list flags here that need to be cleared/set for BOTH clone2() and
-   execve().  */
-#define IA64_PSR_BITS_TO_CLEAR	(IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_DB | IA64_PSR_LP | \
-				 IA64_PSR_TB  | IA64_PSR_ID  | IA64_PSR_DA | IA64_PSR_DD | \
-				 IA64_PSR_SS  | IA64_PSR_ED  | IA64_PSR_IA)
-#define IA64_PSR_BITS_TO_SET	(IA64_PSR_DFH | IA64_PSR_SP)
-
-#define IA64_PSR_BE	(__IA64_UL(1) << IA64_PSR_BE_BIT)
-#define IA64_PSR_UP	(__IA64_UL(1) << IA64_PSR_UP_BIT)
-#define IA64_PSR_AC	(__IA64_UL(1) << IA64_PSR_AC_BIT)
-#define IA64_PSR_MFL	(__IA64_UL(1) << IA64_PSR_MFL_BIT)
-#define IA64_PSR_MFH	(__IA64_UL(1) << IA64_PSR_MFH_BIT)
-#define IA64_PSR_IC	(__IA64_UL(1) << IA64_PSR_IC_BIT)
-#define IA64_PSR_I	(__IA64_UL(1) << IA64_PSR_I_BIT)
-#define IA64_PSR_PK	(__IA64_UL(1) << IA64_PSR_PK_BIT)
-#define IA64_PSR_DT	(__IA64_UL(1) << IA64_PSR_DT_BIT)
-#define IA64_PSR_DFL	(__IA64_UL(1) << IA64_PSR_DFL_BIT)
-#define IA64_PSR_DFH	(__IA64_UL(1) << IA64_PSR_DFH_BIT)
-#define IA64_PSR_SP	(__IA64_UL(1) << IA64_PSR_SP_BIT)
-#define IA64_PSR_PP	(__IA64_UL(1) << IA64_PSR_PP_BIT)
-#define IA64_PSR_DI	(__IA64_UL(1) << IA64_PSR_DI_BIT)
-#define IA64_PSR_SI	(__IA64_UL(1) << IA64_PSR_SI_BIT)
-#define IA64_PSR_DB	(__IA64_UL(1) << IA64_PSR_DB_BIT)
-#define IA64_PSR_LP	(__IA64_UL(1) << IA64_PSR_LP_BIT)
-#define IA64_PSR_TB	(__IA64_UL(1) << IA64_PSR_TB_BIT)
-#define IA64_PSR_RT	(__IA64_UL(1) << IA64_PSR_RT_BIT)
-/* The following are not affected by save_flags()/restore_flags(): */
-#define IA64_PSR_CPL	(__IA64_UL(3) << IA64_PSR_CPL0_BIT)
-#define IA64_PSR_IS	(__IA64_UL(1) << IA64_PSR_IS_BIT)
-#define IA64_PSR_MC	(__IA64_UL(1) << IA64_PSR_MC_BIT)
-#define IA64_PSR_IT	(__IA64_UL(1) << IA64_PSR_IT_BIT)
-#define IA64_PSR_ID	(__IA64_UL(1) << IA64_PSR_ID_BIT)
-#define IA64_PSR_DA	(__IA64_UL(1) << IA64_PSR_DA_BIT)
-#define IA64_PSR_DD	(__IA64_UL(1) << IA64_PSR_DD_BIT)
-#define IA64_PSR_SS	(__IA64_UL(1) << IA64_PSR_SS_BIT)
-#define IA64_PSR_RI	(__IA64_UL(3) << IA64_PSR_RI_BIT)
-#define IA64_PSR_ED	(__IA64_UL(1) << IA64_PSR_ED_BIT)
-#define IA64_PSR_BN	(__IA64_UL(1) << IA64_PSR_BN_BIT)
-#define IA64_PSR_IA	(__IA64_UL(1) << IA64_PSR_IA_BIT)
-
-/* User mask bits: */
-#define IA64_PSR_UM	(IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH)
-
-/* Default Control Register */
-#define IA64_DCR_PP_BIT		 0	/* privileged performance monitor default */
-#define IA64_DCR_BE_BIT		 1	/* big-endian default */
-#define IA64_DCR_LC_BIT		 2	/* ia32 lock-check enable */
-#define IA64_DCR_DM_BIT		 8	/* defer TLB miss faults */
-#define IA64_DCR_DP_BIT		 9	/* defer page-not-present faults */
-#define IA64_DCR_DK_BIT		10	/* defer key miss faults */
-#define IA64_DCR_DX_BIT		11	/* defer key permission faults */
-#define IA64_DCR_DR_BIT		12	/* defer access right faults */
-#define IA64_DCR_DA_BIT		13	/* defer access bit faults */
-#define IA64_DCR_DD_BIT		14	/* defer debug faults */
-
-#define IA64_DCR_PP	(__IA64_UL(1) << IA64_DCR_PP_BIT)
-#define IA64_DCR_BE	(__IA64_UL(1) << IA64_DCR_BE_BIT)
-#define IA64_DCR_LC	(__IA64_UL(1) << IA64_DCR_LC_BIT)
-#define IA64_DCR_DM	(__IA64_UL(1) << IA64_DCR_DM_BIT)
-#define IA64_DCR_DP	(__IA64_UL(1) << IA64_DCR_DP_BIT)
-#define IA64_DCR_DK	(__IA64_UL(1) << IA64_DCR_DK_BIT)
-#define IA64_DCR_DX	(__IA64_UL(1) << IA64_DCR_DX_BIT)
-#define IA64_DCR_DR	(__IA64_UL(1) << IA64_DCR_DR_BIT)
-#define IA64_DCR_DA	(__IA64_UL(1) << IA64_DCR_DA_BIT)
-#define IA64_DCR_DD	(__IA64_UL(1) << IA64_DCR_DD_BIT)
-
-/* Interrupt Status Register */
-#define IA64_ISR_X_BIT		32	/* execute access */
-#define IA64_ISR_W_BIT		33	/* write access */
-#define IA64_ISR_R_BIT		34	/* read access */
-#define IA64_ISR_NA_BIT		35	/* non-access */
-#define IA64_ISR_SP_BIT		36	/* speculative load exception */
-#define IA64_ISR_RS_BIT		37	/* mandatory register-stack exception */
-#define IA64_ISR_IR_BIT		38	/* invalid register frame exception */
-#define IA64_ISR_CODE_MASK	0xf
-
-#define IA64_ISR_X	(__IA64_UL(1) << IA64_ISR_X_BIT)
-#define IA64_ISR_W	(__IA64_UL(1) << IA64_ISR_W_BIT)
-#define IA64_ISR_R	(__IA64_UL(1) << IA64_ISR_R_BIT)
-#define IA64_ISR_NA	(__IA64_UL(1) << IA64_ISR_NA_BIT)
-#define IA64_ISR_SP	(__IA64_UL(1) << IA64_ISR_SP_BIT)
-#define IA64_ISR_RS	(__IA64_UL(1) << IA64_ISR_RS_BIT)
-#define IA64_ISR_IR	(__IA64_UL(1) << IA64_ISR_IR_BIT)
-
-/* ISR code field for non-access instructions */
-#define IA64_ISR_CODE_TPA	0
-#define IA64_ISR_CODE_FC	1
-#define IA64_ISR_CODE_PROBE	2
-#define IA64_ISR_CODE_TAK	3
-#define IA64_ISR_CODE_LFETCH	4
-#define IA64_ISR_CODE_PROBEF	5
-
-#endif /* _ASM_IA64_kREGS_H */
diff --git a/arch/ia64/include/asm/libata-portmap.h b/arch/ia64/include/asm/libata-portmap.h
deleted file mode 100644
index 757f84e5dc6e..000000000000
--- a/arch/ia64/include/asm/libata-portmap.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_IA64_LIBATA_PORTMAP_H
-#define __ASM_IA64_LIBATA_PORTMAP_H
-
-#define ATA_PRIMARY_IRQ(dev)	isa_irq_to_vector(14)
-
-#define ATA_SECONDARY_IRQ(dev)	isa_irq_to_vector(15)
-
-#endif
diff --git a/arch/ia64/include/asm/linkage.h b/arch/ia64/include/asm/linkage.h
deleted file mode 100644
index 5178af560925..000000000000
--- a/arch/ia64/include/asm/linkage.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_LINKAGE_H
-#define __ASM_LINKAGE_H
-
-#ifndef __ASSEMBLY__
-
-#define asmlinkage CPP_ASMLINKAGE __attribute__((syscall_linkage))
-
-#else
-
-#include <asm/asmmacro.h>
-
-#endif
-
-#define cond_syscall(x) asm(".weak\t" #x "#\n" #x "#\t=\tsys_ni_syscall#")
-#define SYSCALL_ALIAS(alias, name)					\
-	asm ( #alias "# = " #name "#\n\t.globl " #alias "#")
-
-#endif
diff --git a/arch/ia64/include/asm/local.h b/arch/ia64/include/asm/local.h
deleted file mode 100644
index c11c530f74d0..000000000000
--- a/arch/ia64/include/asm/local.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local.h>
diff --git a/arch/ia64/include/asm/mca.h b/arch/ia64/include/asm/mca.h
deleted file mode 100644
index 05805249296c..000000000000
--- a/arch/ia64/include/asm/mca.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca.h
- * Purpose:	Machine check handling specific defines
- *
- * Copyright (C) 1999, 2004 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
- * Copyright (C) Srinivasa Thirumalachar <sprasad@engr.sgi.com>
- * Copyright (C) Russ Anderson <rja@sgi.com>
- */
-
-#ifndef _ASM_IA64_MCA_H
-#define _ASM_IA64_MCA_H
-
-#if !defined(__ASSEMBLY__)
-
-#include <linux/percpu.h>
-#include <linux/threads.h>
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#define IA64_MCA_RENDEZ_TIMEOUT		(20 * 1000)	/* value in milliseconds - 20 seconds */
-
-typedef struct ia64_fptr {
-	unsigned long fp;
-	unsigned long gp;
-} ia64_fptr_t;
-
-typedef union cmcv_reg_u {
-	u64	cmcv_regval;
-	struct	{
-		u64	cmcr_vector		: 8;
-		u64	cmcr_reserved1		: 4;
-		u64	cmcr_ignored1		: 1;
-		u64	cmcr_reserved2		: 3;
-		u64	cmcr_mask		: 1;
-		u64	cmcr_ignored2		: 47;
-	} cmcv_reg_s;
-
-} cmcv_reg_t;
-
-#define cmcv_mask		cmcv_reg_s.cmcr_mask
-#define cmcv_vector		cmcv_reg_s.cmcr_vector
-
-enum {
-	IA64_MCA_RENDEZ_CHECKIN_NOTDONE	=	0x0,
-	IA64_MCA_RENDEZ_CHECKIN_DONE	=	0x1,
-	IA64_MCA_RENDEZ_CHECKIN_INIT	=	0x2,
-	IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA	=	0x3,
-};
-
-/* Information maintained by the MC infrastructure */
-typedef struct ia64_mc_info_s {
-	u64		imi_mca_handler;
-	size_t		imi_mca_handler_size;
-	u64		imi_monarch_init_handler;
-	size_t		imi_monarch_init_handler_size;
-	u64		imi_slave_init_handler;
-	size_t		imi_slave_init_handler_size;
-	u8		imi_rendez_checkin[NR_CPUS];
-
-} ia64_mc_info_t;
-
-/* Handover state from SAL to OS and vice versa, for both MCA and INIT events.
- * Besides the handover state, it also contains some saved registers from the
- * time of the event.
- * Note: mca_asm.S depends on the precise layout of this structure.
- */
-
-struct ia64_sal_os_state {
-
-	/* SAL to OS */
-	unsigned long		os_gp;			/* GP of the os registered with the SAL, physical */
-	unsigned long		pal_proc;		/* PAL_PROC entry point, physical */
-	unsigned long		sal_proc;		/* SAL_PROC entry point, physical */
-	unsigned long		rv_rc;			/* MCA - Rendezvous state, INIT - reason code */
-	unsigned long		proc_state_param;	/* from R18 */
-	unsigned long		monarch;		/* 1 for a monarch event, 0 for a slave */
-
-	/* common */
-	unsigned long		sal_ra;			/* Return address in SAL, physical */
-	unsigned long		sal_gp;			/* GP of the SAL - physical */
-	struct pal_min_state_area *pal_min_state;	/* from R17.  physical in asm, virtual in C */
-	/* Previous values of IA64_KR(CURRENT) and IA64_KR(CURRENT_STACK).
-	 * Note: if the MCA/INIT recovery code wants to resume to a new context
-	 * then it must change these values to reflect the new kernel stack.
-	 */
-	unsigned long		prev_IA64_KR_CURRENT;	/* previous value of IA64_KR(CURRENT) */
-	unsigned long		prev_IA64_KR_CURRENT_STACK;
-	struct task_struct	*prev_task;		/* previous task, NULL if it is not useful */
-	/* Some interrupt registers are not saved in minstate, pt_regs or
-	 * switch_stack.  Because MCA/INIT can occur when interrupts are
-	 * disabled, we need to save the additional interrupt registers over
-	 * MCA/INIT and resume.
-	 */
-	unsigned long		isr;
-	unsigned long		ifa;
-	unsigned long		itir;
-	unsigned long		iipa;
-	unsigned long		iim;
-	unsigned long		iha;
-
-	/* OS to SAL */
-	unsigned long		os_status;		/* OS status to SAL, enum below */
-	unsigned long		context;		/* 0 if return to same context
-							   1 if return to new context */
-
-	/* I-resources */
-	unsigned long		iip;
-	unsigned long		ipsr;
-	unsigned long		ifs;
-};
-
-enum {
-	IA64_MCA_CORRECTED	=	0x0,	/* Error has been corrected by OS_MCA */
-	IA64_MCA_WARM_BOOT	=	-1,	/* Warm boot of the system need from SAL */
-	IA64_MCA_COLD_BOOT	=	-2,	/* Cold boot of the system need from SAL */
-	IA64_MCA_HALT		=	-3	/* System to be halted by SAL */
-};
-
-enum {
-	IA64_INIT_RESUME	=	0x0,	/* Resume after return from INIT */
-	IA64_INIT_WARM_BOOT	=	-1,	/* Warm boot of the system need from SAL */
-};
-
-enum {
-	IA64_MCA_SAME_CONTEXT	=	0x0,	/* SAL to return to same context */
-	IA64_MCA_NEW_CONTEXT	=	-1	/* SAL to return to new context */
-};
-
-/* Per-CPU MCA state that is too big for normal per-CPU variables.  */
-
-struct ia64_mca_cpu {
-	u64 mca_stack[KERNEL_STACK_SIZE/8];
-	u64 init_stack[KERNEL_STACK_SIZE/8];
-};
-
-/* Array of physical addresses of each CPU's MCA area.  */
-extern unsigned long __per_cpu_mca[NR_CPUS];
-
-extern int cpe_vector;
-extern int ia64_cpe_irq;
-extern void ia64_mca_init(void);
-extern void ia64_mca_irq_init(void);
-extern void ia64_mca_cpu_init(void *);
-extern void ia64_os_mca_dispatch(void);
-extern void ia64_os_mca_dispatch_end(void);
-extern void ia64_mca_ucmc_handler(struct pt_regs *, struct ia64_sal_os_state *);
-extern void ia64_init_handler(struct pt_regs *,
-			      struct switch_stack *,
-			      struct ia64_sal_os_state *);
-extern void ia64_os_init_on_kdump(void);
-extern void ia64_monarch_init_handler(void);
-extern void ia64_slave_init_handler(void);
-extern void ia64_mca_cmc_vector_setup(void);
-extern int  ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
-extern void ia64_unreg_MCA_extension(void);
-extern unsigned long ia64_get_rnat(unsigned long *);
-extern void ia64_set_psr_mc(void);
-extern void ia64_mca_printk(const char * fmt, ...)
-	 __attribute__ ((format (printf, 1, 2)));
-
-struct ia64_mca_notify_die {
-	struct ia64_sal_os_state *sos;
-	int *monarch_cpu;
-	int *data;
-};
-
-DECLARE_PER_CPU(u64, ia64_mca_pal_base);
-
-#else	/* __ASSEMBLY__ */
-
-#define IA64_MCA_CORRECTED	0x0	/* Error has been corrected by OS_MCA */
-#define IA64_MCA_WARM_BOOT	-1	/* Warm boot of the system need from SAL */
-#define IA64_MCA_COLD_BOOT	-2	/* Cold boot of the system need from SAL */
-#define IA64_MCA_HALT		-3	/* System to be halted by SAL */
-
-#define IA64_INIT_RESUME	0x0	/* Resume after return from INIT */
-#define IA64_INIT_WARM_BOOT	-1	/* Warm boot of the system need from SAL */
-
-#define IA64_MCA_SAME_CONTEXT	0x0	/* SAL to return to same context */
-#define IA64_MCA_NEW_CONTEXT	-1	/* SAL to return to new context */
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_MCA_H */
diff --git a/arch/ia64/include/asm/mca_asm.h b/arch/ia64/include/asm/mca_asm.h
deleted file mode 100644
index e3ab1f41f1c3..000000000000
--- a/arch/ia64/include/asm/mca_asm.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca_asm.h
- * Purpose:	Machine check handling specific defines
- *
- * Copyright (C) 1999 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
- * Copyright (C) Srinivasa Thirumalachar <sprasad@engr.sgi.com>
- * Copyright (C) 2000 Hewlett-Packard Co.
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Jenna Hall <jenna.s.hall@intel.com>
- * Copyright (C) 2005 Silicon Graphics, Inc
- * Copyright (C) 2005 Keith Owens <kaos@sgi.com>
- */
-#ifndef _ASM_IA64_MCA_ASM_H
-#define _ASM_IA64_MCA_ASM_H
-
-#include <asm/percpu.h>
-
-#define PSR_IC		13
-#define PSR_I		14
-#define	PSR_DT		17
-#define PSR_RT		27
-#define PSR_MC		35
-#define PSR_IT		36
-#define PSR_BN		44
-
-/*
- * This macro converts a instruction virtual address to a physical address
- * Right now for simulation purposes the virtual addresses are
- * direct mapped to physical addresses.
- *	1. Lop off bits 61 thru 63 in the virtual address
- */
-#define INST_VA_TO_PA(addr)							\
-	dep	addr	= 0, addr, 61, 3
-/*
- * This macro converts a data virtual address to a physical address
- * Right now for simulation purposes the virtual addresses are
- * direct mapped to physical addresses.
- *	1. Lop off bits 61 thru 63 in the virtual address
- */
-#define DATA_VA_TO_PA(addr)							\
-	tpa	addr	= addr
-/*
- * This macro converts a data physical address to a virtual address
- * Right now for simulation purposes the virtual addresses are
- * direct mapped to physical addresses.
- *	1. Put 0x7 in bits 61 thru 63.
- */
-#define DATA_PA_TO_VA(addr,temp)							\
-	mov	temp	= 0x7	;;							\
-	dep	addr	= temp, addr, 61, 3
-
-#define GET_THIS_PADDR(reg, var)		\
-	mov	reg = IA64_KR(PER_CPU_DATA);;	\
-        addl	reg = THIS_CPU(var), reg
-
-/*
- * This macro jumps to the instruction at the given virtual address
- * and starts execution in physical mode with all the address
- * translations turned off.
- *	1.	Save the current psr
- *	2.	Make sure that all the upper 32 bits are off
- *
- *	3.	Clear the interrupt enable and interrupt state collection bits
- *		in the psr before updating the ipsr and iip.
- *
- *	4.	Turn off the instruction, data and rse translation bits of the psr
- *		and store the new value into ipsr
- *		Also make sure that the interrupts are disabled.
- *		Ensure that we are in little endian mode.
- *		[psr.{rt, it, dt, i, be} = 0]
- *
- *	5.	Get the physical address corresponding to the virtual address
- *		of the next instruction bundle and put it in iip.
- *		(Using magic numbers 24 and 40 in the deposint instruction since
- *		 the IA64_SDK code directly maps to lower 24bits as physical address
- *		 from a virtual address).
- *
- *	6.	Do an rfi to move the values from ipsr to psr and iip to ip.
- */
-#define  PHYSICAL_MODE_ENTER(temp1, temp2, start_addr, old_psr)				\
-	mov	old_psr = psr;								\
-	;;										\
-	dep	old_psr = 0, old_psr, 32, 32;						\
-											\
-	mov	ar.rsc = 0 ;								\
-	;;										\
-	srlz.d;										\
-	mov	temp2 = ar.bspstore;							\
-	;;										\
-	DATA_VA_TO_PA(temp2);								\
-	;;										\
-	mov	temp1 = ar.rnat;							\
-	;;										\
-	mov	ar.bspstore = temp2;							\
-	;;										\
-	mov	ar.rnat = temp1;							\
-	mov	temp1 = psr;								\
-	mov	temp2 = psr;								\
-	;;										\
-											\
-	dep	temp2 = 0, temp2, PSR_IC, 2;						\
-	;;										\
-	mov	psr.l = temp2;								\
-	;;										\
-	srlz.d;										\
-	dep	temp1 = 0, temp1, 32, 32;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_IT, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_DT, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_RT, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_I, 1;						\
-	;;										\
-	dep	temp1 = 0, temp1, PSR_IC, 1;						\
-	;;										\
-	dep	temp1 = -1, temp1, PSR_MC, 1;						\
-	;;										\
-	mov	cr.ipsr = temp1;							\
-	;;										\
-	LOAD_PHYSICAL(p0, temp2, start_addr);						\
-	;;										\
-	mov	cr.iip = temp2;								\
-	mov	cr.ifs = r0;								\
-	DATA_VA_TO_PA(sp);								\
-	DATA_VA_TO_PA(gp);								\
-	;;										\
-	srlz.i;										\
-	;;										\
-	nop	1;									\
-	nop	2;									\
-	nop	1;									\
-	nop	2;									\
-	rfi;										\
-	;;
-
-/*
- * This macro jumps to the instruction at the given virtual address
- * and starts execution in virtual mode with all the address
- * translations turned on.
- *	1.	Get the old saved psr
- *
- *	2.	Clear the interrupt state collection bit in the current psr.
- *
- *	3.	Set the instruction translation bit back in the old psr
- *		Note we have to do this since we are right now saving only the
- *		lower 32-bits of old psr.(Also the old psr has the data and
- *		rse translation bits on)
- *
- *	4.	Set ipsr to this old_psr with "it" bit set and "bn" = 1.
- *
- *	5.	Reset the current thread pointer (r13).
- *
- *	6.	Set iip to the virtual address of the next instruction bundle.
- *
- *	7.	Do an rfi to move ipsr to psr and iip to ip.
- */
-
-#define VIRTUAL_MODE_ENTER(temp1, temp2, start_addr, old_psr)	\
-	mov	temp2 = psr;					\
-	;;							\
-	mov	old_psr = temp2;				\
-	;;							\
-	dep	temp2 = 0, temp2, PSR_IC, 2;			\
-	;;							\
-	mov	psr.l = temp2;					\
-	mov	ar.rsc = 0;					\
-	;;							\
-	srlz.d;							\
-	mov	r13 = ar.k6;					\
-	mov	temp2 = ar.bspstore;				\
-	;;							\
-	DATA_PA_TO_VA(temp2,temp1);				\
-	;;							\
-	mov	temp1 = ar.rnat;				\
-	;;							\
-	mov	ar.bspstore = temp2;				\
-	;;							\
-	mov	ar.rnat = temp1;				\
-	;;							\
-	mov	temp1 = old_psr;				\
-	;;							\
-	mov	temp2 = 1;					\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_IC, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_IT, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_DT, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_RT, 1;		\
-	;;							\
-	dep	temp1 = temp2, temp1, PSR_BN, 1;		\
-	;;							\
-								\
-	mov     cr.ipsr = temp1;				\
-	movl	temp2 = start_addr;				\
-	;;							\
-	mov	cr.iip = temp2;					\
-	movl	gp = __gp					\
-	;;							\
-	DATA_PA_TO_VA(sp, temp1);				\
-	srlz.i;							\
-	;;							\
-	nop	1;						\
-	nop	2;						\
-	nop	1;						\
-	rfi							\
-	;;
-
-/*
- * The MCA and INIT stacks in struct ia64_mca_cpu look like normal kernel
- * stacks, except that the SAL/OS state and a switch_stack are stored near the
- * top of the MCA/INIT stack.  To support concurrent entry to MCA or INIT, as
- * well as MCA over INIT, each event needs its own SAL/OS state.  All entries
- * are 16 byte aligned.
- *
- *      +---------------------------+
- *      |          pt_regs          |
- *      +---------------------------+
- *      |        switch_stack       |
- *      +---------------------------+
- *      |        SAL/OS state       |
- *      +---------------------------+
- *      |    16 byte scratch area   |
- *      +---------------------------+ <-------- SP at start of C MCA handler
- *      |           .....           |
- *      +---------------------------+
- *      | RBS for MCA/INIT handler  |
- *      +---------------------------+
- *      | struct task for MCA/INIT  |
- *      +---------------------------+ <-------- Bottom of MCA/INIT stack
- */
-
-#define ALIGN16(x)			((x)&~15)
-#define MCA_PT_REGS_OFFSET		ALIGN16(KERNEL_STACK_SIZE-IA64_PT_REGS_SIZE)
-#define MCA_SWITCH_STACK_OFFSET		ALIGN16(MCA_PT_REGS_OFFSET-IA64_SWITCH_STACK_SIZE)
-#define MCA_SOS_OFFSET			ALIGN16(MCA_SWITCH_STACK_OFFSET-IA64_SAL_OS_STATE_SIZE)
-#define MCA_SP_OFFSET			ALIGN16(MCA_SOS_OFFSET-16)
-
-#endif /* _ASM_IA64_MCA_ASM_H */
diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h
deleted file mode 100644
index f1d5bf2ba847..000000000000
--- a/arch/ia64/include/asm/meminit.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef meminit_h
-#define meminit_h
-
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-
-/*
- * Entries defined so far:
- * 	- boot param structure itself
- * 	- memory map
- * 	- initrd (optional)
- * 	- command line string
- * 	- kernel code & data
- * 	- crash dumping code reserved region
- * 	- Kernel memory map built from EFI memory map
- * 	- ELF core header
- *
- * More could be added if necessary
- */
-#define IA64_MAX_RSVD_REGIONS 9
-
-struct rsvd_region {
-	u64 start;	/* virtual address of beginning of element */
-	u64 end;	/* virtual address of end of element + 1 */
-};
-
-extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
-
-extern void find_memory (void);
-extern void reserve_memory (void);
-extern void find_initrd (void);
-extern int filter_rsvd_memory (u64 start, u64 end, void *arg);
-extern int filter_memory (u64 start, u64 end, void *arg);
-extern unsigned long efi_memmap_init(u64 *s, u64 *e);
-extern int find_max_min_low_pfn (u64, u64, void *);
-
-extern unsigned long vmcore_find_descriptor_size(unsigned long address);
-
-/*
- * For rounding an address to the next IA64_GRANULE_SIZE or order
- */
-#define GRANULEROUNDDOWN(n)	((n) & ~(IA64_GRANULE_SIZE-1))
-#define GRANULEROUNDUP(n)	(((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1))
-
-#ifdef CONFIG_NUMA
-  extern void call_pernode_memory (unsigned long start, unsigned long len, void *func);
-#else
-# define call_pernode_memory(start, len, func)	(*func)(start, len, 0)
-#endif
-
-#define IGNORE_PFN0	1	/* XXX fix me: ignore pfn 0 until TLB miss handler is updated... */
-
-extern int register_active_ranges(u64 start, u64 len, int nid);
-
-#endif /* meminit_h */
diff --git a/arch/ia64/include/asm/mman.h b/arch/ia64/include/asm/mman.h
deleted file mode 100644
index 15cf100add0e..000000000000
--- a/arch/ia64/include/asm/mman.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Based on <asm-i386/mman.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _ASM_IA64_MMAN_H
-#define _ASM_IA64_MMAN_H
-
-#include <uapi/asm/mman.h>
-
-#ifndef __ASSEMBLY__
-#define arch_mmap_check	ia64_mmap_check
-int ia64_mmap_check(unsigned long addr, unsigned long len,
-		unsigned long flags);
-#endif
-#endif /* _ASM_IA64_MMAN_H */
diff --git a/arch/ia64/include/asm/mmiowb.h b/arch/ia64/include/asm/mmiowb.h
deleted file mode 100644
index d67aab4ea3b4..000000000000
--- a/arch/ia64/include/asm/mmiowb.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _ASM_IA64_MMIOWB_H
-#define _ASM_IA64_MMIOWB_H
-
-/**
- * mmiowb - I/O write barrier
- *
- * Ensure ordering of I/O space writes.  This will make sure that writes
- * following the barrier will arrive after all previous writes.  For most
- * ia64 platforms, this is a simple 'mf.a' instruction.
- */
-#define mmiowb()	ia64_mfa()
-
-#include <asm-generic/mmiowb.h>
-
-#endif	/* _ASM_IA64_MMIOWB_H */
diff --git a/arch/ia64/include/asm/mmu.h b/arch/ia64/include/asm/mmu.h
deleted file mode 100644
index f75f44f531c2..000000000000
--- a/arch/ia64/include/asm/mmu.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __MMU_H
-#define __MMU_H
-
-/*
- * Type for a context number.  We declare it volatile to ensure proper
- * ordering when it's accessed outside of spinlock'd critical sections
- * (e.g., as done in activate_mm() and init_new_context()).
- */
-typedef volatile unsigned long mm_context_t;
-
-typedef unsigned long nv_mm_context_t;
-
-#endif
diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
deleted file mode 100644
index 06257e355d00..000000000000
--- a/arch/ia64/include/asm/mmu_context.h
+++ /dev/null
@@ -1,194 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_MMU_CONTEXT_H
-#define _ASM_IA64_MMU_CONTEXT_H
-
-/*
- * Copyright (C) 1998-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * Routines to manage the allocation of task context numbers.  Task context
- * numbers are used to reduce or eliminate the need to perform TLB flushes
- * due to context switches.  Context numbers are implemented using ia-64
- * region ids.  Since the IA-64 TLB does not consider the region number when
- * performing a TLB lookup, we need to assign a unique region id to each
- * region in a process.  We use the least significant three bits in aregion
- * id for this purpose.
- */
-
-#define IA64_REGION_ID_KERNEL	0 /* the kernel's region id (tlb.c depends on this being 0) */
-
-#define ia64_rid(ctx,addr)	(((ctx) << 3) | (addr >> 61))
-
-# include <asm/page.h>
-# ifndef __ASSEMBLY__
-
-#include <linux/compiler.h>
-#include <linux/percpu.h>
-#include <linux/sched.h>
-#include <linux/mm_types.h>
-#include <linux/spinlock.h>
-
-#include <asm/processor.h>
-#include <asm-generic/mm_hooks.h>
-
-struct ia64_ctx {
-	spinlock_t lock;
-	unsigned int next;	/* next context number to use */
-	unsigned int limit;     /* available free range */
-	unsigned int max_ctx;   /* max. context value supported by all CPUs */
-				/* call wrap_mmu_context when next >= max */
-	unsigned long *bitmap;  /* bitmap size is max_ctx+1 */
-	unsigned long *flushmap;/* pending rid to be flushed */
-};
-
-extern struct ia64_ctx ia64_ctx;
-DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
-
-extern void mmu_context_init (void);
-extern void wrap_mmu_context (struct mm_struct *mm);
-
-/*
- * When the context counter wraps around all TLBs need to be flushed because
- * an old context number might have been reused. This is signalled by the
- * ia64_need_tlb_flush per-CPU variable, which is checked in the routine
- * below. Called by activate_mm(). <efocht@ess.nec.de>
- */
-static inline void
-delayed_tlb_flush (void)
-{
-	extern void local_flush_tlb_all (void);
-	unsigned long flags;
-
-	if (unlikely(__ia64_per_cpu_var(ia64_need_tlb_flush))) {
-		spin_lock_irqsave(&ia64_ctx.lock, flags);
-		if (__ia64_per_cpu_var(ia64_need_tlb_flush)) {
-			local_flush_tlb_all();
-			__ia64_per_cpu_var(ia64_need_tlb_flush) = 0;
-		}
-		spin_unlock_irqrestore(&ia64_ctx.lock, flags);
-	}
-}
-
-static inline nv_mm_context_t
-get_mmu_context (struct mm_struct *mm)
-{
-	unsigned long flags;
-	nv_mm_context_t context = mm->context;
-
-	if (likely(context))
-		goto out;
-
-	spin_lock_irqsave(&ia64_ctx.lock, flags);
-	/* re-check, now that we've got the lock: */
-	context = mm->context;
-	if (context == 0) {
-		cpumask_clear(mm_cpumask(mm));
-		if (ia64_ctx.next >= ia64_ctx.limit) {
-			ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
-					ia64_ctx.max_ctx, ia64_ctx.next);
-			ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap,
-					ia64_ctx.max_ctx, ia64_ctx.next);
-			if (ia64_ctx.next >= ia64_ctx.max_ctx)
-				wrap_mmu_context(mm);
-		}
-		mm->context = context = ia64_ctx.next++;
-		__set_bit(context, ia64_ctx.bitmap);
-	}
-	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
-out:
-	/*
-	 * Ensure we're not starting to use "context" before any old
-	 * uses of it are gone from our TLB.
-	 */
-	delayed_tlb_flush();
-
-	return context;
-}
-
-/*
- * Initialize context number to some sane value.  MM is guaranteed to be a
- * brand-new address-space, so no TLB flushing is needed, ever.
- */
-#define init_new_context init_new_context
-static inline int
-init_new_context (struct task_struct *p, struct mm_struct *mm)
-{
-	mm->context = 0;
-	return 0;
-}
-
-static inline void
-reload_context (nv_mm_context_t context)
-{
-	unsigned long rid;
-	unsigned long rid_incr = 0;
-	unsigned long rr0, rr1, rr2, rr3, rr4;
-
-#ifdef CONFIG_HUGETLB_PAGE
-	unsigned long old_rr4;
-	old_rr4 = ia64_get_rr(RGN_BASE(RGN_HPAGE));
-#endif
-	rid = context << 3;	/* make space for encoding the region number */
-	rid_incr = 1 << 8;
-
-	/* encode the region id, preferred page size, and VHPT enable bit: */
-	rr0 = (rid << 8) | (PAGE_SHIFT << 2) | 1;
-	rr1 = rr0 + 1*rid_incr;
-	rr2 = rr0 + 2*rid_incr;
-	rr3 = rr0 + 3*rid_incr;
-	rr4 = rr0 + 4*rid_incr;
-#ifdef  CONFIG_HUGETLB_PAGE
-	rr4 = (rr4 & (~(0xfcUL))) | (old_rr4 & 0xfc);
-
-#  if RGN_HPAGE != 4
-#    error "reload_context assumes RGN_HPAGE is 4"
-#  endif
-#endif
-
-	ia64_set_rr0_to_rr4(rr0, rr1, rr2, rr3, rr4);
-	ia64_srlz_i();			/* srlz.i implies srlz.d */
-}
-
-/*
- * Must be called with preemption off
- */
-static inline void
-activate_context (struct mm_struct *mm)
-{
-	nv_mm_context_t context;
-
-	do {
-		context = get_mmu_context(mm);
-		if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
-			cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-		reload_context(context);
-		/*
-		 * in the unlikely event of a TLB-flush by another thread,
-		 * redo the load.
-		 */
-	} while (unlikely(context != mm->context));
-}
-
-/*
- * Switch from address space PREV to address space NEXT.
- */
-#define activate_mm activate_mm
-static inline void
-activate_mm (struct mm_struct *prev, struct mm_struct *next)
-{
-	/*
-	 * We may get interrupts here, but that's OK because interrupt
-	 * handlers cannot touch user-space.
-	 */
-	ia64_set_kr(IA64_KR_PT_BASE, __pa(next->pgd));
-	activate_context(next);
-}
-
-#define switch_mm(prev_mm,next_mm,next_task)	activate_mm(prev_mm, next_mm)
-
-#include <asm-generic/mmu_context.h>
-
-# endif /* ! __ASSEMBLY__ */
-#endif /* _ASM_IA64_MMU_CONTEXT_H */
diff --git a/arch/ia64/include/asm/mmzone.h b/arch/ia64/include/asm/mmzone.h
deleted file mode 100644
index 767201f66c93..000000000000
--- a/arch/ia64/include/asm/mmzone.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2000,2003 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2002 NEC Corp.
- * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
- * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
- */
-#ifndef _ASM_IA64_MMZONE_H
-#define _ASM_IA64_MMZONE_H
-
-#include <linux/numa.h>
-#include <asm/page.h>
-#include <asm/meminit.h>
-
-#ifdef CONFIG_NUMA
-
-static inline int pfn_to_nid(unsigned long pfn)
-{
-	extern int paddr_to_nid(unsigned long);
-	int nid = paddr_to_nid(pfn << PAGE_SHIFT);
-	if (nid < 0)
-		return 0;
-	else
-		return nid;
-}
-
-#define MAX_PHYSNODE_ID		2048
-#endif /* CONFIG_NUMA */
-
-#define NR_NODE_MEMBLKS		(MAX_NUMNODES * 4)
-
-#endif /* _ASM_IA64_MMZONE_H */
diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h
deleted file mode 100644
index 7271b9c5fc76..000000000000
--- a/arch/ia64/include/asm/module.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_MODULE_H
-#define _ASM_IA64_MODULE_H
-
-#include <asm-generic/module.h>
-
-/*
- * IA-64-specific support for kernel module loader.
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-struct elf64_shdr;			/* forward declration */
-
-struct mod_arch_specific {
-	/* Used only at module load time. */
-	struct elf64_shdr *core_plt;	/* core PLT section */
-	struct elf64_shdr *init_plt;	/* init PLT section */
-	struct elf64_shdr *got;		/* global offset table */
-	struct elf64_shdr *opd;		/* official procedure descriptors */
-	struct elf64_shdr *unwind;	/* unwind-table section */
-	unsigned long gp;		/* global-pointer for module */
-	unsigned int next_got_entry;	/* index of next available got entry */
-
-	/* Used at module run and cleanup time. */
-	void *core_unw_table;		/* core unwind-table cookie returned by unwinder */
-	void *init_unw_table;		/* init unwind-table cookie returned by unwinder */
-	void *opd_addr;			/* symbolize uses .opd to get to actual function */
-	unsigned long opd_size;
-};
-
-#define ARCH_SHF_SMALL	SHF_IA_64_SHORT
-
-#endif /* _ASM_IA64_MODULE_H */
diff --git a/arch/ia64/include/asm/module.lds.h b/arch/ia64/include/asm/module.lds.h
deleted file mode 100644
index eff68f362793..000000000000
--- a/arch/ia64/include/asm/module.lds.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-SECTIONS {
-	/* Group unwind sections into a single section: */
-	.IA_64.unwind_info : { *(.IA_64.unwind_info*) }
-	.IA_64.unwind : { *(.IA_64.unwind*) }
-	/*
-	 * Create place-holder sections to hold the PLTs, GOT, and
-	 * official procedure-descriptors (.opd).
-	 */
-	.core.plt : { BYTE(0) }
-	.init.plt : { BYTE(0) }
-	.got : { BYTE(0) }
-	.opd : { BYTE(0) }
-}
diff --git a/arch/ia64/include/asm/msidef.h b/arch/ia64/include/asm/msidef.h
deleted file mode 100644
index 18d0e4226748..000000000000
--- a/arch/ia64/include/asm/msidef.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _IA64_MSI_DEF_H
-#define _IA64_MSI_DEF_H
-
-/*
- * Shifts for APIC-based data
- */
-
-#define     MSI_DATA_VECTOR_SHIFT	0
-#define	    MSI_DATA_VECTOR(v)		(((u8)v) << MSI_DATA_VECTOR_SHIFT)
-#define     MSI_DATA_VECTOR_MASK	0xffffff00
-
-#define     MSI_DATA_DELIVERY_MODE_SHIFT	8
-#define     MSI_DATA_DELIVERY_FIXED	(0 << MSI_DATA_DELIVERY_MODE_SHIFT)
-#define     MSI_DATA_DELIVERY_LOWPRI	(1 << MSI_DATA_DELIVERY_MODE_SHIFT)
-
-#define     MSI_DATA_LEVEL_SHIFT	14
-#define     MSI_DATA_LEVEL_DEASSERT	(0 << MSI_DATA_LEVEL_SHIFT)
-#define     MSI_DATA_LEVEL_ASSERT	(1 << MSI_DATA_LEVEL_SHIFT)
-
-#define     MSI_DATA_TRIGGER_SHIFT	15
-#define     MSI_DATA_TRIGGER_EDGE	(0 << MSI_DATA_TRIGGER_SHIFT)
-#define     MSI_DATA_TRIGGER_LEVEL	(1 << MSI_DATA_TRIGGER_SHIFT)
-
-/*
- * Shift/mask fields for APIC-based bus address
- */
-
-#define     MSI_ADDR_DEST_ID_SHIFT	4
-#define     MSI_ADDR_HEADER		0xfee00000
-
-#define     MSI_ADDR_DEST_ID_MASK	0xfff0000f
-#define     MSI_ADDR_DEST_ID_CPU(cpu)	((cpu) << MSI_ADDR_DEST_ID_SHIFT)
-
-#define     MSI_ADDR_DEST_MODE_SHIFT	2
-#define     MSI_ADDR_DEST_MODE_PHYS	(0 << MSI_ADDR_DEST_MODE_SHIFT)
-#define	    MSI_ADDR_DEST_MODE_LOGIC	(1 << MSI_ADDR_DEST_MODE_SHIFT)
-
-#define     MSI_ADDR_REDIRECTION_SHIFT	3
-#define     MSI_ADDR_REDIRECTION_CPU	(0 << MSI_ADDR_REDIRECTION_SHIFT)
-#define     MSI_ADDR_REDIRECTION_LOWPRI	(1 << MSI_ADDR_REDIRECTION_SHIFT)
-
-#endif/* _IA64_MSI_DEF_H */
diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
deleted file mode 100644
index e08662396029..000000000000
--- a/arch/ia64/include/asm/native/inst.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/******************************************************************************
- * arch/ia64/include/asm/native/inst.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- */
-
-#define DO_SAVE_MIN		IA64_NATIVE_DO_SAVE_MIN
-
-#define MOV_FROM_IFA(reg)	\
-	mov reg = cr.ifa
-
-#define MOV_FROM_ITIR(reg)	\
-	mov reg = cr.itir
-
-#define MOV_FROM_ISR(reg)	\
-	mov reg = cr.isr
-
-#define MOV_FROM_IHA(reg)	\
-	mov reg = cr.iha
-
-#define MOV_FROM_IPSR(pred, reg)	\
-(pred)	mov reg = cr.ipsr
-
-#define MOV_FROM_IIM(reg)	\
-	mov reg = cr.iim
-
-#define MOV_FROM_IIP(reg)	\
-	mov reg = cr.iip
-
-#define MOV_FROM_IVR(reg, clob)	\
-	mov reg = cr.ivr
-
-#define MOV_FROM_PSR(pred, reg, clob)	\
-(pred)	mov reg = psr
-
-#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
-(pred)	mov reg = ar.itc
-
-#define MOV_TO_IFA(reg, clob)	\
-	mov cr.ifa = reg
-
-#define MOV_TO_ITIR(pred, reg, clob)	\
-(pred)	mov cr.itir = reg
-
-#define MOV_TO_IHA(pred, reg, clob)	\
-(pred)	mov cr.iha = reg
-
-#define MOV_TO_IPSR(pred, reg, clob)		\
-(pred)	mov cr.ipsr = reg
-
-#define MOV_TO_IFS(pred, reg, clob)	\
-(pred)	mov cr.ifs = reg
-
-#define MOV_TO_IIP(reg, clob)	\
-	mov cr.iip = reg
-
-#define MOV_TO_KR(kr, reg, clob0, clob1)	\
-	mov IA64_KR(kr) = reg
-
-#define ITC_I(pred, reg, clob)	\
-(pred)	itc.i reg
-
-#define ITC_D(pred, reg, clob)	\
-(pred)	itc.d reg
-
-#define ITC_I_AND_D(pred_i, pred_d, reg, clob)	\
-(pred_i) itc.i reg;				\
-(pred_d) itc.d reg
-
-#define THASH(pred, reg0, reg1, clob)		\
-(pred)	thash reg0 = reg1
-
-#define SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(clob0, clob1)		\
-	ssm psr.ic | PSR_DEFAULT_BITS					\
-	;;								\
-	srlz.i /* guarantee that interruption collectin is on */	\
-	;;
-
-#define SSM_PSR_IC_AND_SRLZ_D(clob0, clob1)	\
-	ssm psr.ic				\
-	;;					\
-	srlz.d
-
-#define RSM_PSR_IC(clob)	\
-	rsm psr.ic
-
-#define SSM_PSR_I(pred, pred_clob, clob)	\
-(pred)	ssm psr.i
-
-#define RSM_PSR_I(pred, clob0, clob1)	\
-(pred)	rsm psr.i
-
-#define RSM_PSR_I_IC(clob0, clob1, clob2)	\
-	rsm psr.i | psr.ic
-
-#define RSM_PSR_DT		\
-	rsm psr.dt
-
-#define RSM_PSR_BE_I(clob0, clob1)	\
-	rsm psr.be | psr.i
-
-#define SSM_PSR_DT_AND_SRLZ_I	\
-	ssm psr.dt		\
-	;;			\
-	srlz.i
-
-#define BSW_0(clob0, clob1, clob2)	\
-	bsw.0
-
-#define BSW_1(clob0, clob1)	\
-	bsw.1
-
-#define COVER	\
-	cover
-
-#define RFI	\
-	rfi
diff --git a/arch/ia64/include/asm/native/irq.h b/arch/ia64/include/asm/native/irq.h
deleted file mode 100644
index aa74915f8aa2..000000000000
--- a/arch/ia64/include/asm/native/irq.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/******************************************************************************
- * arch/ia64/include/asm/native/irq.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- */
-
-#ifndef _ASM_IA64_NATIVE_IRQ_H
-#define _ASM_IA64_NATIVE_IRQ_H
-
-#define NR_VECTORS	256
-
-#if (NR_VECTORS + 32 * NR_CPUS) < 1024
-#define IA64_NATIVE_NR_IRQS (NR_VECTORS + 32 * NR_CPUS)
-#else
-#define IA64_NATIVE_NR_IRQS 1024
-#endif
-
-#endif /* _ASM_IA64_NATIVE_IRQ_H */
diff --git a/arch/ia64/include/asm/native/patchlist.h b/arch/ia64/include/asm/native/patchlist.h
deleted file mode 100644
index f13e7675758c..000000000000
--- a/arch/ia64/include/asm/native/patchlist.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/******************************************************************************
- * arch/ia64/include/asm/native/inst.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- */
-
-#define __paravirt_start_gate_fsyscall_patchlist		\
-	__ia64_native_start_gate_fsyscall_patchlist
-#define __paravirt_end_gate_fsyscall_patchlist			\
-	__ia64_native_end_gate_fsyscall_patchlist
-#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist	\
-	__ia64_native_start_gate_brl_fsys_bubble_down_patchlist
-#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist	\
-	__ia64_native_end_gate_brl_fsys_bubble_down_patchlist
-#define __paravirt_start_gate_vtop_patchlist			\
-	__ia64_native_start_gate_vtop_patchlist
-#define __paravirt_end_gate_vtop_patchlist			\
-	__ia64_native_end_gate_vtop_patchlist
-#define __paravirt_start_gate_mckinley_e9_patchlist		\
-	__ia64_native_start_gate_mckinley_e9_patchlist
-#define __paravirt_end_gate_mckinley_e9_patchlist		\
-	__ia64_native_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/nodedata.h b/arch/ia64/include/asm/nodedata.h
deleted file mode 100644
index 2fb337b0e9b7..000000000000
--- a/arch/ia64/include/asm/nodedata.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2000 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2002 NEC Corp.
- * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
- * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
- */
-#ifndef _ASM_IA64_NODEDATA_H
-#define _ASM_IA64_NODEDATA_H
-
-#include <linux/numa.h>
-
-#include <asm/percpu.h>
-#include <asm/mmzone.h>
-
-#ifdef CONFIG_NUMA
-
-/*
- * Node Data. One of these structures is located on each node of a NUMA system.
- */
-
-struct pglist_data;
-struct ia64_node_data {
-	short			active_cpu_count;
-	short			node;
-	struct pglist_data	*pg_data_ptrs[MAX_NUMNODES];
-};
-
-
-/*
- * Return a pointer to the node_data structure for the executing cpu.
- */
-#define local_node_data		(local_cpu_data->node_data)
-
-/*
- * Given a node id, return a pointer to the pg_data_t for the node.
- *
- * NODE_DATA 	- should be used in all code not related to system
- *		  initialization. It uses pernode data structures to minimize
- *		  offnode memory references. However, these structure are not 
- *		  present during boot. This macro can be used once cpu_init
- *		  completes.
- */
-#define NODE_DATA(nid)		(local_node_data->pg_data_ptrs[nid])
-
-/*
- * LOCAL_DATA_ADDR - This is to calculate the address of other node's
- *		     "local_node_data" at hot-plug phase. The local_node_data
- *		     is pointed by per_cpu_page. Kernel usually use it for
- *		     just executing cpu. However, when new node is hot-added,
- *		     the addresses of local data for other nodes are necessary
- *		     to update all of them.
- */
-#define LOCAL_DATA_ADDR(pgdat)  			\
-	((struct ia64_node_data *)((u64)(pgdat) + 	\
-				   L1_CACHE_ALIGN(sizeof(struct pglist_data))))
-
-#endif /* CONFIG_NUMA */
-
-#endif /* _ASM_IA64_NODEDATA_H */
diff --git a/arch/ia64/include/asm/numa.h b/arch/ia64/include/asm/numa.h
deleted file mode 100644
index c5c253cb9bd6..000000000000
--- a/arch/ia64/include/asm/numa.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * This file contains NUMA specific prototypes and definitions.
- *
- * 2002/08/05 Erich Focht <efocht@ess.nec.de>
- *
- */
-#ifndef _ASM_IA64_NUMA_H
-#define _ASM_IA64_NUMA_H
-
-
-#ifdef CONFIG_NUMA
-
-#include <linux/cache.h>
-#include <linux/cpumask.h>
-#include <linux/numa.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-
-#include <asm/mmzone.h>
-
-extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
-extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
-extern pg_data_t *pgdat_list[MAX_NUMNODES];
-
-/* Stuff below this line could be architecture independent */
-
-extern int num_node_memblks;		/* total number of memory chunks */
-
-/*
- * List of node memory chunks. Filled when parsing SRAT table to
- * obtain information about memory nodes.
-*/
-
-struct node_memblk_s {
-	unsigned long start_paddr;
-	unsigned long size;
-	int nid;		/* which logical node contains this chunk? */
-	int bank;		/* which mem bank on this node */
-};
-
-struct node_cpuid_s {
-	u16	phys_id;	/* id << 8 | eid */
-	int	nid;		/* logical node containing this CPU */
-};
-
-extern struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
-extern struct node_cpuid_s node_cpuid[NR_CPUS];
-
-/*
- * ACPI 2.0 SLIT (System Locality Information Table)
- * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
- *
- * This is a matrix with "distances" between nodes, they should be
- * proportional to the memory access latency ratios.
- */
-
-extern u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
-#define slit_distance(from,to) (numa_slit[(from) * MAX_NUMNODES + (to)])
-extern int __node_distance(int from, int to);
-#define node_distance(from,to) __node_distance(from, to)
-
-extern int paddr_to_nid(unsigned long paddr);
-
-#define local_nodeid (cpu_to_node_map[smp_processor_id()])
-
-#define numa_off     0
-
-extern void map_cpu_to_node(int cpu, int nid);
-extern void unmap_cpu_from_node(int cpu, int nid);
-extern void numa_clear_node(int cpu);
-
-#else /* !CONFIG_NUMA */
-#define map_cpu_to_node(cpu, nid)	do{}while(0)
-#define unmap_cpu_from_node(cpu, nid)	do{}while(0)
-#define paddr_to_nid(addr)	0
-#define numa_clear_node(cpu)	do { } while (0)
-#endif /* CONFIG_NUMA */
-
-#endif /* _ASM_IA64_NUMA_H */
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
deleted file mode 100644
index 310b09c3342d..000000000000
--- a/arch/ia64/include/asm/page.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PAGE_H
-#define _ASM_IA64_PAGE_H
-/*
- * Pagetable related stuff.
- *
- * Copyright (C) 1998, 1999, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/intrinsics.h>
-#include <asm/types.h>
-
-/*
- * The top three bits of an IA64 address are its Region Number.
- * Different regions are assigned to different purposes.
- */
-#define RGN_SHIFT	(61)
-#define RGN_BASE(r)	(__IA64_UL_CONST(r)<<RGN_SHIFT)
-#define RGN_BITS	(RGN_BASE(-1))
-
-#define RGN_KERNEL	7	/* Identity mapped region */
-#define RGN_UNCACHED    6	/* Identity mapped I/O region */
-#define RGN_GATE	5	/* Gate page, Kernel text, etc */
-#define RGN_HPAGE	4	/* For Huge TLB pages */
-
-/*
- * PAGE_SHIFT determines the actual kernel page size.
- */
-#if defined(CONFIG_IA64_PAGE_SIZE_4KB)
-# define PAGE_SHIFT	12
-#elif defined(CONFIG_IA64_PAGE_SIZE_8KB)
-# define PAGE_SHIFT	13
-#elif defined(CONFIG_IA64_PAGE_SIZE_16KB)
-# define PAGE_SHIFT	14
-#elif defined(CONFIG_IA64_PAGE_SIZE_64KB)
-# define PAGE_SHIFT	16
-#else
-# error Unsupported page size!
-#endif
-
-#define PAGE_SIZE		(__IA64_UL_CONST(1) << PAGE_SHIFT)
-#define PAGE_MASK		(~(PAGE_SIZE - 1))
-
-#define PERCPU_PAGE_SHIFT	18	/* log2() of max. size of per-CPU area */
-#define PERCPU_PAGE_SIZE	(__IA64_UL_CONST(1) << PERCPU_PAGE_SHIFT)
-
-
-#ifdef CONFIG_HUGETLB_PAGE
-# define HPAGE_REGION_BASE	RGN_BASE(RGN_HPAGE)
-# define HPAGE_SHIFT		hpage_shift
-# define HPAGE_SHIFT_DEFAULT	28	/* check ia64 SDM for architecture supported size */
-# define HPAGE_SIZE		(__IA64_UL_CONST(1) << HPAGE_SHIFT)
-# define HPAGE_MASK		(~(HPAGE_SIZE - 1))
-
-# define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-#endif /* CONFIG_HUGETLB_PAGE */
-
-#ifdef __ASSEMBLY__
-# define __pa(x)		((x) - PAGE_OFFSET)
-# define __va(x)		((x) + PAGE_OFFSET)
-#else /* !__ASSEMBLY */
-#  define STRICT_MM_TYPECHECKS
-
-extern void clear_page (void *page);
-extern void copy_page (void *to, void *from);
-
-/*
- * clear_user_page() and copy_user_page() can't be inline functions because
- * flush_dcache_page() can't be defined until later...
- */
-#define clear_user_page(addr, vaddr, page)	\
-do {						\
-	clear_page(addr);			\
-	flush_dcache_page(page);		\
-} while (0)
-
-#define copy_user_page(to, from, vaddr, page)	\
-do {						\
-	copy_page((to), (from));		\
-	flush_dcache_page(page);		\
-} while (0)
-
-
-#define vma_alloc_zeroed_movable_folio(vma, vaddr)			\
-({									\
-	struct folio *folio = vma_alloc_folio(				\
-		GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false); \
-	if (folio)							\
-		flush_dcache_folio(folio);				\
-	folio;								\
-})
-
-#define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
-
-#include <asm-generic/memory_model.h>
-
-#define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
-#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
-
-typedef union ia64_va {
-	struct {
-		unsigned long off : 61;		/* intra-region offset */
-		unsigned long reg :  3;		/* region number */
-	} f;
-	unsigned long l;
-	void *p;
-} ia64_va;
-
-/*
- * Note: These macros depend on the fact that PAGE_OFFSET has all
- * region bits set to 1 and all other bits set to zero.  They are
- * expressed in this way to ensure they result in a single "dep"
- * instruction.
- */
-#define __pa(x)		({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;})
-#define __va(x)		({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;})
-
-#define REGION_NUMBER(x)	({ia64_va _v; _v.l = (long) (x); _v.f.reg;})
-#define REGION_OFFSET(x)	({ia64_va _v; _v.l = (long) (x); _v.f.off;})
-
-#ifdef CONFIG_HUGETLB_PAGE
-# define htlbpage_to_page(x)	(((unsigned long) REGION_NUMBER(x) << 61)			\
-				 | (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
-# define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
-extern unsigned int hpage_shift;
-#endif
-
-static __inline__ int
-get_order (unsigned long size)
-{
-	long double d = size - 1;
-	long order;
-
-	order = ia64_getf_exp(d);
-	order = order - PAGE_SHIFT - 0xffff + 1;
-	if (order < 0)
-		order = 0;
-	return order;
-}
-
-#endif /* !__ASSEMBLY__ */
-
-#ifdef STRICT_MM_TYPECHECKS
-  /*
-   * These are used to make use of C type-checking..
-   */
-  typedef struct { unsigned long pte; } pte_t;
-  typedef struct { unsigned long pmd; } pmd_t;
-#if CONFIG_PGTABLE_LEVELS == 4
-  typedef struct { unsigned long pud; } pud_t;
-#endif
-  typedef struct { unsigned long pgd; } pgd_t;
-  typedef struct { unsigned long pgprot; } pgprot_t;
-  typedef struct page *pgtable_t;
-
-# define pte_val(x)	((x).pte)
-# define pmd_val(x)	((x).pmd)
-#if CONFIG_PGTABLE_LEVELS == 4
-# define pud_val(x)	((x).pud)
-#endif
-# define pgd_val(x)	((x).pgd)
-# define pgprot_val(x)	((x).pgprot)
-
-# define __pte(x)	((pte_t) { (x) } )
-# define __pmd(x)	((pmd_t) { (x) } )
-# define __pgprot(x)	((pgprot_t) { (x) } )
-
-#else /* !STRICT_MM_TYPECHECKS */
-  /*
-   * .. while these make it easier on the compiler
-   */
-# ifndef __ASSEMBLY__
-    typedef unsigned long pte_t;
-    typedef unsigned long pmd_t;
-    typedef unsigned long pgd_t;
-    typedef unsigned long pgprot_t;
-    typedef struct page *pgtable_t;
-# endif
-
-# define pte_val(x)	(x)
-# define pmd_val(x)	(x)
-# define pgd_val(x)	(x)
-# define pgprot_val(x)	(x)
-
-# define __pte(x)	(x)
-# define __pgd(x)	(x)
-# define __pgprot(x)	(x)
-#endif /* !STRICT_MM_TYPECHECKS */
-
-#define PAGE_OFFSET			RGN_BASE(RGN_KERNEL)
-
-#define VM_DATA_DEFAULT_FLAGS	VM_DATA_FLAGS_TSK_EXEC
-
-#define GATE_ADDR		RGN_BASE(RGN_GATE)
-
-/*
- * 0xa000000000000000+2*PERCPU_PAGE_SIZE
- * - 0xa000000000000000+3*PERCPU_PAGE_SIZE remain unmapped (guard page)
- */
-#define KERNEL_START		 (GATE_ADDR+__IA64_UL_CONST(0x100000000))
-#define PERCPU_ADDR		(-PERCPU_PAGE_SIZE)
-#define LOAD_OFFSET		(KERNEL_START - KERNEL_TR_PAGE_SIZE)
-
-#define __HAVE_ARCH_GATE_AREA	1
-
-#endif /* _ASM_IA64_PAGE_H */
diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h
deleted file mode 100644
index e6b652f9e45e..000000000000
--- a/arch/ia64/include/asm/pal.h
+++ /dev/null
@@ -1,1827 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PAL_H
-#define _ASM_IA64_PAL_H
-
-/*
- * Processor Abstraction Layer definitions.
- *
- * This is based on Intel IA-64 Architecture Software Developer's Manual rev 1.0
- * chapter 11 IA-64 Processor Abstraction Layer
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Srinivasa Prasad Thirumalachar <sprasad@sprasad.engr.sgi.com>
- * Copyright (C) 2008 Silicon Graphics, Inc. (SGI)
- *
- * 99/10/01	davidm	Make sure we pass zero for reserved parameters.
- * 00/03/07	davidm	Updated pal_cache_flush() to be in sync with PAL v2.6.
- * 00/03/23     cfleck  Modified processor min-state save area to match updated PAL & SAL info
- * 00/05/24     eranian Updated to latest PAL spec, fix structures bugs, added
- * 00/05/25	eranian Support for stack calls, and static physical calls
- * 00/06/18	eranian Support for stacked physical calls
- * 06/10/26	rja	Support for Intel Itanium Architecture Software Developer's
- *			Manual Rev 2.2 (Jan 2006)
- */
-
-/*
- * Note that some of these calls use a static-register only calling
- * convention which has nothing to do with the regular calling
- * convention.
- */
-#define PAL_CACHE_FLUSH		1	/* flush i/d cache */
-#define PAL_CACHE_INFO		2	/* get detailed i/d cache info */
-#define PAL_CACHE_INIT		3	/* initialize i/d cache */
-#define PAL_CACHE_SUMMARY	4	/* get summary of cache hierarchy */
-#define PAL_MEM_ATTRIB		5	/* list supported memory attributes */
-#define PAL_PTCE_INFO		6	/* purge TLB info */
-#define PAL_VM_INFO		7	/* return supported virtual memory features */
-#define PAL_VM_SUMMARY		8	/* return summary on supported vm features */
-#define PAL_BUS_GET_FEATURES	9	/* return processor bus interface features settings */
-#define PAL_BUS_SET_FEATURES	10	/* set processor bus features */
-#define PAL_DEBUG_INFO		11	/* get number of debug registers */
-#define PAL_FIXED_ADDR		12	/* get fixed component of processors's directed address */
-#define PAL_FREQ_BASE		13	/* base frequency of the platform */
-#define PAL_FREQ_RATIOS		14	/* ratio of processor, bus and ITC frequency */
-#define PAL_PERF_MON_INFO	15	/* return performance monitor info */
-#define PAL_PLATFORM_ADDR	16	/* set processor interrupt block and IO port space addr */
-#define PAL_PROC_GET_FEATURES	17	/* get configurable processor features & settings */
-#define PAL_PROC_SET_FEATURES	18	/* enable/disable configurable processor features */
-#define PAL_RSE_INFO		19	/* return rse information */
-#define PAL_VERSION		20	/* return version of PAL code */
-#define PAL_MC_CLEAR_LOG	21	/* clear all processor log info */
-#define PAL_MC_DRAIN		22	/* drain operations which could result in an MCA */
-#define PAL_MC_EXPECTED		23	/* set/reset expected MCA indicator */
-#define PAL_MC_DYNAMIC_STATE	24	/* get processor dynamic state */
-#define PAL_MC_ERROR_INFO	25	/* get processor MCA info and static state */
-#define PAL_MC_RESUME		26	/* Return to interrupted process */
-#define PAL_MC_REGISTER_MEM	27	/* Register memory for PAL to use during MCAs and inits */
-#define PAL_HALT		28	/* enter the low power HALT state */
-#define PAL_HALT_LIGHT		29	/* enter the low power light halt state*/
-#define PAL_COPY_INFO		30	/* returns info needed to relocate PAL */
-#define PAL_CACHE_LINE_INIT	31	/* init tags & data of cache line */
-#define PAL_PMI_ENTRYPOINT	32	/* register PMI memory entry points with the processor */
-#define PAL_ENTER_IA_32_ENV	33	/* enter IA-32 system environment */
-#define PAL_VM_PAGE_SIZE	34	/* return vm TC and page walker page sizes */
-
-#define PAL_MEM_FOR_TEST	37	/* get amount of memory needed for late processor test */
-#define PAL_CACHE_PROT_INFO	38	/* get i/d cache protection info */
-#define PAL_REGISTER_INFO	39	/* return AR and CR register information*/
-#define PAL_SHUTDOWN		40	/* enter processor shutdown state */
-#define PAL_PREFETCH_VISIBILITY	41	/* Make Processor Prefetches Visible */
-#define PAL_LOGICAL_TO_PHYSICAL 42	/* returns information on logical to physical processor mapping */
-#define PAL_CACHE_SHARED_INFO	43	/* returns information on caches shared by logical processor */
-#define PAL_GET_HW_POLICY	48	/* Get current hardware resource sharing policy */
-#define PAL_SET_HW_POLICY	49	/* Set current hardware resource sharing policy */
-#define PAL_VP_INFO		50	/* Information about virtual processor features */
-#define PAL_MC_HW_TRACKING	51	/* Hardware tracking status */
-
-#define PAL_COPY_PAL		256	/* relocate PAL procedures and PAL PMI */
-#define PAL_HALT_INFO		257	/* return the low power capabilities of processor */
-#define PAL_TEST_PROC		258	/* perform late processor self-test */
-#define PAL_CACHE_READ		259	/* read tag & data of cacheline for diagnostic testing */
-#define PAL_CACHE_WRITE		260	/* write tag & data of cacheline for diagnostic testing */
-#define PAL_VM_TR_READ		261	/* read contents of translation register */
-#define PAL_GET_PSTATE		262	/* get the current P-state */
-#define PAL_SET_PSTATE		263	/* set the P-state */
-#define PAL_BRAND_INFO		274	/* Processor branding information */
-
-#define PAL_GET_PSTATE_TYPE_LASTSET	0
-#define PAL_GET_PSTATE_TYPE_AVGANDRESET	1
-#define PAL_GET_PSTATE_TYPE_AVGNORESET	2
-#define PAL_GET_PSTATE_TYPE_INSTANT	3
-
-#define PAL_MC_ERROR_INJECT	276	/* Injects processor error or returns injection capabilities */
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-#include <asm/fpu.h>
-#include <asm/intrinsics.h>
-
-/*
- * Data types needed to pass information into PAL procedures and
- * interpret information returned by them.
- */
-
-/* Return status from the PAL procedure */
-typedef s64				pal_status_t;
-
-#define PAL_STATUS_SUCCESS		0	/* No error */
-#define PAL_STATUS_UNIMPLEMENTED	(-1)	/* Unimplemented procedure */
-#define PAL_STATUS_EINVAL		(-2)	/* Invalid argument */
-#define PAL_STATUS_ERROR		(-3)	/* Error */
-#define PAL_STATUS_CACHE_INIT_FAIL	(-4)	/* Could not initialize the
-						 * specified level and type of
-						 * cache without sideeffects
-						 * and "restrict" was 1
-						 */
-#define PAL_STATUS_REQUIRES_MEMORY	(-9)	/* Call requires PAL memory buffer */
-
-/* Processor cache level in the hierarchy */
-typedef u64				pal_cache_level_t;
-#define PAL_CACHE_LEVEL_L0		0	/* L0 */
-#define PAL_CACHE_LEVEL_L1		1	/* L1 */
-#define PAL_CACHE_LEVEL_L2		2	/* L2 */
-
-
-/* Processor cache type at a particular level in the hierarchy */
-
-typedef u64				pal_cache_type_t;
-#define PAL_CACHE_TYPE_INSTRUCTION	1	/* Instruction cache */
-#define PAL_CACHE_TYPE_DATA		2	/* Data or unified cache */
-#define PAL_CACHE_TYPE_INSTRUCTION_DATA	3	/* Both Data & Instruction */
-
-
-#define PAL_CACHE_FLUSH_INVALIDATE	1	/* Invalidate clean lines */
-#define PAL_CACHE_FLUSH_CHK_INTRS	2	/* check for interrupts/mc while flushing */
-
-/* Processor cache line size in bytes  */
-typedef int				pal_cache_line_size_t;
-
-/* Processor cache line state */
-typedef u64				pal_cache_line_state_t;
-#define PAL_CACHE_LINE_STATE_INVALID	0	/* Invalid */
-#define PAL_CACHE_LINE_STATE_SHARED	1	/* Shared */
-#define PAL_CACHE_LINE_STATE_EXCLUSIVE	2	/* Exclusive */
-#define PAL_CACHE_LINE_STATE_MODIFIED	3	/* Modified */
-
-typedef struct pal_freq_ratio {
-	u32 den, num;		/* numerator & denominator */
-} itc_ratio, proc_ratio;
-
-typedef	union  pal_cache_config_info_1_s {
-	struct {
-		u64		u		: 1,	/* 0 Unified cache ? */
-				at		: 2,	/* 2-1 Cache mem attr*/
-				reserved	: 5,	/* 7-3 Reserved */
-				associativity	: 8,	/* 16-8 Associativity*/
-				line_size	: 8,	/* 23-17 Line size */
-				stride		: 8,	/* 31-24 Stride */
-				store_latency	: 8,	/*39-32 Store latency*/
-				load_latency	: 8,	/* 47-40 Load latency*/
-				store_hints	: 8,	/* 55-48 Store hints*/
-				load_hints	: 8;	/* 63-56 Load hints */
-	} pcci1_bits;
-	u64			pcci1_data;
-} pal_cache_config_info_1_t;
-
-typedef	union  pal_cache_config_info_2_s {
-	struct {
-		u32		cache_size;		/*cache size in bytes*/
-
-
-		u32		alias_boundary	: 8,	/* 39-32 aliased addr
-							 * separation for max
-							 * performance.
-							 */
-				tag_ls_bit	: 8,	/* 47-40 LSb of addr*/
-				tag_ms_bit	: 8,	/* 55-48 MSb of addr*/
-				reserved	: 8;	/* 63-56 Reserved */
-	} pcci2_bits;
-	u64			pcci2_data;
-} pal_cache_config_info_2_t;
-
-
-typedef struct pal_cache_config_info_s {
-	pal_status_t			pcci_status;
-	pal_cache_config_info_1_t	pcci_info_1;
-	pal_cache_config_info_2_t	pcci_info_2;
-	u64				pcci_reserved;
-} pal_cache_config_info_t;
-
-#define pcci_ld_hints		pcci_info_1.pcci1_bits.load_hints
-#define pcci_st_hints		pcci_info_1.pcci1_bits.store_hints
-#define pcci_ld_latency		pcci_info_1.pcci1_bits.load_latency
-#define pcci_st_latency		pcci_info_1.pcci1_bits.store_latency
-#define pcci_stride		pcci_info_1.pcci1_bits.stride
-#define pcci_line_size		pcci_info_1.pcci1_bits.line_size
-#define pcci_assoc		pcci_info_1.pcci1_bits.associativity
-#define pcci_cache_attr		pcci_info_1.pcci1_bits.at
-#define pcci_unified		pcci_info_1.pcci1_bits.u
-#define pcci_tag_msb		pcci_info_2.pcci2_bits.tag_ms_bit
-#define pcci_tag_lsb		pcci_info_2.pcci2_bits.tag_ls_bit
-#define pcci_alias_boundary	pcci_info_2.pcci2_bits.alias_boundary
-#define pcci_cache_size		pcci_info_2.pcci2_bits.cache_size
-
-
-
-/* Possible values for cache attributes */
-
-#define PAL_CACHE_ATTR_WT		0	/* Write through cache */
-#define PAL_CACHE_ATTR_WB		1	/* Write back cache */
-#define PAL_CACHE_ATTR_WT_OR_WB		2	/* Either write thru or write
-						 * back depending on TLB
-						 * memory attributes
-						 */
-
-
-/* Possible values for cache hints */
-
-#define PAL_CACHE_HINT_TEMP_1		0	/* Temporal level 1 */
-#define PAL_CACHE_HINT_NTEMP_1		1	/* Non-temporal level 1 */
-#define PAL_CACHE_HINT_NTEMP_ALL	3	/* Non-temporal all levels */
-
-/* Processor cache protection  information */
-typedef union pal_cache_protection_element_u {
-	u32			pcpi_data;
-	struct {
-		u32		data_bits	: 8, /* # data bits covered by
-						      * each unit of protection
-						      */
-
-				tagprot_lsb	: 6, /* Least -do- */
-				tagprot_msb	: 6, /* Most Sig. tag address
-						      * bit that this
-						      * protection covers.
-						      */
-				prot_bits	: 6, /* # of protection bits */
-				method		: 4, /* Protection method */
-				t_d		: 2; /* Indicates which part
-						      * of the cache this
-						      * protection encoding
-						      * applies.
-						      */
-	} pcp_info;
-} pal_cache_protection_element_t;
-
-#define pcpi_cache_prot_part	pcp_info.t_d
-#define pcpi_prot_method	pcp_info.method
-#define pcpi_prot_bits		pcp_info.prot_bits
-#define pcpi_tagprot_msb	pcp_info.tagprot_msb
-#define pcpi_tagprot_lsb	pcp_info.tagprot_lsb
-#define pcpi_data_bits		pcp_info.data_bits
-
-/* Processor cache part encodings */
-#define PAL_CACHE_PROT_PART_DATA	0	/* Data protection  */
-#define PAL_CACHE_PROT_PART_TAG		1	/* Tag  protection */
-#define PAL_CACHE_PROT_PART_TAG_DATA	2	/* Tag+data protection (tag is
-						 * more significant )
-						 */
-#define PAL_CACHE_PROT_PART_DATA_TAG	3	/* Data+tag protection (data is
-						 * more significant )
-						 */
-#define PAL_CACHE_PROT_PART_MAX		6
-
-
-typedef struct pal_cache_protection_info_s {
-	pal_status_t			pcpi_status;
-	pal_cache_protection_element_t	pcp_info[PAL_CACHE_PROT_PART_MAX];
-} pal_cache_protection_info_t;
-
-
-/* Processor cache protection method encodings */
-#define PAL_CACHE_PROT_METHOD_NONE		0	/* No protection */
-#define PAL_CACHE_PROT_METHOD_ODD_PARITY	1	/* Odd parity */
-#define PAL_CACHE_PROT_METHOD_EVEN_PARITY	2	/* Even parity */
-#define PAL_CACHE_PROT_METHOD_ECC		3	/* ECC protection */
-
-
-/* Processor cache line identification in the hierarchy */
-typedef union pal_cache_line_id_u {
-	u64			pclid_data;
-	struct {
-		u64		cache_type	: 8,	/* 7-0 cache type */
-				level		: 8,	/* 15-8 level of the
-							 * cache in the
-							 * hierarchy.
-							 */
-				way		: 8,	/* 23-16 way in the set
-							 */
-				part		: 8,	/* 31-24 part of the
-							 * cache
-							 */
-				reserved	: 32;	/* 63-32 is reserved*/
-	} pclid_info_read;
-	struct {
-		u64		cache_type	: 8,	/* 7-0 cache type */
-				level		: 8,	/* 15-8 level of the
-							 * cache in the
-							 * hierarchy.
-							 */
-				way		: 8,	/* 23-16 way in the set
-							 */
-				part		: 8,	/* 31-24 part of the
-							 * cache
-							 */
-				mesi		: 8,	/* 39-32 cache line
-							 * state
-							 */
-				start		: 8,	/* 47-40 lsb of data to
-							 * invert
-							 */
-				length		: 8,	/* 55-48 #bits to
-							 * invert
-							 */
-				trigger		: 8;	/* 63-56 Trigger error
-							 * by doing a load
-							 * after the write
-							 */
-
-	} pclid_info_write;
-} pal_cache_line_id_u_t;
-
-#define pclid_read_part		pclid_info_read.part
-#define pclid_read_way		pclid_info_read.way
-#define pclid_read_level	pclid_info_read.level
-#define pclid_read_cache_type	pclid_info_read.cache_type
-
-#define pclid_write_trigger	pclid_info_write.trigger
-#define pclid_write_length	pclid_info_write.length
-#define pclid_write_start	pclid_info_write.start
-#define pclid_write_mesi	pclid_info_write.mesi
-#define pclid_write_part	pclid_info_write.part
-#define pclid_write_way		pclid_info_write.way
-#define pclid_write_level	pclid_info_write.level
-#define pclid_write_cache_type	pclid_info_write.cache_type
-
-/* Processor cache line part encodings */
-#define PAL_CACHE_LINE_ID_PART_DATA		0	/* Data */
-#define PAL_CACHE_LINE_ID_PART_TAG		1	/* Tag */
-#define PAL_CACHE_LINE_ID_PART_DATA_PROT	2	/* Data protection */
-#define PAL_CACHE_LINE_ID_PART_TAG_PROT		3	/* Tag protection */
-#define PAL_CACHE_LINE_ID_PART_DATA_TAG_PROT	4	/* Data+tag
-							 * protection
-							 */
-typedef struct pal_cache_line_info_s {
-	pal_status_t		pcli_status;		/* Return status of the read cache line
-							 * info call.
-							 */
-	u64			pcli_data;		/* 64-bit data, tag, protection bits .. */
-	u64			pcli_data_len;		/* data length in bits */
-	pal_cache_line_state_t	pcli_cache_line_state;	/* mesi state */
-
-} pal_cache_line_info_t;
-
-
-/* Machine Check related crap */
-
-/* Pending event status bits  */
-typedef u64					pal_mc_pending_events_t;
-
-#define PAL_MC_PENDING_MCA			(1 << 0)
-#define PAL_MC_PENDING_INIT			(1 << 1)
-
-/* Error information type */
-typedef u64					pal_mc_info_index_t;
-
-#define PAL_MC_INFO_PROCESSOR			0	/* Processor */
-#define PAL_MC_INFO_CACHE_CHECK			1	/* Cache check */
-#define PAL_MC_INFO_TLB_CHECK			2	/* Tlb check */
-#define PAL_MC_INFO_BUS_CHECK			3	/* Bus check */
-#define PAL_MC_INFO_REQ_ADDR			4	/* Requestor address */
-#define PAL_MC_INFO_RESP_ADDR			5	/* Responder address */
-#define PAL_MC_INFO_TARGET_ADDR			6	/* Target address */
-#define PAL_MC_INFO_IMPL_DEP			7	/* Implementation
-							 * dependent
-							 */
-
-#define PAL_TLB_CHECK_OP_PURGE			8
-
-typedef struct pal_process_state_info_s {
-	u64		reserved1	: 2,
-			rz		: 1,	/* PAL_CHECK processor
-						 * rendezvous
-						 * successful.
-						 */
-
-			ra		: 1,	/* PAL_CHECK attempted
-						 * a rendezvous.
-						 */
-			me		: 1,	/* Distinct multiple
-						 * errors occurred
-						 */
-
-			mn		: 1,	/* Min. state save
-						 * area has been
-						 * registered with PAL
-						 */
-
-			sy		: 1,	/* Storage integrity
-						 * synched
-						 */
-
-
-			co		: 1,	/* Continuable */
-			ci		: 1,	/* MC isolated */
-			us		: 1,	/* Uncontained storage
-						 * damage.
-						 */
-
-
-			hd		: 1,	/* Non-essential hw
-						 * lost (no loss of
-						 * functionality)
-						 * causing the
-						 * processor to run in
-						 * degraded mode.
-						 */
-
-			tl		: 1,	/* 1 => MC occurred
-						 * after an instr was
-						 * executed but before
-						 * the trap that
-						 * resulted from instr
-						 * execution was
-						 * generated.
-						 * (Trap Lost )
-						 */
-			mi		: 1,	/* More information available
-						 * call PAL_MC_ERROR_INFO
-						 */
-			pi		: 1,	/* Precise instruction pointer */
-			pm		: 1,	/* Precise min-state save area */
-
-			dy		: 1,	/* Processor dynamic
-						 * state valid
-						 */
-
-
-			in		: 1,	/* 0 = MC, 1 = INIT */
-			rs		: 1,	/* RSE valid */
-			cm		: 1,	/* MC corrected */
-			ex		: 1,	/* MC is expected */
-			cr		: 1,	/* Control regs valid*/
-			pc		: 1,	/* Perf cntrs valid */
-			dr		: 1,	/* Debug regs valid */
-			tr		: 1,	/* Translation regs
-						 * valid
-						 */
-			rr		: 1,	/* Region regs valid */
-			ar		: 1,	/* App regs valid */
-			br		: 1,	/* Branch regs valid */
-			pr		: 1,	/* Predicate registers
-						 * valid
-						 */
-
-			fp		: 1,	/* fp registers valid*/
-			b1		: 1,	/* Preserved bank one
-						 * general registers
-						 * are valid
-						 */
-			b0		: 1,	/* Preserved bank zero
-						 * general registers
-						 * are valid
-						 */
-			gr		: 1,	/* General registers
-						 * are valid
-						 * (excl. banked regs)
-						 */
-			dsize		: 16,	/* size of dynamic
-						 * state returned
-						 * by the processor
-						 */
-
-			se		: 1,	/* Shared error.  MCA in a
-						   shared structure */
-			reserved2	: 10,
-			cc		: 1,	/* Cache check */
-			tc		: 1,	/* TLB check */
-			bc		: 1,	/* Bus check */
-			rc		: 1,	/* Register file check */
-			uc		: 1;	/* Uarch check */
-
-} pal_processor_state_info_t;
-
-typedef struct pal_cache_check_info_s {
-	u64		op		: 4,	/* Type of cache
-						 * operation that
-						 * caused the machine
-						 * check.
-						 */
-			level		: 2,	/* Cache level */
-			reserved1	: 2,
-			dl		: 1,	/* Failure in data part
-						 * of cache line
-						 */
-			tl		: 1,	/* Failure in tag part
-						 * of cache line
-						 */
-			dc		: 1,	/* Failure in dcache */
-			ic		: 1,	/* Failure in icache */
-			mesi		: 3,	/* Cache line state */
-			mv		: 1,	/* mesi valid */
-			way		: 5,	/* Way in which the
-						 * error occurred
-						 */
-			wiv		: 1,	/* Way field valid */
-			reserved2	: 1,
-			dp		: 1,	/* Data poisoned on MBE */
-			reserved3	: 6,
-			hlth		: 2,	/* Health indicator */
-
-			index		: 20,	/* Cache line index */
-			reserved4	: 2,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_cache_check_info_t;
-
-typedef struct pal_tlb_check_info_s {
-
-	u64		tr_slot		: 8,	/* Slot# of TR where
-						 * error occurred
-						 */
-			trv		: 1,	/* tr_slot field is valid */
-			reserved1	: 1,
-			level		: 2,	/* TLB level where failure occurred */
-			reserved2	: 4,
-			dtr		: 1,	/* Fail in data TR */
-			itr		: 1,	/* Fail in inst TR */
-			dtc		: 1,	/* Fail in data TC */
-			itc		: 1,	/* Fail in inst. TC */
-			op		: 4,	/* Cache operation */
-			reserved3	: 6,
-			hlth		: 2,	/* Health indicator */
-			reserved4	: 22,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_tlb_check_info_t;
-
-typedef struct pal_bus_check_info_s {
-	u64		size		: 5,	/* Xaction size */
-			ib		: 1,	/* Internal bus error */
-			eb		: 1,	/* External bus error */
-			cc		: 1,	/* Error occurred
-						 * during cache-cache
-						 * transfer.
-						 */
-			type		: 8,	/* Bus xaction type*/
-			sev		: 5,	/* Bus error severity*/
-			hier		: 2,	/* Bus hierarchy level */
-			dp		: 1,	/* Data poisoned on MBE */
-			bsi		: 8,	/* Bus error status
-						 * info
-						 */
-			reserved2	: 22,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_bus_check_info_t;
-
-typedef struct pal_reg_file_check_info_s {
-	u64		id		: 4,	/* Register file identifier */
-			op		: 4,	/* Type of register
-						 * operation that
-						 * caused the machine
-						 * check.
-						 */
-			reg_num		: 7,	/* Register number */
-			rnv		: 1,	/* reg_num valid */
-			reserved2	: 38,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			reserved3	: 3,
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_reg_file_check_info_t;
-
-typedef struct pal_uarch_check_info_s {
-	u64		sid		: 5,	/* Structure identification */
-			level		: 3,	/* Level of failure */
-			array_id	: 4,	/* Array identification */
-			op		: 4,	/* Type of
-						 * operation that
-						 * caused the machine
-						 * check.
-						 */
-			way		: 6,	/* Way of structure */
-			wv		: 1,	/* way valid */
-			xv		: 1,	/* index valid */
-			reserved1	: 6,
-			hlth		: 2,	/* Health indicator */
-			index		: 8,	/* Index or set of the uarch
-						 * structure that failed.
-						 */
-			reserved2	: 24,
-
-			is		: 1,	/* instruction set (1 == ia32) */
-			iv		: 1,	/* instruction set field valid */
-			pl		: 2,	/* privilege level */
-			pv		: 1,	/* privilege level field valid */
-			mcc		: 1,	/* Machine check corrected */
-			tv		: 1,	/* Target address
-						 * structure is valid
-						 */
-			rq		: 1,	/* Requester identifier
-						 * structure is valid
-						 */
-			rp		: 1,	/* Responder identifier
-						 * structure is valid
-						 */
-			pi		: 1;	/* Precise instruction pointer
-						 * structure is valid
-						 */
-} pal_uarch_check_info_t;
-
-typedef union pal_mc_error_info_u {
-	u64				pmei_data;
-	pal_processor_state_info_t	pme_processor;
-	pal_cache_check_info_t		pme_cache;
-	pal_tlb_check_info_t		pme_tlb;
-	pal_bus_check_info_t		pme_bus;
-	pal_reg_file_check_info_t	pme_reg_file;
-	pal_uarch_check_info_t		pme_uarch;
-} pal_mc_error_info_t;
-
-#define pmci_proc_unknown_check			pme_processor.uc
-#define pmci_proc_bus_check			pme_processor.bc
-#define pmci_proc_tlb_check			pme_processor.tc
-#define pmci_proc_cache_check			pme_processor.cc
-#define pmci_proc_dynamic_state_size		pme_processor.dsize
-#define pmci_proc_gpr_valid			pme_processor.gr
-#define pmci_proc_preserved_bank0_gpr_valid	pme_processor.b0
-#define pmci_proc_preserved_bank1_gpr_valid	pme_processor.b1
-#define pmci_proc_fp_valid			pme_processor.fp
-#define pmci_proc_predicate_regs_valid		pme_processor.pr
-#define pmci_proc_branch_regs_valid		pme_processor.br
-#define pmci_proc_app_regs_valid		pme_processor.ar
-#define pmci_proc_region_regs_valid		pme_processor.rr
-#define pmci_proc_translation_regs_valid	pme_processor.tr
-#define pmci_proc_debug_regs_valid		pme_processor.dr
-#define pmci_proc_perf_counters_valid		pme_processor.pc
-#define pmci_proc_control_regs_valid		pme_processor.cr
-#define pmci_proc_machine_check_expected	pme_processor.ex
-#define pmci_proc_machine_check_corrected	pme_processor.cm
-#define pmci_proc_rse_valid			pme_processor.rs
-#define pmci_proc_machine_check_or_init		pme_processor.in
-#define pmci_proc_dynamic_state_valid		pme_processor.dy
-#define pmci_proc_operation			pme_processor.op
-#define pmci_proc_trap_lost			pme_processor.tl
-#define pmci_proc_hardware_damage		pme_processor.hd
-#define pmci_proc_uncontained_storage_damage	pme_processor.us
-#define pmci_proc_machine_check_isolated	pme_processor.ci
-#define pmci_proc_continuable			pme_processor.co
-#define pmci_proc_storage_intergrity_synced	pme_processor.sy
-#define pmci_proc_min_state_save_area_regd	pme_processor.mn
-#define	pmci_proc_distinct_multiple_errors	pme_processor.me
-#define pmci_proc_pal_attempted_rendezvous	pme_processor.ra
-#define pmci_proc_pal_rendezvous_complete	pme_processor.rz
-
-
-#define pmci_cache_level			pme_cache.level
-#define pmci_cache_line_state			pme_cache.mesi
-#define pmci_cache_line_state_valid		pme_cache.mv
-#define pmci_cache_line_index			pme_cache.index
-#define pmci_cache_instr_cache_fail		pme_cache.ic
-#define pmci_cache_data_cache_fail		pme_cache.dc
-#define pmci_cache_line_tag_fail		pme_cache.tl
-#define pmci_cache_line_data_fail		pme_cache.dl
-#define pmci_cache_operation			pme_cache.op
-#define pmci_cache_way_valid			pme_cache.wv
-#define pmci_cache_target_address_valid		pme_cache.tv
-#define pmci_cache_way				pme_cache.way
-#define pmci_cache_mc				pme_cache.mc
-
-#define pmci_tlb_instr_translation_cache_fail	pme_tlb.itc
-#define pmci_tlb_data_translation_cache_fail	pme_tlb.dtc
-#define pmci_tlb_instr_translation_reg_fail	pme_tlb.itr
-#define pmci_tlb_data_translation_reg_fail	pme_tlb.dtr
-#define pmci_tlb_translation_reg_slot		pme_tlb.tr_slot
-#define pmci_tlb_mc				pme_tlb.mc
-
-#define pmci_bus_status_info			pme_bus.bsi
-#define pmci_bus_req_address_valid		pme_bus.rq
-#define pmci_bus_resp_address_valid		pme_bus.rp
-#define pmci_bus_target_address_valid		pme_bus.tv
-#define pmci_bus_error_severity			pme_bus.sev
-#define pmci_bus_transaction_type		pme_bus.type
-#define pmci_bus_cache_cache_transfer		pme_bus.cc
-#define pmci_bus_transaction_size		pme_bus.size
-#define pmci_bus_internal_error			pme_bus.ib
-#define pmci_bus_external_error			pme_bus.eb
-#define pmci_bus_mc				pme_bus.mc
-
-/*
- * NOTE: this min_state_save area struct only includes the 1KB
- * architectural state save area.  The other 3 KB is scratch space
- * for PAL.
- */
-
-struct pal_min_state_area {
-	u64	pmsa_nat_bits;		/* nat bits for saved GRs  */
-	u64	pmsa_gr[15];		/* GR1	- GR15		   */
-	u64	pmsa_bank0_gr[16];	/* GR16 - GR31		   */
-	u64	pmsa_bank1_gr[16];	/* GR16 - GR31		   */
-	u64	pmsa_pr;		/* predicate registers	   */
-	u64	pmsa_br0;		/* branch register 0	   */
-	u64	pmsa_rsc;		/* ar.rsc		   */
-	u64	pmsa_iip;		/* cr.iip		   */
-	u64	pmsa_ipsr;		/* cr.ipsr		   */
-	u64	pmsa_ifs;		/* cr.ifs		   */
-	u64	pmsa_xip;		/* previous iip		   */
-	u64	pmsa_xpsr;		/* previous psr		   */
-	u64	pmsa_xfs;		/* previous ifs		   */
-	u64	pmsa_br1;		/* branch register 1	   */
-	u64	pmsa_reserved[70];	/* pal_min_state_area should total to 1KB */
-};
-
-
-struct ia64_pal_retval {
-	/*
-	 * A zero status value indicates call completed without error.
-	 * A negative status value indicates reason of call failure.
-	 * A positive status value indicates success but an
-	 * informational value should be printed (e.g., "reboot for
-	 * change to take effect").
-	 */
-	s64 status;
-	u64 v0;
-	u64 v1;
-	u64 v2;
-};
-
-/*
- * Note: Currently unused PAL arguments are generally labeled
- * "reserved" so the value specified in the PAL documentation
- * (generally 0) MUST be passed.  Reserved parameters are not optional
- * parameters.
- */
-extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64);
-extern struct ia64_pal_retval ia64_pal_call_stacked (u64, u64, u64, u64);
-extern struct ia64_pal_retval ia64_pal_call_phys_static (u64, u64, u64, u64);
-extern struct ia64_pal_retval ia64_pal_call_phys_stacked (u64, u64, u64, u64);
-extern void ia64_save_scratch_fpregs (struct ia64_fpreg *);
-extern void ia64_load_scratch_fpregs (struct ia64_fpreg *);
-
-#define PAL_CALL(iprv,a0,a1,a2,a3) do {			\
-	struct ia64_fpreg fr[6];			\
-	ia64_save_scratch_fpregs(fr);			\
-	iprv = ia64_pal_call_static(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);			\
-} while (0)
-
-#define PAL_CALL_STK(iprv,a0,a1,a2,a3) do {		\
-	struct ia64_fpreg fr[6];			\
-	ia64_save_scratch_fpregs(fr);			\
-	iprv = ia64_pal_call_stacked(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);			\
-} while (0)
-
-#define PAL_CALL_PHYS(iprv,a0,a1,a2,a3) do {			\
-	struct ia64_fpreg fr[6];				\
-	ia64_save_scratch_fpregs(fr);				\
-	iprv = ia64_pal_call_phys_static(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);				\
-} while (0)
-
-#define PAL_CALL_PHYS_STK(iprv,a0,a1,a2,a3) do {		\
-	struct ia64_fpreg fr[6];				\
-	ia64_save_scratch_fpregs(fr);				\
-	iprv = ia64_pal_call_phys_stacked(a0, a1, a2, a3);	\
-	ia64_load_scratch_fpregs(fr);				\
-} while (0)
-
-typedef int (*ia64_pal_handler) (u64, ...);
-extern ia64_pal_handler ia64_pal;
-extern void ia64_pal_handler_init (void *);
-
-extern ia64_pal_handler ia64_pal;
-
-extern pal_cache_config_info_t		l0d_cache_config_info;
-extern pal_cache_config_info_t		l0i_cache_config_info;
-extern pal_cache_config_info_t		l1_cache_config_info;
-extern pal_cache_config_info_t		l2_cache_config_info;
-
-extern pal_cache_protection_info_t	l0d_cache_protection_info;
-extern pal_cache_protection_info_t	l0i_cache_protection_info;
-extern pal_cache_protection_info_t	l1_cache_protection_info;
-extern pal_cache_protection_info_t	l2_cache_protection_info;
-
-extern pal_cache_config_info_t		pal_cache_config_info_get(pal_cache_level_t,
-								  pal_cache_type_t);
-
-extern pal_cache_protection_info_t	pal_cache_protection_info_get(pal_cache_level_t,
-								      pal_cache_type_t);
-
-
-extern void				pal_error(int);
-
-
-/* Useful wrappers for the current list of pal procedures */
-
-typedef union pal_bus_features_u {
-	u64	pal_bus_features_val;
-	struct {
-		u64	pbf_reserved1				:	29;
-		u64	pbf_req_bus_parking			:	1;
-		u64	pbf_bus_lock_mask			:	1;
-		u64	pbf_enable_half_xfer_rate		:	1;
-		u64	pbf_reserved2				:	20;
-		u64	pbf_enable_shared_line_replace		:	1;
-		u64	pbf_enable_exclusive_line_replace	:	1;
-		u64	pbf_disable_xaction_queueing		:	1;
-		u64	pbf_disable_resp_err_check		:	1;
-		u64	pbf_disable_berr_check			:	1;
-		u64	pbf_disable_bus_req_internal_err_signal	:	1;
-		u64	pbf_disable_bus_req_berr_signal		:	1;
-		u64	pbf_disable_bus_init_event_check	:	1;
-		u64	pbf_disable_bus_init_event_signal	:	1;
-		u64	pbf_disable_bus_addr_err_check		:	1;
-		u64	pbf_disable_bus_addr_err_signal		:	1;
-		u64	pbf_disable_bus_data_err_check		:	1;
-	} pal_bus_features_s;
-} pal_bus_features_u_t;
-
-extern void pal_bus_features_print (u64);
-
-/* Provide information about configurable processor bus features */
-static inline s64
-ia64_pal_bus_get_features (pal_bus_features_u_t *features_avail,
-			   pal_bus_features_u_t *features_status,
-			   pal_bus_features_u_t *features_control)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_BUS_GET_FEATURES, 0, 0, 0);
-	if (features_avail)
-		features_avail->pal_bus_features_val = iprv.v0;
-	if (features_status)
-		features_status->pal_bus_features_val = iprv.v1;
-	if (features_control)
-		features_control->pal_bus_features_val = iprv.v2;
-	return iprv.status;
-}
-
-/* Enables/disables specific processor bus features */
-static inline s64
-ia64_pal_bus_set_features (pal_bus_features_u_t feature_select)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_BUS_SET_FEATURES, feature_select.pal_bus_features_val, 0, 0);
-	return iprv.status;
-}
-
-/* Get detailed cache information */
-static inline s64
-ia64_pal_cache_config_info (u64 cache_level, u64 cache_type, pal_cache_config_info_t *conf)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_CACHE_INFO, cache_level, cache_type, 0);
-
-	if (iprv.status == 0) {
-		conf->pcci_status                 = iprv.status;
-		conf->pcci_info_1.pcci1_data      = iprv.v0;
-		conf->pcci_info_2.pcci2_data      = iprv.v1;
-		conf->pcci_reserved               = iprv.v2;
-	}
-	return iprv.status;
-
-}
-
-/* Get detailed cche protection information */
-static inline s64
-ia64_pal_cache_prot_info (u64 cache_level, u64 cache_type, pal_cache_protection_info_t *prot)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_CACHE_PROT_INFO, cache_level, cache_type, 0);
-
-	if (iprv.status == 0) {
-		prot->pcpi_status           = iprv.status;
-		prot->pcp_info[0].pcpi_data = iprv.v0 & 0xffffffff;
-		prot->pcp_info[1].pcpi_data = iprv.v0 >> 32;
-		prot->pcp_info[2].pcpi_data = iprv.v1 & 0xffffffff;
-		prot->pcp_info[3].pcpi_data = iprv.v1 >> 32;
-		prot->pcp_info[4].pcpi_data = iprv.v2 & 0xffffffff;
-		prot->pcp_info[5].pcpi_data = iprv.v2 >> 32;
-	}
-	return iprv.status;
-}
-
-/*
- * Flush the processor instruction or data caches.  *PROGRESS must be
- * initialized to zero before calling this for the first time..
- */
-static inline s64
-ia64_pal_cache_flush (u64 cache_type, u64 invalidate, u64 *progress, u64 *vector)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress);
-	if (vector)
-		*vector = iprv.v0;
-	*progress = iprv.v1;
-	return iprv.status;
-}
-
-
-/* Initialize the processor controlled caches */
-static inline s64
-ia64_pal_cache_init (u64 level, u64 cache_type, u64 rest)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_INIT, level, cache_type, rest);
-	return iprv.status;
-}
-
-/* Initialize the tags and data of a data or unified cache line of
- * processor controlled cache to known values without the availability
- * of backing memory.
- */
-static inline s64
-ia64_pal_cache_line_init (u64 physical_addr, u64 data_value)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_LINE_INIT, physical_addr, data_value, 0);
-	return iprv.status;
-}
-
-
-/* Read the data and tag of a processor controlled cache line for diags */
-static inline s64
-ia64_pal_cache_read (pal_cache_line_id_u_t line_id, u64 physical_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_CACHE_READ, line_id.pclid_data,
-				physical_addr, 0);
-	return iprv.status;
-}
-
-/* Return summary information about the hierarchy of caches controlled by the processor */
-static inline long ia64_pal_cache_summary(unsigned long *cache_levels,
-						unsigned long *unique_caches)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_CACHE_SUMMARY, 0, 0, 0);
-	if (cache_levels)
-		*cache_levels = iprv.v0;
-	if (unique_caches)
-		*unique_caches = iprv.v1;
-	return iprv.status;
-}
-
-/* Write the data and tag of a processor-controlled cache line for diags */
-static inline s64
-ia64_pal_cache_write (pal_cache_line_id_u_t line_id, u64 physical_addr, u64 data)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_CACHE_WRITE, line_id.pclid_data,
-				physical_addr, data);
-	return iprv.status;
-}
-
-
-/* Return the parameters needed to copy relocatable PAL procedures from ROM to memory */
-static inline s64
-ia64_pal_copy_info (u64 copy_type, u64 num_procs, u64 num_iopics,
-		    u64 *buffer_size, u64 *buffer_align)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_COPY_INFO, copy_type, num_procs, num_iopics);
-	if (buffer_size)
-		*buffer_size = iprv.v0;
-	if (buffer_align)
-		*buffer_align = iprv.v1;
-	return iprv.status;
-}
-
-/* Copy relocatable PAL procedures from ROM to memory */
-static inline s64
-ia64_pal_copy_pal (u64 target_addr, u64 alloc_size, u64 processor, u64 *pal_proc_offset)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_COPY_PAL, target_addr, alloc_size, processor);
-	if (pal_proc_offset)
-		*pal_proc_offset = iprv.v0;
-	return iprv.status;
-}
-
-/* Return the number of instruction and data debug register pairs */
-static inline long ia64_pal_debug_info(unsigned long *inst_regs,
-						unsigned long *data_regs)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_DEBUG_INFO, 0, 0, 0);
-	if (inst_regs)
-		*inst_regs = iprv.v0;
-	if (data_regs)
-		*data_regs = iprv.v1;
-
-	return iprv.status;
-}
-
-#ifdef TBD
-/* Switch from IA64-system environment to IA-32 system environment */
-static inline s64
-ia64_pal_enter_ia32_env (ia32_env1, ia32_env2, ia32_env3)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_ENTER_IA_32_ENV, ia32_env1, ia32_env2, ia32_env3);
-	return iprv.status;
-}
-#endif
-
-/* Get unique geographical address of this processor on its bus */
-static inline s64
-ia64_pal_fixed_addr (u64 *global_unique_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_FIXED_ADDR, 0, 0, 0);
-	if (global_unique_addr)
-		*global_unique_addr = iprv.v0;
-	return iprv.status;
-}
-
-/* Get base frequency of the platform if generated by the processor */
-static inline long ia64_pal_freq_base(unsigned long *platform_base_freq)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_FREQ_BASE, 0, 0, 0);
-	if (platform_base_freq)
-		*platform_base_freq = iprv.v0;
-	return iprv.status;
-}
-
-/*
- * Get the ratios for processor frequency, bus frequency and interval timer to
- * the base frequency of the platform
- */
-static inline s64
-ia64_pal_freq_ratios (struct pal_freq_ratio *proc_ratio, struct pal_freq_ratio *bus_ratio,
-		      struct pal_freq_ratio *itc_ratio)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_FREQ_RATIOS, 0, 0, 0);
-	if (proc_ratio)
-		*(u64 *)proc_ratio = iprv.v0;
-	if (bus_ratio)
-		*(u64 *)bus_ratio = iprv.v1;
-	if (itc_ratio)
-		*(u64 *)itc_ratio = iprv.v2;
-	return iprv.status;
-}
-
-/*
- * Get the current hardware resource sharing policy of the processor
- */
-static inline s64
-ia64_pal_get_hw_policy (u64 proc_num, u64 *cur_policy, u64 *num_impacted,
-			u64 *la)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_GET_HW_POLICY, proc_num, 0, 0);
-	if (cur_policy)
-		*cur_policy = iprv.v0;
-	if (num_impacted)
-		*num_impacted = iprv.v1;
-	if (la)
-		*la = iprv.v2;
-	return iprv.status;
-}
-
-/* Make the processor enter HALT or one of the implementation dependent low
- * power states where prefetching and execution are suspended and cache and
- * TLB coherency is not maintained.
- */
-static inline s64
-ia64_pal_halt (u64 halt_state)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_HALT, halt_state, 0, 0);
-	return iprv.status;
-}
-
-typedef union pal_power_mgmt_info_u {
-	u64			ppmi_data;
-	struct {
-	       u64		exit_latency		: 16,
-				entry_latency		: 16,
-				power_consumption	: 28,
-				im			: 1,
-				co			: 1,
-				reserved		: 2;
-	} pal_power_mgmt_info_s;
-} pal_power_mgmt_info_u_t;
-
-/* Return information about processor's optional power management capabilities. */
-static inline s64
-ia64_pal_halt_info (pal_power_mgmt_info_u_t *power_buf)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_HALT_INFO, (unsigned long) power_buf, 0, 0);
-	return iprv.status;
-}
-
-/* Get the current P-state information */
-static inline s64
-ia64_pal_get_pstate (u64 *pstate_index, unsigned long type)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_GET_PSTATE, type, 0, 0);
-	*pstate_index = iprv.v0;
-	return iprv.status;
-}
-
-/* Set the P-state */
-static inline s64
-ia64_pal_set_pstate (u64 pstate_index)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_SET_PSTATE, pstate_index, 0, 0);
-	return iprv.status;
-}
-
-/* Processor branding information*/
-static inline s64
-ia64_pal_get_brand_info (char *brand_info)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_BRAND_INFO, 0, (u64)brand_info, 0);
-	return iprv.status;
-}
-
-/* Cause the processor to enter LIGHT HALT state, where prefetching and execution are
- * suspended, but cache and TLB coherency is maintained.
- */
-static inline s64
-ia64_pal_halt_light (void)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_HALT_LIGHT, 0, 0, 0);
-	return iprv.status;
-}
-
-/* Clear all the processor error logging   registers and reset the indicator that allows
- * the error logging registers to be written. This procedure also checks the pending
- * machine check bit and pending INIT bit and reports their states.
- */
-static inline s64
-ia64_pal_mc_clear_log (u64 *pending_vector)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_CLEAR_LOG, 0, 0, 0);
-	if (pending_vector)
-		*pending_vector = iprv.v0;
-	return iprv.status;
-}
-
-/* Ensure that all outstanding transactions in a processor are completed or that any
- * MCA due to thes outstanding transaction is taken.
- */
-static inline s64
-ia64_pal_mc_drain (void)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_DRAIN, 0, 0, 0);
-	return iprv.status;
-}
-
-/* Return the machine check dynamic processor state */
-static inline s64
-ia64_pal_mc_dynamic_state (u64 info_type, u64 dy_buffer, u64 *size)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_DYNAMIC_STATE, info_type, dy_buffer, 0);
-	if (size)
-		*size = iprv.v0;
-	return iprv.status;
-}
-
-/* Return processor machine check information */
-static inline s64
-ia64_pal_mc_error_info (u64 info_index, u64 type_index, u64 *size, u64 *error_info)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_ERROR_INFO, info_index, type_index, 0);
-	if (size)
-		*size = iprv.v0;
-	if (error_info)
-		*error_info = iprv.v1;
-	return iprv.status;
-}
-
-/* Injects the requested processor error or returns info on
- * supported injection capabilities for current processor implementation
- */
-static inline s64
-ia64_pal_mc_error_inject_phys (u64 err_type_info, u64 err_struct_info,
-			u64 err_data_buffer, u64 *capabilities, u64 *resources)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_MC_ERROR_INJECT, err_type_info,
-			  err_struct_info, err_data_buffer);
-	if (capabilities)
-		*capabilities= iprv.v0;
-	if (resources)
-		*resources= iprv.v1;
-	return iprv.status;
-}
-
-static inline s64
-ia64_pal_mc_error_inject_virt (u64 err_type_info, u64 err_struct_info,
-			u64 err_data_buffer, u64 *capabilities, u64 *resources)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_STK(iprv, PAL_MC_ERROR_INJECT, err_type_info,
-			  err_struct_info, err_data_buffer);
-	if (capabilities)
-		*capabilities= iprv.v0;
-	if (resources)
-		*resources= iprv.v1;
-	return iprv.status;
-}
-
-/* Inform PALE_CHECK whether a machine check is expected so that PALE_CHECK willnot
- * attempt to correct any expected machine checks.
- */
-static inline s64
-ia64_pal_mc_expected (u64 expected, u64 *previous)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_EXPECTED, expected, 0, 0);
-	if (previous)
-		*previous = iprv.v0;
-	return iprv.status;
-}
-
-typedef union pal_hw_tracking_u {
-	u64			pht_data;
-	struct {
-		u64		itc	:4,	/* Instruction cache tracking */
-				dct	:4,	/* Date cache tracking */
-				itt	:4,	/* Instruction TLB tracking */
-				ddt	:4,	/* Data TLB tracking */
-				reserved:48;
-	} pal_hw_tracking_s;
-} pal_hw_tracking_u_t;
-
-/*
- * Hardware tracking status.
- */
-static inline s64
-ia64_pal_mc_hw_tracking (u64 *status)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_HW_TRACKING, 0, 0, 0);
-	if (status)
-		*status = iprv.v0;
-	return iprv.status;
-}
-
-/* Register a platform dependent location with PAL to which it can save
- * minimal processor state in the event of a machine check or initialization
- * event.
- */
-static inline s64
-ia64_pal_mc_register_mem (u64 physical_addr, u64 size, u64 *req_size)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_REGISTER_MEM, physical_addr, size, 0);
-	if (req_size)
-		*req_size = iprv.v0;
-	return iprv.status;
-}
-
-/* Restore minimal architectural processor state, set CMC interrupt if necessary
- * and resume execution
- */
-static inline s64
-ia64_pal_mc_resume (u64 set_cmci, u64 save_ptr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MC_RESUME, set_cmci, save_ptr, 0);
-	return iprv.status;
-}
-
-/* Return the memory attributes implemented by the processor */
-static inline s64
-ia64_pal_mem_attrib (u64 *mem_attrib)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MEM_ATTRIB, 0, 0, 0);
-	if (mem_attrib)
-		*mem_attrib = iprv.v0 & 0xff;
-	return iprv.status;
-}
-
-/* Return the amount of memory needed for second phase of processor
- * self-test and the required alignment of memory.
- */
-static inline s64
-ia64_pal_mem_for_test (u64 *bytes_needed, u64 *alignment)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_MEM_FOR_TEST, 0, 0, 0);
-	if (bytes_needed)
-		*bytes_needed = iprv.v0;
-	if (alignment)
-		*alignment = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_perf_mon_info_u {
-	u64			  ppmi_data;
-	struct {
-	       u64		generic		: 8,
-				width		: 8,
-				cycles		: 8,
-				retired		: 8,
-				reserved	: 32;
-	} pal_perf_mon_info_s;
-} pal_perf_mon_info_u_t;
-
-/* Return the performance monitor information about what can be counted
- * and how to configure the monitors to count the desired events.
- */
-static inline s64
-ia64_pal_perf_mon_info (u64 *pm_buffer, pal_perf_mon_info_u_t *pm_info)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PERF_MON_INFO, (unsigned long) pm_buffer, 0, 0);
-	if (pm_info)
-		pm_info->ppmi_data = iprv.v0;
-	return iprv.status;
-}
-
-/* Specifies the physical address of the processor interrupt block
- * and I/O port space.
- */
-static inline s64
-ia64_pal_platform_addr (u64 type, u64 physical_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PLATFORM_ADDR, type, physical_addr, 0);
-	return iprv.status;
-}
-
-/* Set the SAL PMI entrypoint in memory */
-static inline s64
-ia64_pal_pmi_entrypoint (u64 sal_pmi_entry_addr)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PMI_ENTRYPOINT, sal_pmi_entry_addr, 0, 0);
-	return iprv.status;
-}
-
-struct pal_features_s;
-/* Provide information about configurable processor features */
-static inline s64
-ia64_pal_proc_get_features (u64 *features_avail,
-			    u64 *features_status,
-			    u64 *features_control,
-			    u64 features_set)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, features_set, 0);
-	if (iprv.status == 0) {
-		*features_avail   = iprv.v0;
-		*features_status  = iprv.v1;
-		*features_control = iprv.v2;
-	}
-	return iprv.status;
-}
-
-/* Enable/disable processor dependent features */
-static inline s64
-ia64_pal_proc_set_features (u64 feature_select)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES, feature_select, 0, 0);
-	return iprv.status;
-}
-
-/*
- * Put everything in a struct so we avoid the global offset table whenever
- * possible.
- */
-typedef struct ia64_ptce_info_s {
-	unsigned long	base;
-	u32		count[2];
-	u32		stride[2];
-} ia64_ptce_info_t;
-
-/* Return the information required for the architected loop used to purge
- * (initialize) the entire TC
- */
-static inline s64
-ia64_get_ptce (ia64_ptce_info_t *ptce)
-{
-	struct ia64_pal_retval iprv;
-
-	if (!ptce)
-		return -1;
-
-	PAL_CALL(iprv, PAL_PTCE_INFO, 0, 0, 0);
-	if (iprv.status == 0) {
-		ptce->base = iprv.v0;
-		ptce->count[0] = iprv.v1 >> 32;
-		ptce->count[1] = iprv.v1 & 0xffffffff;
-		ptce->stride[0] = iprv.v2 >> 32;
-		ptce->stride[1] = iprv.v2 & 0xffffffff;
-	}
-	return iprv.status;
-}
-
-/* Return info about implemented application and control registers. */
-static inline s64
-ia64_pal_register_info (u64 info_request, u64 *reg_info_1, u64 *reg_info_2)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_REGISTER_INFO, info_request, 0, 0);
-	if (reg_info_1)
-		*reg_info_1 = iprv.v0;
-	if (reg_info_2)
-		*reg_info_2 = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_hints_u {
-	unsigned long		ph_data;
-	struct {
-	       unsigned long	si		: 1,
-				li		: 1,
-				reserved	: 62;
-	} pal_hints_s;
-} pal_hints_u_t;
-
-/* Return information about the register stack and RSE for this processor
- * implementation.
- */
-static inline long ia64_pal_rse_info(unsigned long *num_phys_stacked,
-							pal_hints_u_t *hints)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_RSE_INFO, 0, 0, 0);
-	if (num_phys_stacked)
-		*num_phys_stacked = iprv.v0;
-	if (hints)
-		hints->ph_data = iprv.v1;
-	return iprv.status;
-}
-
-/*
- * Set the current hardware resource sharing policy of the processor
- */
-static inline s64
-ia64_pal_set_hw_policy (u64 policy)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_SET_HW_POLICY, policy, 0, 0);
-	return iprv.status;
-}
-
-/* Cause the processor to enter	SHUTDOWN state, where prefetching and execution are
- * suspended, but cause cache and TLB coherency to be maintained.
- * This is usually called in IA-32 mode.
- */
-static inline s64
-ia64_pal_shutdown (void)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_SHUTDOWN, 0, 0, 0);
-	return iprv.status;
-}
-
-/* Perform the second phase of processor self-test. */
-static inline s64
-ia64_pal_test_proc (u64 test_addr, u64 test_size, u64 attributes, u64 *self_test_state)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_TEST_PROC, test_addr, test_size, attributes);
-	if (self_test_state)
-		*self_test_state = iprv.v0;
-	return iprv.status;
-}
-
-typedef union  pal_version_u {
-	u64	pal_version_val;
-	struct {
-		u64	pv_pal_b_rev		:	8;
-		u64	pv_pal_b_model		:	8;
-		u64	pv_reserved1		:	8;
-		u64	pv_pal_vendor		:	8;
-		u64	pv_pal_a_rev		:	8;
-		u64	pv_pal_a_model		:	8;
-		u64	pv_reserved2		:	16;
-	} pal_version_s;
-} pal_version_u_t;
-
-
-/*
- * Return PAL version information.  While the documentation states that
- * PAL_VERSION can be called in either physical or virtual mode, some
- * implementations only allow physical calls.  We don't call it very often,
- * so the overhead isn't worth eliminating.
- */
-static inline s64
-ia64_pal_version (pal_version_u_t *pal_min_version, pal_version_u_t *pal_cur_version)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS(iprv, PAL_VERSION, 0, 0, 0);
-	if (pal_min_version)
-		pal_min_version->pal_version_val = iprv.v0;
-
-	if (pal_cur_version)
-		pal_cur_version->pal_version_val = iprv.v1;
-
-	return iprv.status;
-}
-
-typedef union pal_tc_info_u {
-	u64			pti_val;
-	struct {
-	       u64		num_sets	:	8,
-				associativity	:	8,
-				num_entries	:	16,
-				pf		:	1,
-				unified		:	1,
-				reduce_tr	:	1,
-				reserved	:	29;
-	} pal_tc_info_s;
-} pal_tc_info_u_t;
-
-#define tc_reduce_tr		pal_tc_info_s.reduce_tr
-#define tc_unified		pal_tc_info_s.unified
-#define tc_pf			pal_tc_info_s.pf
-#define tc_num_entries		pal_tc_info_s.num_entries
-#define tc_associativity	pal_tc_info_s.associativity
-#define tc_num_sets		pal_tc_info_s.num_sets
-
-
-/* Return information about the virtual memory characteristics of the processor
- * implementation.
- */
-static inline s64
-ia64_pal_vm_info (u64 tc_level, u64 tc_type,  pal_tc_info_u_t *tc_info, u64 *tc_pages)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VM_INFO, tc_level, tc_type, 0);
-	if (tc_info)
-		tc_info->pti_val = iprv.v0;
-	if (tc_pages)
-		*tc_pages = iprv.v1;
-	return iprv.status;
-}
-
-/* Get page size information about the virtual memory characteristics of the processor
- * implementation.
- */
-static inline s64 ia64_pal_vm_page_size(u64 *tr_pages, u64 *vw_pages)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VM_PAGE_SIZE, 0, 0, 0);
-	if (tr_pages)
-		*tr_pages = iprv.v0;
-	if (vw_pages)
-		*vw_pages = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_vm_info_1_u {
-	u64			pvi1_val;
-	struct {
-		u64		vw		: 1,
-				phys_add_size	: 7,
-				key_size	: 8,
-				max_pkr		: 8,
-				hash_tag_id	: 8,
-				max_dtr_entry	: 8,
-				max_itr_entry	: 8,
-				max_unique_tcs	: 8,
-				num_tc_levels	: 8;
-	} pal_vm_info_1_s;
-} pal_vm_info_1_u_t;
-
-#define PAL_MAX_PURGES		0xFFFF		/* all ones is means unlimited */
-
-typedef union pal_vm_info_2_u {
-	u64			pvi2_val;
-	struct {
-		u64		impl_va_msb	: 8,
-				rid_size	: 8,
-				max_purges	: 16,
-				reserved	: 32;
-	} pal_vm_info_2_s;
-} pal_vm_info_2_u_t;
-
-/* Get summary information about the virtual memory characteristics of the processor
- * implementation.
- */
-static inline s64
-ia64_pal_vm_summary (pal_vm_info_1_u_t *vm_info_1, pal_vm_info_2_u_t *vm_info_2)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VM_SUMMARY, 0, 0, 0);
-	if (vm_info_1)
-		vm_info_1->pvi1_val = iprv.v0;
-	if (vm_info_2)
-		vm_info_2->pvi2_val = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_vp_info_u {
-	u64			pvi_val;
-	struct {
-		u64		index:		48,	/* virtual feature set info */
-				vmm_id:		16;	/* feature set id */
-	} pal_vp_info_s;
-} pal_vp_info_u_t;
-
-/*
- * Returns information about virtual processor features
- */
-static inline s64
-ia64_pal_vp_info (u64 feature_set, u64 vp_buffer, u64 *vp_info, u64 *vmm_id)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_VP_INFO, feature_set, vp_buffer, 0);
-	if (vp_info)
-		*vp_info = iprv.v0;
-	if (vmm_id)
-		*vmm_id = iprv.v1;
-	return iprv.status;
-}
-
-typedef union pal_itr_valid_u {
-	u64			piv_val;
-	struct {
-	       u64		access_rights_valid	: 1,
-				priv_level_valid	: 1,
-				dirty_bit_valid		: 1,
-				mem_attr_valid		: 1,
-				reserved		: 60;
-	} pal_tr_valid_s;
-} pal_tr_valid_u_t;
-
-/* Read a translation register */
-static inline s64
-ia64_pal_tr_read (u64 reg_num, u64 tr_type, u64 *tr_buffer, pal_tr_valid_u_t *tr_valid)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL_PHYS_STK(iprv, PAL_VM_TR_READ, reg_num, tr_type,(u64)ia64_tpa(tr_buffer));
-	if (tr_valid)
-		tr_valid->piv_val = iprv.v0;
-	return iprv.status;
-}
-
-/*
- * PAL_PREFETCH_VISIBILITY transaction types
- */
-#define PAL_VISIBILITY_VIRTUAL		0
-#define PAL_VISIBILITY_PHYSICAL		1
-
-/*
- * PAL_PREFETCH_VISIBILITY return codes
- */
-#define PAL_VISIBILITY_OK		1
-#define PAL_VISIBILITY_OK_REMOTE_NEEDED	0
-#define PAL_VISIBILITY_INVAL_ARG	-2
-#define PAL_VISIBILITY_ERROR		-3
-
-static inline s64
-ia64_pal_prefetch_visibility (s64 trans_type)
-{
-	struct ia64_pal_retval iprv;
-	PAL_CALL(iprv, PAL_PREFETCH_VISIBILITY, trans_type, 0, 0);
-	return iprv.status;
-}
-
-/* data structure for getting information on logical to physical mappings */
-typedef union pal_log_overview_u {
-	struct {
-		u64	num_log		:16,	/* Total number of logical
-						 * processors on this die
-						 */
-			tpc		:8,	/* Threads per core */
-			reserved3	:8,	/* Reserved */
-			cpp		:8,	/* Cores per processor */
-			reserved2	:8,	/* Reserved */
-			ppid		:8,	/* Physical processor ID */
-			reserved1	:8;	/* Reserved */
-	} overview_bits;
-	u64 overview_data;
-} pal_log_overview_t;
-
-typedef union pal_proc_n_log_info1_u{
-	struct {
-		u64	tid		:16,	/* Thread id */
-			reserved2	:16,	/* Reserved */
-			cid		:16,	/* Core id */
-			reserved1	:16;	/* Reserved */
-	} ppli1_bits;
-	u64	ppli1_data;
-} pal_proc_n_log_info1_t;
-
-typedef union pal_proc_n_log_info2_u {
-	struct {
-		u64	la		:16,	/* Logical address */
-			reserved	:48;	/* Reserved */
-	} ppli2_bits;
-	u64	ppli2_data;
-} pal_proc_n_log_info2_t;
-
-typedef struct pal_logical_to_physical_s
-{
-	pal_log_overview_t overview;
-	pal_proc_n_log_info1_t ppli1;
-	pal_proc_n_log_info2_t ppli2;
-} pal_logical_to_physical_t;
-
-#define overview_num_log	overview.overview_bits.num_log
-#define overview_tpc		overview.overview_bits.tpc
-#define overview_cpp		overview.overview_bits.cpp
-#define overview_ppid		overview.overview_bits.ppid
-#define log1_tid		ppli1.ppli1_bits.tid
-#define log1_cid		ppli1.ppli1_bits.cid
-#define log2_la			ppli2.ppli2_bits.la
-
-/* Get information on logical to physical processor mappings. */
-static inline s64
-ia64_pal_logical_to_phys(u64 proc_number, pal_logical_to_physical_t *mapping)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_LOGICAL_TO_PHYSICAL, proc_number, 0, 0);
-
-	if (iprv.status == PAL_STATUS_SUCCESS)
-	{
-		mapping->overview.overview_data = iprv.v0;
-		mapping->ppli1.ppli1_data = iprv.v1;
-		mapping->ppli2.ppli2_data = iprv.v2;
-	}
-
-	return iprv.status;
-}
-
-typedef struct pal_cache_shared_info_s
-{
-	u64 num_shared;
-	pal_proc_n_log_info1_t ppli1;
-	pal_proc_n_log_info2_t ppli2;
-} pal_cache_shared_info_t;
-
-/* Get information on logical to physical processor mappings. */
-static inline s64
-ia64_pal_cache_shared_info(u64 level,
-		u64 type,
-		u64 proc_number,
-		pal_cache_shared_info_t *info)
-{
-	struct ia64_pal_retval iprv;
-
-	PAL_CALL(iprv, PAL_CACHE_SHARED_INFO, level, type, proc_number);
-
-	if (iprv.status == PAL_STATUS_SUCCESS) {
-		info->num_shared = iprv.v0;
-		info->ppli1.ppli1_data = iprv.v1;
-		info->ppli2.ppli2_data = iprv.v2;
-	}
-
-	return iprv.status;
-}
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_IA64_PAL_H */
diff --git a/arch/ia64/include/asm/param.h b/arch/ia64/include/asm/param.h
deleted file mode 100644
index f0b786227c40..000000000000
--- a/arch/ia64/include/asm/param.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Fundamental kernel parameters.
- *
- * Based on <asm-i386/param.h>.
- *
- * Modified 1998, 1999, 2002-2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _ASM_IA64_PARAM_H
-#define _ASM_IA64_PARAM_H
-
-#include <uapi/asm/param.h>
-
-# define HZ		CONFIG_HZ
-# define USER_HZ	HZ
-# define CLOCKS_PER_SEC	HZ	/* frequency at which times() counts */
-#endif /* _ASM_IA64_PARAM_H */
diff --git a/arch/ia64/include/asm/parport.h b/arch/ia64/include/asm/parport.h
deleted file mode 100644
index 360ca9bf2f6f..000000000000
--- a/arch/ia64/include/asm/parport.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * parport.h: platform-specific PC-style parport initialisation
- *
- * Copyright (C) 1999, 2000  Tim Waugh <tim@cyberelk.demon.co.uk>
- *
- * This file should only be included by drivers/parport/parport_pc.c.
- */
-
-#ifndef _ASM_IA64_PARPORT_H
-#define _ASM_IA64_PARPORT_H 1
-
-static int parport_pc_find_isa_ports(int autoirq, int autodma);
-
-static int parport_pc_find_nonpci_ports(int autoirq, int autodma)
-{
-	return parport_pc_find_isa_ports(autoirq, autodma);
-}
-
-#endif /* _ASM_IA64_PARPORT_H */
diff --git a/arch/ia64/include/asm/patch.h b/arch/ia64/include/asm/patch.h
deleted file mode 100644
index bd487ed22bf5..000000000000
--- a/arch/ia64/include/asm/patch.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PATCH_H
-#define _ASM_IA64_PATCH_H
-
-/*
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * There are a number of reasons for patching instructions.  Rather than duplicating code
- * all over the place, we put the common stuff here.  Reasons for patching: in-kernel
- * module-loader, virtual-to-physical patch-list, McKinley Errata 9 workaround, and gate
- * shared library.  Undoubtedly, some of these reasons will disappear and others will
- * be added over time.
- */
-#include <linux/elf.h>
-#include <linux/types.h>
-
-extern void ia64_patch (u64 insn_addr, u64 mask, u64 val);	/* patch any insn slot */
-extern void ia64_patch_imm64 (u64 insn_addr, u64 val);		/* patch "movl" w/abs. value*/
-extern void ia64_patch_imm60 (u64 insn_addr, u64 val);		/* patch "brl" w/ip-rel value */
-
-extern void ia64_patch_mckinley_e9 (unsigned long start, unsigned long end);
-extern void ia64_patch_vtop (unsigned long start, unsigned long end);
-extern void ia64_patch_phys_stack_reg(unsigned long val);
-extern void ia64_patch_rse (unsigned long start, unsigned long end);
-extern void ia64_patch_gate (void);
-
-#endif /* _ASM_IA64_PATCH_H */
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
deleted file mode 100644
index fa8f545c24c9..000000000000
--- a/arch/ia64/include/asm/pci.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PCI_H
-#define _ASM_IA64_PCI_H
-
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/scatterlist.h>
-
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-
-struct pci_vector_struct {
-	__u16 segment;	/* PCI Segment number */
-	__u16 bus;	/* PCI Bus number */
-	__u32 pci_id;	/* ACPI split 16 bits device, 16 bits function (see section 6.1.1) */
-	__u8 pin;	/* PCI PIN (0 = A, 1 = B, 2 = C, 3 = D) */
-	__u32 irq;	/* IRQ assigned */
-};
-
-/*
- * Can be used to override the logic in pci_scan_bus for skipping already-configured bus
- * numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the
- * loader.
- */
-#define pcibios_assign_all_busses()     0
-
-#define PCIBIOS_MIN_IO		0x1000
-#define PCIBIOS_MIN_MEM		0x10000000
-
-#define HAVE_PCI_MMAP
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE
-#define arch_can_pci_mmap_wc()	1
-
-#define HAVE_PCI_LEGACY
-extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
-				      struct vm_area_struct *vma,
-				      enum pci_mmap_state mmap_state);
-
-char *pci_get_legacy_mem(struct pci_bus *bus);
-int pci_legacy_read(struct pci_bus *bus, u16 port, u32 *val, u8 size);
-int pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size);
-
-struct pci_controller {
-	struct acpi_device *companion;
-	void *iommu;
-	int segment;
-	int node;		/* nearest node with memory or NUMA_NO_NODE for global allocation */
-
-	void *platform_data;
-};
-
-
-#define PCI_CONTROLLER(busdev) ((struct pci_controller *) busdev->sysdata)
-#define pci_domain_nr(busdev)    (PCI_CONTROLLER(busdev)->segment)
-
-extern struct pci_ops pci_root_ops;
-
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	return (pci_domain_nr(bus) != 0);
-}
-
-#endif /* _ASM_IA64_PCI_H */
diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h
deleted file mode 100644
index f357b9bb3576..000000000000
--- a/arch/ia64/include/asm/percpu.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PERCPU_H
-#define _ASM_IA64_PERCPU_H
-
-/*
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#ifdef __ASSEMBLY__
-# define THIS_CPU(var)	(var)  /* use this to mark accesses to per-CPU variables... */
-#else /* !__ASSEMBLY__ */
-
-
-#include <linux/threads.h>
-
-#ifdef CONFIG_SMP
-
-#ifdef HAVE_MODEL_SMALL_ATTRIBUTE
-# define PER_CPU_ATTRIBUTES	__attribute__((__model__ (__small__)))
-#endif
-
-#define __my_cpu_offset	__ia64_per_cpu_var(local_per_cpu_offset)
-
-extern void *per_cpu_init(void);
-
-#else /* ! SMP */
-
-#define per_cpu_init()				(__phys_per_cpu_start)
-
-#endif	/* SMP */
-
-#define PER_CPU_BASE_SECTION ".data..percpu"
-
-/*
- * Be extremely careful when taking the address of this variable!  Due to virtual
- * remapping, it is different from the canonical address returned by this_cpu_ptr(&var)!
- * On the positive side, using __ia64_per_cpu_var() instead of this_cpu_ptr() is slightly
- * more efficient.
- */
-#define __ia64_per_cpu_var(var) (*({					\
-	__verify_pcpu_ptr(&(var));					\
-	((typeof(var) __kernel __force *)&(var));			\
-}))
-
-#include <asm-generic/percpu.h>
-
-/* Equal to __per_cpu_offset[smp_processor_id()], but faster to access: */
-DECLARE_PER_CPU(unsigned long, local_per_cpu_offset);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_IA64_PERCPU_H */
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
deleted file mode 100644
index 0fb2b6291d58..000000000000
--- a/arch/ia64/include/asm/pgalloc.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PGALLOC_H
-#define _ASM_IA64_PGALLOC_H
-
-/*
- * This file contains the functions and defines necessary to allocate
- * page tables.
- *
- * This hopefully works with any (fixed) ia-64 page-size, as defined
- * in <asm/page.h> (currently 8192).
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2000, Goutham Rao <goutham.rao@intel.com>
- */
-
-
-#include <linux/compiler.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/threads.h>
-
-#include <asm-generic/pgalloc.h>
-
-#include <asm/mmu_context.h>
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-#if CONFIG_PGTABLE_LEVELS == 4
-static inline void
-p4d_populate(struct mm_struct *mm, p4d_t * p4d_entry, pud_t * pud)
-{
-	p4d_val(*p4d_entry) = __pa(pud);
-}
-
-#define __pud_free_tlb(tlb, pud, address)	pud_free((tlb)->mm, pud)
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
-
-static inline void
-pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
-{
-	pud_val(*pud_entry) = __pa(pmd);
-}
-
-#define __pmd_free_tlb(tlb, pmd, address)	pmd_free((tlb)->mm, pmd)
-
-static inline void
-pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte)
-{
-	pmd_val(*pmd_entry) = page_to_phys(pte);
-}
-
-static inline void
-pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
-{
-	pmd_val(*pmd_entry) = __pa(pte);
-}
-
-#define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)
-
-#endif				/* _ASM_IA64_PGALLOC_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
deleted file mode 100644
index 9be2d2ba6016..000000000000
--- a/arch/ia64/include/asm/pgtable.h
+++ /dev/null
@@ -1,545 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PGTABLE_H
-#define _ASM_IA64_PGTABLE_H
-
-/*
- * This file contains the functions and defines necessary to modify and use
- * the IA-64 page table tree.
- *
- * This hopefully works with any (fixed) IA-64 page-size, as defined
- * in <asm/page.h>.
- *
- * Copyright (C) 1998-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/mman.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <asm/types.h>
-
-#define IA64_MAX_PHYS_BITS	50	/* max. number of physical address bits (architected) */
-
-/*
- * First, define the various bits in a PTE.  Note that the PTE format
- * matches the VHPT short format, the firt doubleword of the VHPD long
- * format, and the first doubleword of the TLB insertion format.
- */
-#define _PAGE_P_BIT		0
-#define _PAGE_A_BIT		5
-#define _PAGE_D_BIT		6
-
-#define _PAGE_P			(1 << _PAGE_P_BIT)	/* page present bit */
-#define _PAGE_MA_WB		(0x0 <<  2)	/* write back memory attribute */
-#define _PAGE_MA_UC		(0x4 <<  2)	/* uncacheable memory attribute */
-#define _PAGE_MA_UCE		(0x5 <<  2)	/* UC exported attribute */
-#define _PAGE_MA_WC		(0x6 <<  2)	/* write coalescing memory attribute */
-#define _PAGE_MA_NAT		(0x7 <<  2)	/* not-a-thing attribute */
-#define _PAGE_MA_MASK		(0x7 <<  2)
-#define _PAGE_PL_0		(0 <<  7)	/* privilege level 0 (kernel) */
-#define _PAGE_PL_1		(1 <<  7)	/* privilege level 1 (unused) */
-#define _PAGE_PL_2		(2 <<  7)	/* privilege level 2 (unused) */
-#define _PAGE_PL_3		(3 <<  7)	/* privilege level 3 (user) */
-#define _PAGE_PL_MASK		(3 <<  7)
-#define _PAGE_AR_R		(0 <<  9)	/* read only */
-#define _PAGE_AR_RX		(1 <<  9)	/* read & execute */
-#define _PAGE_AR_RW		(2 <<  9)	/* read & write */
-#define _PAGE_AR_RWX		(3 <<  9)	/* read, write & execute */
-#define _PAGE_AR_R_RW		(4 <<  9)	/* read / read & write */
-#define _PAGE_AR_RX_RWX		(5 <<  9)	/* read & exec / read, write & exec */
-#define _PAGE_AR_RWX_RW		(6 <<  9)	/* read, write & exec / read & write */
-#define _PAGE_AR_X_RX		(7 <<  9)	/* exec & promote / read & exec */
-#define _PAGE_AR_MASK		(7 <<  9)
-#define _PAGE_AR_SHIFT		9
-#define _PAGE_A			(1 << _PAGE_A_BIT)	/* page accessed bit */
-#define _PAGE_D			(1 << _PAGE_D_BIT)	/* page dirty bit */
-#define _PAGE_PPN_MASK		(((__IA64_UL(1) << IA64_MAX_PHYS_BITS) - 1) & ~0xfffUL)
-#define _PAGE_ED		(__IA64_UL(1) << 52)	/* exception deferral */
-#define _PAGE_PROTNONE		(__IA64_UL(1) << 63)
-
-/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
-#define _PAGE_SWP_EXCLUSIVE	(1 << 7)
-
-#define _PFN_MASK		_PAGE_PPN_MASK
-/* Mask of bits which may be changed by pte_modify(); the odd bits are there for _PAGE_PROTNONE */
-#define _PAGE_CHG_MASK	(_PAGE_P | _PAGE_PROTNONE | _PAGE_PL_MASK | _PAGE_AR_MASK | _PAGE_ED)
-
-#define _PAGE_SIZE_4K	12
-#define _PAGE_SIZE_8K	13
-#define _PAGE_SIZE_16K	14
-#define _PAGE_SIZE_64K	16
-#define _PAGE_SIZE_256K	18
-#define _PAGE_SIZE_1M	20
-#define _PAGE_SIZE_4M	22
-#define _PAGE_SIZE_16M	24
-#define _PAGE_SIZE_64M	26
-#define _PAGE_SIZE_256M	28
-#define _PAGE_SIZE_1G	30
-#define _PAGE_SIZE_4G	32
-
-#define __ACCESS_BITS		_PAGE_ED | _PAGE_A | _PAGE_P | _PAGE_MA_WB
-#define __DIRTY_BITS_NO_ED	_PAGE_A | _PAGE_P | _PAGE_D | _PAGE_MA_WB
-#define __DIRTY_BITS		_PAGE_ED | __DIRTY_BITS_NO_ED
-
-/*
- * How many pointers will a page table level hold expressed in shift
- */
-#define PTRS_PER_PTD_SHIFT	(PAGE_SHIFT-3)
-
-/*
- * Definitions for fourth level:
- */
-#define PTRS_PER_PTE	(__IA64_UL(1) << (PTRS_PER_PTD_SHIFT))
-
-/*
- * Definitions for third level:
- *
- * PMD_SHIFT determines the size of the area a third-level page table
- * can map.
- */
-#define PMD_SHIFT	(PAGE_SHIFT + (PTRS_PER_PTD_SHIFT))
-#define PMD_SIZE	(1UL << PMD_SHIFT)
-#define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1UL << (PTRS_PER_PTD_SHIFT))
-
-#if CONFIG_PGTABLE_LEVELS == 4
-/*
- * Definitions for second level:
- *
- * PUD_SHIFT determines the size of the area a second-level page table
- * can map.
- */
-#define PUD_SHIFT	(PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
-#define PUD_SIZE	(1UL << PUD_SHIFT)
-#define PUD_MASK	(~(PUD_SIZE-1))
-#define PTRS_PER_PUD	(1UL << (PTRS_PER_PTD_SHIFT))
-#endif
-
-/*
- * Definitions for first level:
- *
- * PGDIR_SHIFT determines what a first-level page table entry can map.
- */
-#if CONFIG_PGTABLE_LEVELS == 4
-#define PGDIR_SHIFT		(PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
-#else
-#define PGDIR_SHIFT		(PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
-#endif
-#define PGDIR_SIZE		(__IA64_UL(1) << PGDIR_SHIFT)
-#define PGDIR_MASK		(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD_SHIFT	PTRS_PER_PTD_SHIFT
-#define PTRS_PER_PGD		(1UL << PTRS_PER_PGD_SHIFT)
-#define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
-
-/*
- * All the normal masks have the "page accessed" bits on, as any time
- * they are used, the page is accessed. They are cleared only by the
- * page-out routines.
- */
-#define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_A)
-#define PAGE_SHARED	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RW)
-#define PAGE_READONLY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
-#define PAGE_COPY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
-#define PAGE_COPY_EXEC	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX)
-#define PAGE_GATE	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
-#define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
-#define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
-#define PAGE_KERNEL_UC	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX | \
-				 _PAGE_MA_UC)
-
-# ifndef __ASSEMBLY__
-
-#include <linux/sched/mm.h>	/* for mm_struct */
-#include <linux/bitops.h>
-#include <asm/cacheflush.h>
-#include <asm/mmu_context.h>
-
-/*
- * Next come the mappings that determine how mmap() protection bits
- * (PROT_EXEC, PROT_READ, PROT_WRITE, PROT_NONE) get implemented.  The
- * _P version gets used for a private shared memory segment, the _S
- * version gets used for a shared memory segment with MAP_SHARED on.
- * In a private shared memory segment, we do a copy-on-write if a task
- * attempts to write to the page.
- */
-	/* xwr */
-#define pgd_ERROR(e)	printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
-#if CONFIG_PGTABLE_LEVELS == 4
-#define pud_ERROR(e)	printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
-#endif
-#define pmd_ERROR(e)	printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
-#define pte_ERROR(e)	printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
-
-
-/*
- * Some definitions to translate between mem_map, PTEs, and page addresses:
- */
-
-
-/* Quick test to see if ADDR is a (potentially) valid physical address. */
-static inline long
-ia64_phys_addr_valid (unsigned long addr)
-{
-	return (addr & (local_cpu_data->unimpl_pa_mask)) == 0;
-}
-
-/*
- * Now come the defines and routines to manage and access the three-level
- * page table.
- */
-
-
-#define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
-#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP)
-/* SPARSEMEM_VMEMMAP uses half of vmalloc... */
-# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 10)))
-# define vmemmap		((struct page *)VMALLOC_END)
-#else
-# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
-#endif
-
-/* fs/proc/kcore.c */
-#define	kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE))
-#define	kc_offset_to_vaddr(o) ((o) + RGN_BASE(RGN_GATE))
-
-#define RGN_MAP_SHIFT (PGDIR_SHIFT + PTRS_PER_PGD_SHIFT - 3)
-#define RGN_MAP_LIMIT	((1UL << RGN_MAP_SHIFT) - PAGE_SIZE)	/* per region addr limit */
-
-#define PFN_PTE_SHIFT	PAGE_SHIFT
-/*
- * Conversion functions: convert page frame number (pfn) and a protection value to a page
- * table entry (pte).
- */
-#define pfn_pte(pfn, pgprot) \
-({ pte_t __pte; pte_val(__pte) = ((pfn) << PAGE_SHIFT) | pgprot_val(pgprot); __pte; })
-
-/* Extract pfn from pte.  */
-#define pte_pfn(_pte)		((pte_val(_pte) & _PFN_MASK) >> PAGE_SHIFT)
-
-#define mk_pte(page, pgprot)	pfn_pte(page_to_pfn(page), (pgprot))
-
-/* This takes a physical page address that is used by the remapping functions */
-#define mk_pte_phys(physpage, pgprot) \
-({ pte_t __pte; pte_val(__pte) = physpage + pgprot_val(pgprot); __pte; })
-
-#define pte_modify(_pte, newprot) \
-	(__pte((pte_val(_pte) & ~_PAGE_CHG_MASK) | (pgprot_val(newprot) & _PAGE_CHG_MASK)))
-
-#define pte_none(pte) 			(!pte_val(pte))
-#define pte_present(pte)		(pte_val(pte) & (_PAGE_P | _PAGE_PROTNONE))
-#define pte_clear(mm,addr,pte)		(pte_val(*(pte)) = 0UL)
-/* pte_page() returns the "struct page *" corresponding to the PTE: */
-#define pte_page(pte)			virt_to_page(((pte_val(pte) & _PFN_MASK) + PAGE_OFFSET))
-
-#define pmd_none(pmd)			(!pmd_val(pmd))
-#define pmd_bad(pmd)			(!ia64_phys_addr_valid(pmd_val(pmd)))
-#define pmd_present(pmd)		(pmd_val(pmd) != 0UL)
-#define pmd_clear(pmdp)			(pmd_val(*(pmdp)) = 0UL)
-#define pmd_page_vaddr(pmd)		((unsigned long) __va(pmd_val(pmd) & _PFN_MASK))
-#define pmd_pfn(pmd)			((pmd_val(pmd) & _PFN_MASK) >> PAGE_SHIFT)
-#define pmd_page(pmd)			virt_to_page((pmd_val(pmd) + PAGE_OFFSET))
-
-#define pud_none(pud)			(!pud_val(pud))
-#define pud_bad(pud)			(!ia64_phys_addr_valid(pud_val(pud)))
-#define pud_present(pud)		(pud_val(pud) != 0UL)
-#define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-#define pud_pgtable(pud)		((pmd_t *) __va(pud_val(pud) & _PFN_MASK))
-#define pud_page(pud)			virt_to_page((pud_val(pud) + PAGE_OFFSET))
-
-#if CONFIG_PGTABLE_LEVELS == 4
-#define p4d_none(p4d)			(!p4d_val(p4d))
-#define p4d_bad(p4d)			(!ia64_phys_addr_valid(p4d_val(p4d)))
-#define p4d_present(p4d)		(p4d_val(p4d) != 0UL)
-#define p4d_clear(p4dp)			(p4d_val(*(p4dp)) = 0UL)
-#define p4d_pgtable(p4d)		((pud_t *) __va(p4d_val(p4d) & _PFN_MASK))
-#define p4d_page(p4d)			virt_to_page((p4d_val(p4d) + PAGE_OFFSET))
-#endif
-
-/*
- * The following have defined behavior only work if pte_present() is true.
- */
-#define pte_write(pte)	((unsigned) (((pte_val(pte) & _PAGE_AR_MASK) >> _PAGE_AR_SHIFT) - 2) <= 4)
-#define pte_exec(pte)		((pte_val(pte) & _PAGE_AR_RX) != 0)
-#define pte_dirty(pte)		((pte_val(pte) & _PAGE_D) != 0)
-#define pte_young(pte)		((pte_val(pte) & _PAGE_A) != 0)
-
-/*
- * Note: we convert AR_RWX to AR_RX and AR_RW to AR_R by clearing the 2nd bit in the
- * access rights:
- */
-#define pte_wrprotect(pte)	(__pte(pte_val(pte) & ~_PAGE_AR_RW))
-#define pte_mkwrite_novma(pte)	(__pte(pte_val(pte) | _PAGE_AR_RW))
-#define pte_mkold(pte)		(__pte(pte_val(pte) & ~_PAGE_A))
-#define pte_mkyoung(pte)	(__pte(pte_val(pte) | _PAGE_A))
-#define pte_mkclean(pte)	(__pte(pte_val(pte) & ~_PAGE_D))
-#define pte_mkdirty(pte)	(__pte(pte_val(pte) | _PAGE_D))
-#define pte_mkhuge(pte)		(__pte(pte_val(pte)))
-
-/*
- * Because ia64's Icache and Dcache is not coherent (on a cpu), we need to
- * sync icache and dcache when we insert *new* executable page.
- *  __ia64_sync_icache_dcache() check Pg_arch_1 bit and flush icache
- * if necessary.
- *
- *  set_pte() is also called by the kernel, but we can expect that the kernel
- *  flushes icache explicitly if necessary.
- */
-#define pte_present_exec_user(pte)\
-	((pte_val(pte) & (_PAGE_P | _PAGE_PL_MASK | _PAGE_AR_RX)) == \
-		(_PAGE_P | _PAGE_PL_3 | _PAGE_AR_RX))
-
-extern void __ia64_sync_icache_dcache(pte_t pteval);
-static inline void set_pte(pte_t *ptep, pte_t pteval)
-{
-	/* page is present && page is user  && page is executable
-	 * && (page swapin or new page or page migration
-	 *	|| copy_on_write with page copying.)
-	 */
-	if (pte_present_exec_user(pteval) &&
-	    (!pte_present(*ptep) ||
-		pte_pfn(*ptep) != pte_pfn(pteval)))
-		/* load_module() calles flush_icache_range() explicitly*/
-		__ia64_sync_icache_dcache(pteval);
-	*ptep = pteval;
-}
-
-/*
- * Make page protection values cacheable, uncacheable, or write-
- * combining.  Note that "protection" is really a misnomer here as the
- * protection value contains the memory attribute bits, dirty bits, and
- * various other bits as well.
- */
-#define pgprot_cacheable(prot)		__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WB)
-#define pgprot_noncached(prot)		__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_UC)
-#define pgprot_writecombine(prot)	__pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WC)
-
-struct file;
-extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
-				     unsigned long size, pgprot_t vma_prot);
-#define __HAVE_PHYS_MEM_ACCESS_PROT
-
-static inline unsigned long
-pgd_index (unsigned long address)
-{
-	unsigned long region = address >> 61;
-	unsigned long l1index = (address >> PGDIR_SHIFT) & ((PTRS_PER_PGD >> 3) - 1);
-
-	return (region << (PAGE_SHIFT - 6)) | l1index;
-}
-#define pgd_index pgd_index
-
-/*
- * In the kernel's mapped region we know everything is in region number 5, so
- * as an optimisation its PGD already points to the area for that region.
- * However, this also means that we cannot use pgd_index() and we must
- * never add the region here.
- */
-#define pgd_offset_k(addr) \
-	(init_mm.pgd + (((addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)))
-
-/* Look up a pgd entry in the gate area.  On IA-64, the gate-area
-   resides in the kernel-mapped segment, hence we use pgd_offset_k()
-   here.  */
-#define pgd_offset_gate(mm, addr)	pgd_offset_k(addr)
-
-/* atomic versions of the some PTE manipulations: */
-
-static inline int
-ptep_test_and_clear_young (struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	if (!pte_young(*ptep))
-		return 0;
-	return test_and_clear_bit(_PAGE_A_BIT, ptep);
-#else
-	pte_t pte = *ptep;
-	if (!pte_young(pte))
-		return 0;
-	set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte));
-	return 1;
-#endif
-}
-
-static inline pte_t
-ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	return __pte(xchg((long *) ptep, 0));
-#else
-	pte_t pte = *ptep;
-	pte_clear(mm, addr, ptep);
-	return pte;
-#endif
-}
-
-static inline void
-ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-#ifdef CONFIG_SMP
-	unsigned long new, old;
-
-	do {
-		old = pte_val(*ptep);
-		new = pte_val(pte_wrprotect(__pte (old)));
-	} while (cmpxchg((unsigned long *) ptep, old, new) != old);
-#else
-	pte_t old_pte = *ptep;
-	set_pte_at(mm, addr, ptep, pte_wrprotect(old_pte));
-#endif
-}
-
-static inline int
-pte_same (pte_t a, pte_t b)
-{
-	return pte_val(a) == pte_val(b);
-}
-
-#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do { } while (0)
-#define update_mmu_cache(vma, address, ptep) do { } while (0)
-
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-extern void paging_init (void);
-
-/*
- * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
- * are !pte_none() && !pte_present().
- *
- * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of
- *	 bits in the swap-type field of the swap pte.  It would be nice to
- *	 enforce that, but we can't easily include <linux/swap.h> here.
- *	 (Of course, better still would be to define MAX_SWAPFILES_SHIFT here...).
- *
- * Format of swap pte:
- *	bit   0   : present bit (must be zero)
- *	bits  1- 6: swap type
- *	bit   7   : exclusive marker
- *	bits  8-62: swap offset
- *	bit  63   : _PAGE_PROTNONE bit
- */
-#define __swp_type(entry)		(((entry).val >> 1) & 0x3f)
-#define __swp_offset(entry)		(((entry).val << 1) >> 9)
-#define __swp_entry(type, offset)	((swp_entry_t) { ((type & 0x3f) << 1) | \
-							 ((long) (offset) << 8) })
-#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(x)		((pte_t) { (x).val })
-
-static inline int pte_swp_exclusive(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
-}
-
-static inline pte_t pte_swp_mkexclusive(pte_t pte)
-{
-	pte_val(pte) |= _PAGE_SWP_EXCLUSIVE;
-	return pte;
-}
-
-static inline pte_t pte_swp_clear_exclusive(pte_t pte)
-{
-	pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE;
-	return pte;
-}
-
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
-extern struct page *zero_page_memmap_ptr;
-#define ZERO_PAGE(vaddr) (zero_page_memmap_ptr)
-
-/* We provide our own get_unmapped_area to cope with VA holes for userland */
-#define HAVE_ARCH_UNMAPPED_AREA
-
-#ifdef CONFIG_HUGETLB_PAGE
-#define HUGETLB_PGDIR_SHIFT	(HPAGE_SHIFT + 2*(PAGE_SHIFT-3))
-#define HUGETLB_PGDIR_SIZE	(__IA64_UL(1) << HUGETLB_PGDIR_SHIFT)
-#define HUGETLB_PGDIR_MASK	(~(HUGETLB_PGDIR_SIZE-1))
-#endif
-
-
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-/*
- * Update PTEP with ENTRY, which is guaranteed to be a less
- * restrictive PTE.  That is, ENTRY may have the ACCESSED, DIRTY, and
- * WRITABLE bits turned on, when the value at PTEP did not.  The
- * WRITABLE bit may only be turned if SAFELY_WRITABLE is TRUE.
- *
- * SAFELY_WRITABLE is TRUE if we can update the value at PTEP without
- * having to worry about races.  On SMP machines, there are only two
- * cases where this is true:
- *
- *	(1) *PTEP has the PRESENT bit turned OFF
- *	(2) ENTRY has the DIRTY bit turned ON
- *
- * On ia64, we could implement this routine with a cmpxchg()-loop
- * which ORs in the _PAGE_A/_PAGE_D bit if they're set in ENTRY.
- * However, like on x86, we can get a more streamlined version by
- * observing that it is OK to drop ACCESSED bit updates when
- * SAFELY_WRITABLE is FALSE.  Besides being rare, all that would do is
- * result in an extra Access-bit fault, which would then turn on the
- * ACCESSED bit in the low-level fault handler (iaccess_bit or
- * daccess_bit in ivt.S).
- */
-#ifdef CONFIG_SMP
-# define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __safely_writable) \
-({									\
-	int __changed = !pte_same(*(__ptep), __entry);			\
-	if (__changed && __safely_writable) {				\
-		set_pte(__ptep, __entry);				\
-		flush_tlb_page(__vma, __addr);				\
-	}								\
-	__changed;							\
-})
-#else
-# define ptep_set_access_flags(__vma, __addr, __ptep, __entry, __safely_writable) \
-({									\
-	int __changed = !pte_same(*(__ptep), __entry);			\
-	if (__changed) {						\
-		set_pte_at((__vma)->vm_mm, (__addr), __ptep, __entry);	\
-		flush_tlb_page(__vma, __addr);				\
-	}								\
-	__changed;							\
-})
-#endif
-# endif /* !__ASSEMBLY__ */
-
-/*
- * Identity-mapped regions use a large page size.  We'll call such large pages
- * "granules".  If you can think of a better name that's unambiguous, let me
- * know...
- */
-#if defined(CONFIG_IA64_GRANULE_64MB)
-# define IA64_GRANULE_SHIFT	_PAGE_SIZE_64M
-#elif defined(CONFIG_IA64_GRANULE_16MB)
-# define IA64_GRANULE_SHIFT	_PAGE_SIZE_16M
-#endif
-#define IA64_GRANULE_SIZE	(1 << IA64_GRANULE_SHIFT)
-/*
- * log2() of the page size we use to map the kernel image (IA64_TR_KERNEL):
- */
-#define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
-#define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)
-
-/* These tell get_user_pages() that the first gate page is accessible from user-level.  */
-#define FIXADDR_USER_START	GATE_ADDR
-#ifdef HAVE_BUGGY_SEGREL
-# define FIXADDR_USER_END	(GATE_ADDR + 2*PAGE_SIZE)
-#else
-# define FIXADDR_USER_END	(GATE_ADDR + 2*PERCPU_PAGE_SIZE)
-#endif
-
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTE_SAME
-#define __HAVE_ARCH_PGD_OFFSET_GATE
-
-
-#if CONFIG_PGTABLE_LEVELS == 3
-#include <asm-generic/pgtable-nopud.h>
-#endif
-#include <asm-generic/pgtable-nop4d.h>
-
-#endif /* _ASM_IA64_PGTABLE_H */
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h
deleted file mode 100644
index 47e3801b526a..000000000000
--- a/arch/ia64/include/asm/processor.h
+++ /dev/null
@@ -1,660 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_PROCESSOR_H
-#define _ASM_IA64_PROCESSOR_H
-
-/*
- * Copyright (C) 1998-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- *
- * 11/24/98	S.Eranian	added ia64_set_iva()
- * 12/03/99	D. Mosberger	implement thread_saved_pc() via kernel unwind API
- * 06/16/00	A. Mallick	added csd/ssd/tssd for ia32 support
- */
-
-
-#include <asm/intrinsics.h>
-#include <asm/kregs.h>
-#include <asm/ptrace.h>
-#include <asm/ustack.h>
-
-#define IA64_NUM_PHYS_STACK_REG	96
-#define IA64_NUM_DBG_REGS	8
-
-#define DEFAULT_MAP_BASE	__IA64_UL_CONST(0x2000000000000000)
-#define DEFAULT_TASK_SIZE	__IA64_UL_CONST(0xa000000000000000)
-
-/*
- * TASK_SIZE really is a mis-named.  It really is the maximum user
- * space address (plus one).  On IA-64, there are five regions of 2TB
- * each (assuming 8KB page size), for a total of 8TB of user virtual
- * address space.
- */
-#define TASK_SIZE       	DEFAULT_TASK_SIZE
-
-/*
- * This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define TASK_UNMAPPED_BASE	(current->thread.map_base)
-
-#define IA64_THREAD_FPH_VALID	(__IA64_UL(1) << 0)	/* floating-point high state valid? */
-#define IA64_THREAD_DBG_VALID	(__IA64_UL(1) << 1)	/* debug registers valid? */
-#define IA64_THREAD_PM_VALID	(__IA64_UL(1) << 2)	/* performance registers valid? */
-#define IA64_THREAD_UAC_NOPRINT	(__IA64_UL(1) << 3)	/* don't log unaligned accesses */
-#define IA64_THREAD_UAC_SIGBUS	(__IA64_UL(1) << 4)	/* generate SIGBUS on unaligned acc. */
-#define IA64_THREAD_MIGRATION	(__IA64_UL(1) << 5)	/* require migration
-							   sync at ctx sw */
-#define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6)	/* don't log any fpswa faults */
-#define IA64_THREAD_FPEMU_SIGFPE  (__IA64_UL(1) << 7)	/* send a SIGFPE for fpswa faults */
-
-#define IA64_THREAD_UAC_SHIFT	3
-#define IA64_THREAD_UAC_MASK	(IA64_THREAD_UAC_NOPRINT | IA64_THREAD_UAC_SIGBUS)
-#define IA64_THREAD_FPEMU_SHIFT	6
-#define IA64_THREAD_FPEMU_MASK	(IA64_THREAD_FPEMU_NOPRINT | IA64_THREAD_FPEMU_SIGFPE)
-
-
-/*
- * This shift should be large enough to be able to represent 1000000000/itc_freq with good
- * accuracy while being small enough to fit 10*1000000000<<IA64_NSEC_PER_CYC_SHIFT in 64 bits
- * (this will give enough slack to represent 10 seconds worth of time as a scaled number).
- */
-#define IA64_NSEC_PER_CYC_SHIFT	30
-
-#ifndef __ASSEMBLY__
-
-#include <linux/cache.h>
-#include <linux/compiler.h>
-#include <linux/threads.h>
-#include <linux/types.h>
-#include <linux/bitops.h>
-
-#include <asm/fpu.h>
-#include <asm/page.h>
-#include <asm/percpu.h>
-#include <asm/rse.h>
-#include <asm/unwind.h>
-#include <linux/atomic.h>
-#ifdef CONFIG_NUMA
-#include <asm/nodedata.h>
-#endif
-
-/* like above but expressed as bitfields for more efficient access: */
-struct ia64_psr {
-	__u64 reserved0 : 1;
-	__u64 be : 1;
-	__u64 up : 1;
-	__u64 ac : 1;
-	__u64 mfl : 1;
-	__u64 mfh : 1;
-	__u64 reserved1 : 7;
-	__u64 ic : 1;
-	__u64 i : 1;
-	__u64 pk : 1;
-	__u64 reserved2 : 1;
-	__u64 dt : 1;
-	__u64 dfl : 1;
-	__u64 dfh : 1;
-	__u64 sp : 1;
-	__u64 pp : 1;
-	__u64 di : 1;
-	__u64 si : 1;
-	__u64 db : 1;
-	__u64 lp : 1;
-	__u64 tb : 1;
-	__u64 rt : 1;
-	__u64 reserved3 : 4;
-	__u64 cpl : 2;
-	__u64 is : 1;
-	__u64 mc : 1;
-	__u64 it : 1;
-	__u64 id : 1;
-	__u64 da : 1;
-	__u64 dd : 1;
-	__u64 ss : 1;
-	__u64 ri : 2;
-	__u64 ed : 1;
-	__u64 bn : 1;
-	__u64 reserved4 : 19;
-};
-
-union ia64_isr {
-	__u64  val;
-	struct {
-		__u64 code : 16;
-		__u64 vector : 8;
-		__u64 reserved1 : 8;
-		__u64 x : 1;
-		__u64 w : 1;
-		__u64 r : 1;
-		__u64 na : 1;
-		__u64 sp : 1;
-		__u64 rs : 1;
-		__u64 ir : 1;
-		__u64 ni : 1;
-		__u64 so : 1;
-		__u64 ei : 2;
-		__u64 ed : 1;
-		__u64 reserved2 : 20;
-	};
-};
-
-union ia64_lid {
-	__u64 val;
-	struct {
-		__u64  rv  : 16;
-		__u64  eid : 8;
-		__u64  id  : 8;
-		__u64  ig  : 32;
-	};
-};
-
-union ia64_tpr {
-	__u64 val;
-	struct {
-		__u64 ig0 : 4;
-		__u64 mic : 4;
-		__u64 rsv : 8;
-		__u64 mmi : 1;
-		__u64 ig1 : 47;
-	};
-};
-
-union ia64_itir {
-	__u64 val;
-	struct {
-		__u64 rv3  :  2; /* 0-1 */
-		__u64 ps   :  6; /* 2-7 */
-		__u64 key  : 24; /* 8-31 */
-		__u64 rv4  : 32; /* 32-63 */
-	};
-};
-
-union  ia64_rr {
-	__u64 val;
-	struct {
-		__u64  ve	:  1;  /* enable hw walker */
-		__u64  reserved0:  1;  /* reserved */
-		__u64  ps	:  6;  /* log page size */
-		__u64  rid	: 24;  /* region id */
-		__u64  reserved1: 32;  /* reserved */
-	};
-};
-
-/*
- * CPU type, hardware bug flags, and per-CPU state.  Frequently used
- * state comes earlier:
- */
-struct cpuinfo_ia64 {
-	unsigned int softirq_pending;
-	unsigned long itm_delta;	/* # of clock cycles between clock ticks */
-	unsigned long itm_next;		/* interval timer mask value to use for next clock tick */
-	unsigned long nsec_per_cyc;	/* (1000000000<<IA64_NSEC_PER_CYC_SHIFT)/itc_freq */
-	unsigned long unimpl_va_mask;	/* mask of unimplemented virtual address bits (from PAL) */
-	unsigned long unimpl_pa_mask;	/* mask of unimplemented physical address bits (from PAL) */
-	unsigned long itc_freq;		/* frequency of ITC counter */
-	unsigned long proc_freq;	/* frequency of processor */
-	unsigned long cyc_per_usec;	/* itc_freq/1000000 */
-	unsigned long ptce_base;
-	unsigned int ptce_count[2];
-	unsigned int ptce_stride[2];
-	struct task_struct *ksoftirqd;	/* kernel softirq daemon for this CPU */
-
-#ifdef CONFIG_SMP
-	unsigned long loops_per_jiffy;
-	int cpu;
-	unsigned int socket_id;	/* physical processor socket id */
-	unsigned short core_id;	/* core id */
-	unsigned short thread_id; /* thread id */
-	unsigned short num_log;	/* Total number of logical processors on
-				 * this socket that were successfully booted */
-	unsigned char cores_per_socket;	/* Cores per processor socket */
-	unsigned char threads_per_core;	/* Threads per core */
-#endif
-
-	/* CPUID-derived information: */
-	unsigned long ppn;
-	unsigned long features;
-	unsigned char number;
-	unsigned char revision;
-	unsigned char model;
-	unsigned char family;
-	unsigned char archrev;
-	char vendor[16];
-	char *model_name;
-
-#ifdef CONFIG_NUMA
-	struct ia64_node_data *node_data;
-#endif
-};
-
-DECLARE_PER_CPU(struct cpuinfo_ia64, ia64_cpu_info);
-
-/*
- * The "local" data variable.  It refers to the per-CPU data of the currently executing
- * CPU, much like "current" points to the per-task data of the currently executing task.
- * Do not use the address of local_cpu_data, since it will be different from
- * cpu_data(smp_processor_id())!
- */
-#define local_cpu_data		(&__ia64_per_cpu_var(ia64_cpu_info))
-#define cpu_data(cpu)		(&per_cpu(ia64_cpu_info, cpu))
-
-extern void print_cpu_info (struct cpuinfo_ia64 *);
-
-#define SET_UNALIGN_CTL(task,value)								\
-({												\
-	(task)->thread.flags = (((task)->thread.flags & ~IA64_THREAD_UAC_MASK)			\
-				| (((value) << IA64_THREAD_UAC_SHIFT) & IA64_THREAD_UAC_MASK));	\
-	0;											\
-})
-#define GET_UNALIGN_CTL(task,addr)								\
-({												\
-	put_user(((task)->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT,	\
-		 (int __user *) (addr));							\
-})
-
-#define SET_FPEMU_CTL(task,value)								\
-({												\
-	(task)->thread.flags = (((task)->thread.flags & ~IA64_THREAD_FPEMU_MASK)		\
-			  | (((value) << IA64_THREAD_FPEMU_SHIFT) & IA64_THREAD_FPEMU_MASK));	\
-	0;											\
-})
-#define GET_FPEMU_CTL(task,addr)								\
-({												\
-	put_user(((task)->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT,	\
-		 (int __user *) (addr));							\
-})
-
-struct thread_struct {
-	__u32 flags;			/* various thread flags (see IA64_THREAD_*) */
-	/* writing on_ustack is performance-critical, so it's worth spending 8 bits on it... */
-	__u8 on_ustack;			/* executing on user-stacks? */
-	__u8 pad[3];
-	__u64 ksp;			/* kernel stack pointer */
-	__u64 map_base;			/* base address for get_unmapped_area() */
-	__u64 rbs_bot;			/* the base address for the RBS */
-	int last_fph_cpu;		/* CPU that may hold the contents of f32-f127 */
-	unsigned long dbr[IA64_NUM_DBG_REGS];
-	unsigned long ibr[IA64_NUM_DBG_REGS];
-	struct ia64_fpreg fph[96];	/* saved/loaded on demand */
-};
-
-#define INIT_THREAD {						\
-	.flags =	0,					\
-	.on_ustack =	0,					\
-	.ksp =		0,					\
-	.map_base =	DEFAULT_MAP_BASE,			\
-	.rbs_bot =	STACK_TOP - DEFAULT_USER_STACK_SIZE,	\
-	.last_fph_cpu =  -1,					\
-	.dbr =		{0, },					\
-	.ibr =		{0, },					\
-	.fph =		{{{{0}}}, }				\
-}
-
-#define start_thread(regs,new_ip,new_sp) do {							\
-	regs->cr_ipsr = ((regs->cr_ipsr | (IA64_PSR_BITS_TO_SET | IA64_PSR_CPL))		\
-			 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS));		\
-	regs->cr_iip = new_ip;									\
-	regs->ar_rsc = 0xf;		/* eager mode, privilege level 3 */			\
-	regs->ar_rnat = 0;									\
-	regs->ar_bspstore = current->thread.rbs_bot;						\
-	regs->ar_fpsr = FPSR_DEFAULT;								\
-	regs->loadrs = 0;									\
-	regs->r8 = get_dumpable(current->mm);	/* set "don't zap registers" flag */		\
-	regs->r12 = new_sp - 16;	/* allocate 16 byte scratch area */			\
-	if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) {	\
-		/*										\
-		 * Zap scratch regs to avoid leaking bits between processes with different	\
-		 * uid/privileges.								\
-		 */										\
-		regs->ar_pfs = 0; regs->b0 = 0; regs->pr = 0;					\
-		regs->r1 = 0; regs->r9  = 0; regs->r11 = 0; regs->r13 = 0; regs->r15 = 0;	\
-	}											\
-} while (0)
-
-/* Forward declarations, a strange C thing... */
-struct mm_struct;
-struct task_struct;
-
-/* Get wait channel for task P.  */
-extern unsigned long __get_wchan (struct task_struct *p);
-
-/* Return instruction pointer of blocked task TSK.  */
-#define KSTK_EIP(tsk)					\
-  ({							\
-	struct pt_regs *_regs = task_pt_regs(tsk);	\
-	_regs->cr_iip + ia64_psr(_regs)->ri;		\
-  })
-
-/* Return stack pointer of blocked task TSK.  */
-#define KSTK_ESP(tsk)  ((tsk)->thread.ksp)
-
-extern void ia64_getreg_unknown_kr (void);
-extern void ia64_setreg_unknown_kr (void);
-
-#define ia64_get_kr(regnum)					\
-({								\
-	unsigned long r = 0;					\
-								\
-	switch (regnum) {					\
-	    case 0: r = ia64_getreg(_IA64_REG_AR_KR0); break;	\
-	    case 1: r = ia64_getreg(_IA64_REG_AR_KR1); break;	\
-	    case 2: r = ia64_getreg(_IA64_REG_AR_KR2); break;	\
-	    case 3: r = ia64_getreg(_IA64_REG_AR_KR3); break;	\
-	    case 4: r = ia64_getreg(_IA64_REG_AR_KR4); break;	\
-	    case 5: r = ia64_getreg(_IA64_REG_AR_KR5); break;	\
-	    case 6: r = ia64_getreg(_IA64_REG_AR_KR6); break;	\
-	    case 7: r = ia64_getreg(_IA64_REG_AR_KR7); break;	\
-	    default: ia64_getreg_unknown_kr(); break;		\
-	}							\
-	r;							\
-})
-
-#define ia64_set_kr(regnum, r) 					\
-({								\
-	switch (regnum) {					\
-	    case 0: ia64_setreg(_IA64_REG_AR_KR0, r); break;	\
-	    case 1: ia64_setreg(_IA64_REG_AR_KR1, r); break;	\
-	    case 2: ia64_setreg(_IA64_REG_AR_KR2, r); break;	\
-	    case 3: ia64_setreg(_IA64_REG_AR_KR3, r); break;	\
-	    case 4: ia64_setreg(_IA64_REG_AR_KR4, r); break;	\
-	    case 5: ia64_setreg(_IA64_REG_AR_KR5, r); break;	\
-	    case 6: ia64_setreg(_IA64_REG_AR_KR6, r); break;	\
-	    case 7: ia64_setreg(_IA64_REG_AR_KR7, r); break;	\
-	    default: ia64_setreg_unknown_kr(); break;		\
-	}							\
-})
-
-/*
- * The following three macros can't be inline functions because we don't have struct
- * task_struct at this point.
- */
-
-/*
- * Return TRUE if task T owns the fph partition of the CPU we're running on.
- * Must be called from code that has preemption disabled.
- */
-#define ia64_is_local_fpu_owner(t)								\
-({												\
-	struct task_struct *__ia64_islfo_task = (t);						\
-	(__ia64_islfo_task->thread.last_fph_cpu == smp_processor_id()				\
-	 && __ia64_islfo_task == (struct task_struct *) ia64_get_kr(IA64_KR_FPU_OWNER));	\
-})
-
-/*
- * Mark task T as owning the fph partition of the CPU we're running on.
- * Must be called from code that has preemption disabled.
- */
-#define ia64_set_local_fpu_owner(t) do {						\
-	struct task_struct *__ia64_slfo_task = (t);					\
-	__ia64_slfo_task->thread.last_fph_cpu = smp_processor_id();			\
-	ia64_set_kr(IA64_KR_FPU_OWNER, (unsigned long) __ia64_slfo_task);		\
-} while (0)
-
-/* Mark the fph partition of task T as being invalid on all CPUs.  */
-#define ia64_drop_fpu(t)	((t)->thread.last_fph_cpu = -1)
-
-extern void __ia64_init_fpu (void);
-extern void __ia64_save_fpu (struct ia64_fpreg *fph);
-extern void __ia64_load_fpu (struct ia64_fpreg *fph);
-extern void ia64_save_debug_regs (unsigned long *save_area);
-extern void ia64_load_debug_regs (unsigned long *save_area);
-
-#define ia64_fph_enable()	do { ia64_rsm(IA64_PSR_DFH); ia64_srlz_d(); } while (0)
-#define ia64_fph_disable()	do { ia64_ssm(IA64_PSR_DFH); ia64_srlz_d(); } while (0)
-
-/* load fp 0.0 into fph */
-static inline void
-ia64_init_fpu (void) {
-	ia64_fph_enable();
-	__ia64_init_fpu();
-	ia64_fph_disable();
-}
-
-/* save f32-f127 at FPH */
-static inline void
-ia64_save_fpu (struct ia64_fpreg *fph) {
-	ia64_fph_enable();
-	__ia64_save_fpu(fph);
-	ia64_fph_disable();
-}
-
-/* load f32-f127 from FPH */
-static inline void
-ia64_load_fpu (struct ia64_fpreg *fph) {
-	ia64_fph_enable();
-	__ia64_load_fpu(fph);
-	ia64_fph_disable();
-}
-
-static inline __u64
-ia64_clear_ic (void)
-{
-	__u64 psr;
-	psr = ia64_getreg(_IA64_REG_PSR);
-	ia64_stop();
-	ia64_rsm(IA64_PSR_I | IA64_PSR_IC);
-	ia64_srlz_i();
-	return psr;
-}
-
-/*
- * Restore the psr.
- */
-static inline void
-ia64_set_psr (__u64 psr)
-{
-	ia64_stop();
-	ia64_setreg(_IA64_REG_PSR_L, psr);
-	ia64_srlz_i();
-}
-
-/*
- * Insert a translation into an instruction and/or data translation
- * register.
- */
-static inline void
-ia64_itr (__u64 target_mask, __u64 tr_num,
-	  __u64 vmaddr, __u64 pte,
-	  __u64 log_page_size)
-{
-	ia64_setreg(_IA64_REG_CR_ITIR, (log_page_size << 2));
-	ia64_setreg(_IA64_REG_CR_IFA, vmaddr);
-	ia64_stop();
-	if (target_mask & 0x1)
-		ia64_itri(tr_num, pte);
-	if (target_mask & 0x2)
-		ia64_itrd(tr_num, pte);
-}
-
-/*
- * Insert a translation into the instruction and/or data translation
- * cache.
- */
-static inline void
-ia64_itc (__u64 target_mask, __u64 vmaddr, __u64 pte,
-	  __u64 log_page_size)
-{
-	ia64_setreg(_IA64_REG_CR_ITIR, (log_page_size << 2));
-	ia64_setreg(_IA64_REG_CR_IFA, vmaddr);
-	ia64_stop();
-	/* as per EAS2.6, itc must be the last instruction in an instruction group */
-	if (target_mask & 0x1)
-		ia64_itci(pte);
-	if (target_mask & 0x2)
-		ia64_itcd(pte);
-}
-
-/*
- * Purge a range of addresses from instruction and/or data translation
- * register(s).
- */
-static inline void
-ia64_ptr (__u64 target_mask, __u64 vmaddr, __u64 log_size)
-{
-	if (target_mask & 0x1)
-		ia64_ptri(vmaddr, (log_size << 2));
-	if (target_mask & 0x2)
-		ia64_ptrd(vmaddr, (log_size << 2));
-}
-
-/* Set the interrupt vector address.  The address must be suitably aligned (32KB).  */
-static inline void
-ia64_set_iva (void *ivt_addr)
-{
-	ia64_setreg(_IA64_REG_CR_IVA, (__u64) ivt_addr);
-	ia64_srlz_i();
-}
-
-/* Set the page table address and control bits.  */
-static inline void
-ia64_set_pta (__u64 pta)
-{
-	/* Note: srlz.i implies srlz.d */
-	ia64_setreg(_IA64_REG_CR_PTA, pta);
-	ia64_srlz_i();
-}
-
-static inline void
-ia64_eoi (void)
-{
-	ia64_setreg(_IA64_REG_CR_EOI, 0);
-	ia64_srlz_d();
-}
-
-#define cpu_relax()	ia64_hint(ia64_hint_pause)
-
-static inline int
-ia64_get_irr(unsigned int vector)
-{
-	unsigned int reg = vector / 64;
-	unsigned int bit = vector % 64;
-	unsigned long irr;
-
-	switch (reg) {
-	case 0: irr = ia64_getreg(_IA64_REG_CR_IRR0); break;
-	case 1: irr = ia64_getreg(_IA64_REG_CR_IRR1); break;
-	case 2: irr = ia64_getreg(_IA64_REG_CR_IRR2); break;
-	case 3: irr = ia64_getreg(_IA64_REG_CR_IRR3); break;
-	}
-
-	return test_bit(bit, &irr);
-}
-
-static inline void
-ia64_set_lrr0 (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_LRR0, val);
-	ia64_srlz_d();
-}
-
-static inline void
-ia64_set_lrr1 (unsigned long val)
-{
-	ia64_setreg(_IA64_REG_CR_LRR1, val);
-	ia64_srlz_d();
-}
-
-
-/*
- * Given the address to which a spill occurred, return the unat bit
- * number that corresponds to this address.
- */
-static inline __u64
-ia64_unat_pos (void *spill_addr)
-{
-	return ((__u64) spill_addr >> 3) & 0x3f;
-}
-
-/*
- * Set the NaT bit of an integer register which was spilled at address
- * SPILL_ADDR.  UNAT is the mask to be updated.
- */
-static inline void
-ia64_set_unat (__u64 *unat, void *spill_addr, unsigned long nat)
-{
-	__u64 bit = ia64_unat_pos(spill_addr);
-	__u64 mask = 1UL << bit;
-
-	*unat = (*unat & ~mask) | (nat << bit);
-}
-
-static inline __u64
-ia64_get_ivr (void)
-{
-	__u64 r;
-	ia64_srlz_d();
-	r = ia64_getreg(_IA64_REG_CR_IVR);
-	ia64_srlz_d();
-	return r;
-}
-
-static inline void
-ia64_set_dbr (__u64 regnum, __u64 value)
-{
-	__ia64_set_dbr(regnum, value);
-#ifdef CONFIG_ITANIUM
-	ia64_srlz_d();
-#endif
-}
-
-static inline __u64
-ia64_get_dbr (__u64 regnum)
-{
-	__u64 retval;
-
-	retval = __ia64_get_dbr(regnum);
-#ifdef CONFIG_ITANIUM
-	ia64_srlz_d();
-#endif
-	return retval;
-}
-
-static inline __u64
-ia64_rotr (__u64 w, __u64 n)
-{
-	return (w >> n) | (w << (64 - n));
-}
-
-#define ia64_rotl(w,n)	ia64_rotr((w), (64) - (n))
-
-/*
- * Take a mapped kernel address and return the equivalent address
- * in the region 7 identity mapped virtual area.
- */
-static inline void *
-ia64_imva (void *addr)
-{
-	void *result;
-	result = (void *) ia64_tpa(addr);
-	return __va(result);
-}
-
-#define ARCH_HAS_PREFETCH
-#define ARCH_HAS_PREFETCHW
-#define PREFETCH_STRIDE			L1_CACHE_BYTES
-
-static inline void
-prefetch (const void *x)
-{
-	 ia64_lfetch(ia64_lfhint_none, x);
-}
-
-static inline void
-prefetchw (const void *x)
-{
-	ia64_lfetch_excl(ia64_lfhint_none, x);
-}
-
-extern unsigned long boot_option_idle_override;
-
-enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_FORCE_MWAIT,
-			 IDLE_NOMWAIT, IDLE_POLL};
-
-void default_idle(void);
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _ASM_IA64_PROCESSOR_H */
diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
deleted file mode 100644
index 402874489890..000000000000
--- a/arch/ia64/include/asm/ptrace.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1998-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2003 Intel Co
- *	Suresh Siddha <suresh.b.siddha@intel.com>
- *	Fenghua Yu <fenghua.yu@intel.com>
- *	Arun Sharma <arun.sharma@intel.com>
- *
- * 12/07/98	S. Eranian	added pt_regs & switch_stack
- * 12/21/98	D. Mosberger	updated to match latest code
- *  6/17/99	D. Mosberger	added second unat member to "struct switch_stack"
- *
- */
-#ifndef _ASM_IA64_PTRACE_H
-#define _ASM_IA64_PTRACE_H
-
-#ifndef ASM_OFFSETS_C
-#include <asm/asm-offsets.h>
-#endif
-#include <uapi/asm/ptrace.h>
-
-/*
- * Base-2 logarithm of number of pages to allocate per task structure
- * (including register backing store and memory stack):
- */
-#if defined(CONFIG_IA64_PAGE_SIZE_4KB)
-# define KERNEL_STACK_SIZE_ORDER		3
-#elif defined(CONFIG_IA64_PAGE_SIZE_8KB)
-# define KERNEL_STACK_SIZE_ORDER		2
-#elif defined(CONFIG_IA64_PAGE_SIZE_16KB)
-# define KERNEL_STACK_SIZE_ORDER		1
-#else
-# define KERNEL_STACK_SIZE_ORDER		0
-#endif
-
-#define IA64_RBS_OFFSET			((IA64_TASK_SIZE + IA64_THREAD_INFO_SIZE + 31) & ~31)
-#define IA64_STK_OFFSET			((1 << KERNEL_STACK_SIZE_ORDER)*PAGE_SIZE)
-
-#define KERNEL_STACK_SIZE		IA64_STK_OFFSET
-
-#ifndef __ASSEMBLY__
-
-#include <asm/current.h>
-#include <asm/page.h>
-
-/*
- * We use the ia64_psr(regs)->ri to determine which of the three
- * instructions in bundle (16 bytes) took the sample. Generate
- * the canonical representation by adding to instruction pointer.
- */
-# define instruction_pointer(regs) ((regs)->cr_iip + ia64_psr(regs)->ri)
-# define instruction_pointer_set(regs, val)	\
-({						\
-	ia64_psr(regs)->ri = (val & 0xf);	\
-	regs->cr_iip = (val & ~0xfULL);		\
-})
-
-static inline unsigned long user_stack_pointer(struct pt_regs *regs)
-{
-	return regs->r12;
-}
-
-static inline int is_syscall_success(struct pt_regs *regs)
-{
-	return regs->r10 != -1;
-}
-
-static inline long regs_return_value(struct pt_regs *regs)
-{
-	if (is_syscall_success(regs))
-		return regs->r8;
-	else
-		return -regs->r8;
-}
-
-/* Conserve space in histogram by encoding slot bits in address
- * bits 2 and 3 rather than bits 0 and 1.
- */
-#define profile_pc(regs)						\
-({									\
-	unsigned long __ip = instruction_pointer(regs);			\
-	(__ip & ~3UL) + ((__ip & 3UL) << 2);				\
-})
-
-  /* given a pointer to a task_struct, return the user's pt_regs */
-# define task_pt_regs(t)		(((struct pt_regs *) ((char *) (t) + IA64_STK_OFFSET)) - 1)
-# define ia64_psr(regs)			((struct ia64_psr *) &(regs)->cr_ipsr)
-# define user_mode(regs)		(((struct ia64_psr *) &(regs)->cr_ipsr)->cpl != 0)
-# define user_stack(task,regs)	((long) regs - (long) task == IA64_STK_OFFSET - sizeof(*regs))
-# define fsys_mode(task,regs)					\
-  ({								\
-	  struct task_struct *_task = (task);			\
-	  struct pt_regs *_regs = (regs);			\
-	  !user_mode(_regs) && user_stack(_task, _regs);	\
-  })
-
-  /*
-   * System call handlers that, upon successful completion, need to return a negative value
-   * should call force_successful_syscall_return() right before returning.  On architectures
-   * where the syscall convention provides for a separate error flag (e.g., alpha, ia64,
-   * ppc{,64}, sparc{,64}, possibly others), this macro can be used to ensure that the error
-   * flag will not get set.  On architectures which do not support a separate error flag,
-   * the macro is a no-op and the spurious error condition needs to be filtered out by some
-   * other means (e.g., in user-level, by passing an extra argument to the syscall handler,
-   * or something along those lines).
-   *
-   * On ia64, we can clear the user's pt_regs->r8 to force a successful syscall.
-   */
-# define force_successful_syscall_return()	(task_pt_regs(current)->r8 = 0)
-
-  struct task_struct;			/* forward decl */
-  struct unw_frame_info;		/* forward decl */
-
-  extern unsigned long ia64_get_user_rbs_end (struct task_struct *, struct pt_regs *,
-					      unsigned long *);
-  extern long ia64_peek (struct task_struct *, struct switch_stack *, unsigned long,
-			 unsigned long, long *);
-  extern long ia64_poke (struct task_struct *, struct switch_stack *, unsigned long,
-			 unsigned long, long);
-  extern void ia64_flush_fph (struct task_struct *);
-  extern void ia64_sync_fph (struct task_struct *);
-  extern void ia64_sync_krbs(void);
-  extern long ia64_sync_user_rbs (struct task_struct *, struct switch_stack *,
-				  unsigned long, unsigned long);
-
-  /* get nat bits for scratch registers such that bit N==1 iff scratch register rN is a NaT */
-  extern unsigned long ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat);
-  /* put nat bits for scratch registers such that scratch register rN is a NaT iff bit N==1 */
-  extern unsigned long ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat);
-
-  extern void ia64_increment_ip (struct pt_regs *pt);
-  extern void ia64_decrement_ip (struct pt_regs *pt);
-
-  extern void ia64_ptrace_stop(void);
-  #define arch_ptrace_stop() \
-	ia64_ptrace_stop()
-  #define arch_ptrace_stop_needed() \
-	(!test_thread_flag(TIF_RESTORE_RSE))
-
-  #define arch_has_single_step()  (1)
-  #define arch_has_block_step()   (1)
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_PTRACE_H */
diff --git a/arch/ia64/include/asm/sal.h b/arch/ia64/include/asm/sal.h
deleted file mode 100644
index 22749a201e92..000000000000
--- a/arch/ia64/include/asm/sal.h
+++ /dev/null
@@ -1,919 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SAL_H
-#define _ASM_IA64_SAL_H
-
-/*
- * System Abstraction Layer definitions.
- *
- * This is based on version 2.5 of the manual "IA-64 System
- * Abstraction Layer".
- *
- * Copyright (C) 2001 Intel
- * Copyright (C) 2002 Jenna Hall <jenna.s.hall@intel.com>
- * Copyright (C) 2001 Fred Lewis <frederick.v.lewis@intel.com>
- * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Srinivasa Prasad Thirumalachar <sprasad@sprasad.engr.sgi.com>
- *
- * 02/01/04 J. Hall Updated Error Record Structures to conform to July 2001
- *		    revision of the SAL spec.
- * 01/01/03 fvlewis Updated Error Record Structures to conform with Nov. 2000
- *                  revision of the SAL spec.
- * 99/09/29 davidm	Updated for SAL 2.6.
- * 00/03/29 cfleck      Updated SAL Error Logging info for processor (SAL 2.6)
- *                      (plus examples of platform error info structures from smariset @ Intel)
- */
-
-#define IA64_SAL_PLATFORM_FEATURE_BUS_LOCK_BIT		0
-#define IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT_BIT	1
-#define IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT_BIT	2
-#define IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT	 	3
-
-#define IA64_SAL_PLATFORM_FEATURE_BUS_LOCK	  (1<<IA64_SAL_PLATFORM_FEATURE_BUS_LOCK_BIT)
-#define IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT (1<<IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT_BIT)
-#define IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT (1<<IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT_BIT)
-#define IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT	  (1<<IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT_BIT)
-
-#ifndef __ASSEMBLY__
-
-#include <linux/bcd.h>
-#include <linux/spinlock.h>
-#include <linux/efi.h>
-
-#include <asm/pal.h>
-#include <asm/fpu.h>
-
-extern unsigned long sal_systab_phys;
-extern spinlock_t sal_lock;
-
-/* SAL spec _requires_ eight args for each call. */
-#define __IA64_FW_CALL(entry,result,a0,a1,a2,a3,a4,a5,a6,a7)	\
-	result = (*entry)(a0,a1,a2,a3,a4,a5,a6,a7)
-
-# define IA64_FW_CALL(entry,result,args...) do {		\
-	unsigned long __ia64_sc_flags;				\
-	struct ia64_fpreg __ia64_sc_fr[6];			\
-	ia64_save_scratch_fpregs(__ia64_sc_fr);			\
-	spin_lock_irqsave(&sal_lock, __ia64_sc_flags);		\
-	__IA64_FW_CALL(entry, result, args);			\
-	spin_unlock_irqrestore(&sal_lock, __ia64_sc_flags);	\
-	ia64_load_scratch_fpregs(__ia64_sc_fr);			\
-} while (0)
-
-# define SAL_CALL(result,args...)			\
-	IA64_FW_CALL(ia64_sal, result, args);
-
-# define SAL_CALL_NOLOCK(result,args...) do {		\
-	unsigned long __ia64_scn_flags;			\
-	struct ia64_fpreg __ia64_scn_fr[6];		\
-	ia64_save_scratch_fpregs(__ia64_scn_fr);	\
-	local_irq_save(__ia64_scn_flags);		\
-	__IA64_FW_CALL(ia64_sal, result, args);		\
-	local_irq_restore(__ia64_scn_flags);		\
-	ia64_load_scratch_fpregs(__ia64_scn_fr);	\
-} while (0)
-
-# define SAL_CALL_REENTRANT(result,args...) do {	\
-	struct ia64_fpreg __ia64_scs_fr[6];		\
-	ia64_save_scratch_fpregs(__ia64_scs_fr);	\
-	preempt_disable();				\
-	__IA64_FW_CALL(ia64_sal, result, args);		\
-	preempt_enable();				\
-	ia64_load_scratch_fpregs(__ia64_scs_fr);	\
-} while (0)
-
-#define SAL_SET_VECTORS			0x01000000
-#define SAL_GET_STATE_INFO		0x01000001
-#define SAL_GET_STATE_INFO_SIZE		0x01000002
-#define SAL_CLEAR_STATE_INFO		0x01000003
-#define SAL_MC_RENDEZ			0x01000004
-#define SAL_MC_SET_PARAMS		0x01000005
-#define SAL_REGISTER_PHYSICAL_ADDR	0x01000006
-
-#define SAL_CACHE_FLUSH			0x01000008
-#define SAL_CACHE_INIT			0x01000009
-#define SAL_PCI_CONFIG_READ		0x01000010
-#define SAL_PCI_CONFIG_WRITE		0x01000011
-#define SAL_FREQ_BASE			0x01000012
-#define SAL_PHYSICAL_ID_INFO		0x01000013
-
-#define SAL_UPDATE_PAL			0x01000020
-
-struct ia64_sal_retval {
-	/*
-	 * A zero status value indicates call completed without error.
-	 * A negative status value indicates reason of call failure.
-	 * A positive status value indicates success but an
-	 * informational value should be printed (e.g., "reboot for
-	 * change to take effect").
-	 */
-	long status;
-	unsigned long v0;
-	unsigned long v1;
-	unsigned long v2;
-};
-
-typedef struct ia64_sal_retval (*ia64_sal_handler) (u64, ...);
-
-enum {
-	SAL_FREQ_BASE_PLATFORM = 0,
-	SAL_FREQ_BASE_INTERVAL_TIMER = 1,
-	SAL_FREQ_BASE_REALTIME_CLOCK = 2
-};
-
-/*
- * The SAL system table is followed by a variable number of variable
- * length descriptors.  The structure of these descriptors follows
- * below.
- * The defininition follows SAL specs from July 2000
- */
-struct ia64_sal_systab {
-	u8 signature[4];	/* should be "SST_" */
-	u32 size;		/* size of this table in bytes */
-	u8 sal_rev_minor;
-	u8 sal_rev_major;
-	u16 entry_count;	/* # of entries in variable portion */
-	u8 checksum;
-	u8 reserved1[7];
-	u8 sal_a_rev_minor;
-	u8 sal_a_rev_major;
-	u8 sal_b_rev_minor;
-	u8 sal_b_rev_major;
-	/* oem_id & product_id: terminating NUL is missing if string is exactly 32 bytes long. */
-	u8 oem_id[32];
-	u8 product_id[32];	/* ASCII product id  */
-	u8 reserved2[8];
-};
-
-enum sal_systab_entry_type {
-	SAL_DESC_ENTRY_POINT = 0,
-	SAL_DESC_MEMORY = 1,
-	SAL_DESC_PLATFORM_FEATURE = 2,
-	SAL_DESC_TR = 3,
-	SAL_DESC_PTC = 4,
-	SAL_DESC_AP_WAKEUP = 5
-};
-
-/*
- * Entry type:	Size:
- *	0	48
- *	1	32
- *	2	16
- *	3	32
- *	4	16
- *	5	16
- */
-#define SAL_DESC_SIZE(type)	"\060\040\020\040\020\020"[(unsigned) type]
-
-typedef struct ia64_sal_desc_entry_point {
-	u8 type;
-	u8 reserved1[7];
-	u64 pal_proc;
-	u64 sal_proc;
-	u64 gp;
-	u8 reserved2[16];
-}ia64_sal_desc_entry_point_t;
-
-typedef struct ia64_sal_desc_memory {
-	u8 type;
-	u8 used_by_sal;	/* needs to be mapped for SAL? */
-	u8 mem_attr;		/* current memory attribute setting */
-	u8 access_rights;	/* access rights set up by SAL */
-	u8 mem_attr_mask;	/* mask of supported memory attributes */
-	u8 reserved1;
-	u8 mem_type;		/* memory type */
-	u8 mem_usage;		/* memory usage */
-	u64 addr;		/* physical address of memory */
-	u32 length;	/* length (multiple of 4KB pages) */
-	u32 reserved2;
-	u8 oem_reserved[8];
-} ia64_sal_desc_memory_t;
-
-typedef struct ia64_sal_desc_platform_feature {
-	u8 type;
-	u8 feature_mask;
-	u8 reserved1[14];
-} ia64_sal_desc_platform_feature_t;
-
-typedef struct ia64_sal_desc_tr {
-	u8 type;
-	u8 tr_type;		/* 0 == instruction, 1 == data */
-	u8 regnum;		/* translation register number */
-	u8 reserved1[5];
-	u64 addr;		/* virtual address of area covered */
-	u64 page_size;		/* encoded page size */
-	u8 reserved2[8];
-} ia64_sal_desc_tr_t;
-
-typedef struct ia64_sal_desc_ptc {
-	u8 type;
-	u8 reserved1[3];
-	u32 num_domains;	/* # of coherence domains */
-	u64 domain_info;	/* physical address of domain info table */
-} ia64_sal_desc_ptc_t;
-
-typedef struct ia64_sal_ptc_domain_info {
-	u64 proc_count;		/* number of processors in domain */
-	u64 proc_list;		/* physical address of LID array */
-} ia64_sal_ptc_domain_info_t;
-
-typedef struct ia64_sal_ptc_domain_proc_entry {
-	u64 id  : 8;		/* id of processor */
-	u64 eid : 8;		/* eid of processor */
-} ia64_sal_ptc_domain_proc_entry_t;
-
-
-#define IA64_SAL_AP_EXTERNAL_INT 0
-
-typedef struct ia64_sal_desc_ap_wakeup {
-	u8 type;
-	u8 mechanism;		/* 0 == external interrupt */
-	u8 reserved1[6];
-	u64 vector;		/* interrupt vector in range 0x10-0xff */
-} ia64_sal_desc_ap_wakeup_t ;
-
-extern ia64_sal_handler ia64_sal;
-extern struct ia64_sal_desc_ptc *ia64_ptc_domain_info;
-
-extern unsigned short sal_revision;	/* supported SAL spec revision */
-extern unsigned short sal_version;	/* SAL version; OEM dependent */
-#define SAL_VERSION_CODE(major, minor) ((bin2bcd(major) << 8) | bin2bcd(minor))
-
-extern const char *ia64_sal_strerror (long status);
-extern void ia64_sal_init (struct ia64_sal_systab *sal_systab);
-
-/* SAL information type encodings */
-enum {
-	SAL_INFO_TYPE_MCA  = 0,		/* Machine check abort information */
-        SAL_INFO_TYPE_INIT = 1,		/* Init information */
-        SAL_INFO_TYPE_CMC  = 2,		/* Corrected machine check information */
-        SAL_INFO_TYPE_CPE  = 3		/* Corrected platform error information */
-};
-
-/* Encodings for machine check parameter types */
-enum {
-	SAL_MC_PARAM_RENDEZ_INT    = 1,	/* Rendezvous interrupt */
-	SAL_MC_PARAM_RENDEZ_WAKEUP = 2,	/* Wakeup */
-	SAL_MC_PARAM_CPE_INT	   = 3	/* Corrected Platform Error Int */
-};
-
-/* Encodings for rendezvous mechanisms */
-enum {
-	SAL_MC_PARAM_MECHANISM_INT = 1,	/* Use interrupt */
-	SAL_MC_PARAM_MECHANISM_MEM = 2	/* Use memory synchronization variable*/
-};
-
-/* Encodings for vectors which can be registered by the OS with SAL */
-enum {
-	SAL_VECTOR_OS_MCA	  = 0,
-	SAL_VECTOR_OS_INIT	  = 1,
-	SAL_VECTOR_OS_BOOT_RENDEZ = 2
-};
-
-/* Encodings for mca_opt parameter sent to SAL_MC_SET_PARAMS */
-#define	SAL_MC_PARAM_RZ_ALWAYS		0x1
-#define	SAL_MC_PARAM_BINIT_ESCALATE	0x10
-
-/*
- * Definition of the SAL Error Log from the SAL spec
- */
-
-/* SAL Error Record Section GUID Definitions */
-#define SAL_PROC_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf1, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_MEM_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf2, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_SEL_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf3, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_PCI_BUS_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf4, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf5, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_PCI_COMP_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf6, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_SPECIFIC_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf7, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_HOST_CTLR_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf8, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define SAL_PLAT_BUS_ERR_SECT_GUID  \
-    EFI_GUID(0xe429faf9, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81)
-#define PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID \
-    EFI_GUID(0x6cb0a200, 0x893a, 0x11da, 0x96, 0xd2, 0x0, 0x10, 0x83, 0xff, \
-		0xca, 0x4d)
-
-#define MAX_CACHE_ERRORS	6
-#define MAX_TLB_ERRORS		6
-#define MAX_BUS_ERRORS		1
-
-/* Definition of version  according to SAL spec for logging purposes */
-typedef struct sal_log_revision {
-	u8 minor;		/* BCD (0..99) */
-	u8 major;		/* BCD (0..99) */
-} sal_log_revision_t;
-
-/* Definition of timestamp according to SAL spec for logging purposes */
-typedef struct sal_log_timestamp {
-	u8 slh_second;		/* Second (0..59) */
-	u8 slh_minute;		/* Minute (0..59) */
-	u8 slh_hour;		/* Hour (0..23) */
-	u8 slh_reserved;
-	u8 slh_day;		/* Day (1..31) */
-	u8 slh_month;		/* Month (1..12) */
-	u8 slh_year;		/* Year (00..99) */
-	u8 slh_century;		/* Century (19, 20, 21, ...) */
-} sal_log_timestamp_t;
-
-/* Definition of log record  header structures */
-typedef struct sal_log_record_header {
-	u64 id;				/* Unique monotonically increasing ID */
-	sal_log_revision_t revision;	/* Major and Minor revision of header */
-	u8 severity;			/* Error Severity */
-	u8 validation_bits;		/* 0: platform_guid, 1: !timestamp */
-	u32 len;			/* Length of this error log in bytes */
-	sal_log_timestamp_t timestamp;	/* Timestamp */
-	efi_guid_t platform_guid;	/* Unique OEM Platform ID */
-} sal_log_record_header_t;
-
-#define sal_log_severity_recoverable	0
-#define sal_log_severity_fatal		1
-#define sal_log_severity_corrected	2
-
-/*
- * Error Recovery Info (ERI) bit decode.  From SAL Spec section B.2.2 Table B-3
- * Error Section Error_Recovery_Info Field Definition.
- */
-#define ERI_NOT_VALID		0x0	/* Error Recovery Field is not valid */
-#define ERI_NOT_ACCESSIBLE	0x30	/* Resource not accessible */
-#define ERI_CONTAINMENT_WARN	0x22	/* Corrupt data propagated */
-#define ERI_UNCORRECTED_ERROR	0x20	/* Uncorrected error */
-#define ERI_COMPONENT_RESET	0x24	/* Component must be reset */
-#define ERI_CORR_ERROR_LOG	0x21	/* Corrected error, needs logging */
-#define ERI_CORR_ERROR_THRESH	0x29	/* Corrected error threshold exceeded */
-
-/* Definition of log section header structures */
-typedef struct sal_log_sec_header {
-    efi_guid_t guid;			/* Unique Section ID */
-    sal_log_revision_t revision;	/* Major and Minor revision of Section */
-    u8 error_recovery_info;		/* Platform error recovery status */
-    u8 reserved;
-    u32 len;				/* Section length */
-} sal_log_section_hdr_t;
-
-typedef struct sal_log_mod_error_info {
-	struct {
-		u64 check_info              : 1,
-		    requestor_identifier    : 1,
-		    responder_identifier    : 1,
-		    target_identifier       : 1,
-		    precise_ip              : 1,
-		    reserved                : 59;
-	} valid;
-	u64 check_info;
-	u64 requestor_identifier;
-	u64 responder_identifier;
-	u64 target_identifier;
-	u64 precise_ip;
-} sal_log_mod_error_info_t;
-
-typedef struct sal_processor_static_info {
-	struct {
-		u64 minstate        : 1,
-		    br              : 1,
-		    cr              : 1,
-		    ar              : 1,
-		    rr              : 1,
-		    fr              : 1,
-		    reserved        : 58;
-	} valid;
-	struct pal_min_state_area min_state_area;
-	u64 br[8];
-	u64 cr[128];
-	u64 ar[128];
-	u64 rr[8];
-	struct ia64_fpreg __attribute__ ((packed)) fr[128];
-} sal_processor_static_info_t;
-
-struct sal_cpuid_info {
-	u64 regs[5];
-	u64 reserved;
-};
-
-typedef struct sal_log_processor_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 proc_error_map      : 1,
-		    proc_state_param    : 1,
-		    proc_cr_lid         : 1,
-		    psi_static_struct   : 1,
-		    num_cache_check     : 4,
-		    num_tlb_check       : 4,
-		    num_bus_check       : 4,
-		    num_reg_file_check  : 4,
-		    num_ms_check        : 4,
-		    cpuid_info          : 1,
-		    reserved1           : 39;
-	} valid;
-	u64 proc_error_map;
-	u64 proc_state_parameter;
-	u64 proc_cr_lid;
-	/*
-	 * The rest of this structure consists of variable-length arrays, which can't be
-	 * expressed in C.
-	 */
-	sal_log_mod_error_info_t info[];
-	/*
-	 * This is what the rest looked like if C supported variable-length arrays:
-	 *
-	 * sal_log_mod_error_info_t cache_check_info[.valid.num_cache_check];
-	 * sal_log_mod_error_info_t tlb_check_info[.valid.num_tlb_check];
-	 * sal_log_mod_error_info_t bus_check_info[.valid.num_bus_check];
-	 * sal_log_mod_error_info_t reg_file_check_info[.valid.num_reg_file_check];
-	 * sal_log_mod_error_info_t ms_check_info[.valid.num_ms_check];
-	 * struct sal_cpuid_info cpuid_info;
-	 * sal_processor_static_info_t processor_static_info;
-	 */
-} sal_log_processor_info_t;
-
-/* Given a sal_log_processor_info_t pointer, return a pointer to the processor_static_info: */
-#define SAL_LPI_PSI_INFO(l)									\
-({	sal_log_processor_info_t *_l = (l);							\
-	((sal_processor_static_info_t *)							\
-	 ((char *) _l->info + ((_l->valid.num_cache_check + _l->valid.num_tlb_check		\
-				+ _l->valid.num_bus_check + _l->valid.num_reg_file_check	\
-				+ _l->valid.num_ms_check) * sizeof(sal_log_mod_error_info_t)	\
-			       + sizeof(struct sal_cpuid_info))));				\
-})
-
-/* platform error log structures */
-
-typedef struct sal_log_mem_dev_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 error_status    : 1,
-		    physical_addr   : 1,
-		    addr_mask       : 1,
-		    node            : 1,
-		    card            : 1,
-		    module          : 1,
-		    bank            : 1,
-		    device          : 1,
-		    row             : 1,
-		    column          : 1,
-		    bit_position    : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    bus_spec_data   : 1,
-		    oem_id          : 1,
-		    oem_data        : 1,
-		    reserved        : 47;
-	} valid;
-	u64 error_status;
-	u64 physical_addr;
-	u64 addr_mask;
-	u16 node;
-	u16 card;
-	u16 module;
-	u16 bank;
-	u16 device;
-	u16 row;
-	u16 column;
-	u16 bit_position;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u64 bus_spec_data;
-	u8 oem_id[16];
-	u8 oem_data[1];			/* Variable length data */
-} sal_log_mem_dev_err_info_t;
-
-typedef struct sal_log_sel_dev_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 record_id       : 1,
-		    record_type     : 1,
-		    generator_id    : 1,
-		    evm_rev         : 1,
-		    sensor_type     : 1,
-		    sensor_num      : 1,
-		    event_dir       : 1,
-		    event_data1     : 1,
-		    event_data2     : 1,
-		    event_data3     : 1,
-		    reserved        : 54;
-	} valid;
-	u16 record_id;
-	u8 record_type;
-	u8 timestamp[4];
-	u16 generator_id;
-	u8 evm_rev;
-	u8 sensor_type;
-	u8 sensor_num;
-	u8 event_dir;
-	u8 event_data1;
-	u8 event_data2;
-	u8 event_data3;
-} sal_log_sel_dev_err_info_t;
-
-typedef struct sal_log_pci_bus_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    err_type        : 1,
-		    bus_id          : 1,
-		    bus_address     : 1,
-		    bus_data        : 1,
-		    bus_cmd         : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    oem_data        : 1,
-		    reserved        : 54;
-	} valid;
-	u64 err_status;
-	u16 err_type;
-	u16 bus_id;
-	u32 reserved;
-	u64 bus_address;
-	u64 bus_data;
-	u64 bus_cmd;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u8 oem_data[1];			/* Variable length data */
-} sal_log_pci_bus_err_info_t;
-
-typedef struct sal_log_smbios_dev_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 event_type      : 1,
-		    length          : 1,
-		    time_stamp      : 1,
-		    data            : 1,
-		    reserved1       : 60;
-	} valid;
-	u8 event_type;
-	u8 length;
-	u8 time_stamp[6];
-	u8 data[1];			/* data of variable length, length == slsmb_length */
-} sal_log_smbios_dev_err_info_t;
-
-typedef struct sal_log_pci_comp_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    comp_info       : 1,
-		    num_mem_regs    : 1,
-		    num_io_regs     : 1,
-		    reg_data_pairs  : 1,
-		    oem_data        : 1,
-		    reserved        : 58;
-	} valid;
-	u64 err_status;
-	struct {
-		u16 vendor_id;
-		u16 device_id;
-		u8 class_code[3];
-		u8 func_num;
-		u8 dev_num;
-		u8 bus_num;
-		u8 seg_num;
-		u8 reserved[5];
-	} comp_info;
-	u32 num_mem_regs;
-	u32 num_io_regs;
-	u64 reg_data_pairs[1];
-	/*
-	 * array of address/data register pairs is num_mem_regs + num_io_regs elements
-	 * long.  Each array element consists of a u64 address followed by a u64 data
-	 * value.  The oem_data array immediately follows the reg_data_pairs array
-	 */
-	u8 oem_data[1];			/* Variable length data */
-} sal_log_pci_comp_err_info_t;
-
-typedef struct sal_log_plat_specific_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    guid            : 1,
-		    oem_data        : 1,
-		    reserved        : 61;
-	} valid;
-	u64 err_status;
-	efi_guid_t guid;
-	u8 oem_data[1];			/* platform specific variable length data */
-} sal_log_plat_specific_err_info_t;
-
-typedef struct sal_log_host_ctlr_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    bus_spec_data   : 1,
-		    oem_data        : 1,
-		    reserved        : 58;
-	} valid;
-	u64 err_status;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u64 bus_spec_data;
-	u8 oem_data[1];			/* Variable length OEM data */
-} sal_log_host_ctlr_err_info_t;
-
-typedef struct sal_log_plat_bus_err_info {
-	sal_log_section_hdr_t header;
-	struct {
-		u64 err_status      : 1,
-		    requestor_id    : 1,
-		    responder_id    : 1,
-		    target_id       : 1,
-		    bus_spec_data   : 1,
-		    oem_data        : 1,
-		    reserved        : 58;
-	} valid;
-	u64 err_status;
-	u64 requestor_id;
-	u64 responder_id;
-	u64 target_id;
-	u64 bus_spec_data;
-	u8 oem_data[1];			/* Variable length OEM data */
-} sal_log_plat_bus_err_info_t;
-
-/* Overall platform error section structure */
-typedef union sal_log_platform_err_info {
-	sal_log_mem_dev_err_info_t mem_dev_err;
-	sal_log_sel_dev_err_info_t sel_dev_err;
-	sal_log_pci_bus_err_info_t pci_bus_err;
-	sal_log_smbios_dev_err_info_t smbios_dev_err;
-	sal_log_pci_comp_err_info_t pci_comp_err;
-	sal_log_plat_specific_err_info_t plat_specific_err;
-	sal_log_host_ctlr_err_info_t host_ctlr_err;
-	sal_log_plat_bus_err_info_t plat_bus_err;
-} sal_log_platform_err_info_t;
-
-/* SAL log over-all, multi-section error record structure (processor+platform) */
-typedef struct err_rec {
-	sal_log_record_header_t sal_elog_header;
-	sal_log_processor_info_t proc_err;
-	sal_log_platform_err_info_t plat_err;
-	u8 oem_data_pad[1024];
-} ia64_err_rec_t;
-
-/*
- * Now define a couple of inline functions for improved type checking
- * and convenience.
- */
-
-extern s64 ia64_sal_cache_flush (u64 cache_type);
-extern void __init check_sal_cache_flush (void);
-
-/* Initialize all the processor and platform level instruction and data caches */
-static inline s64
-ia64_sal_cache_init (void)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_CACHE_INIT, 0, 0, 0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Clear the processor and platform information logged by SAL with respect to the machine
- * state at the time of MCA's, INITs, CMCs, or CPEs.
- */
-static inline s64
-ia64_sal_clear_state_info (u64 sal_info_type)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_REENTRANT(isrv, SAL_CLEAR_STATE_INFO, sal_info_type, 0,
-	              0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-
-/* Get the processor and platform information logged by SAL with respect to the machine
- * state at the time of the MCAs, INITs, CMCs, or CPEs.
- */
-static inline u64
-ia64_sal_get_state_info (u64 sal_info_type, u64 *sal_info)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO, sal_info_type, 0,
-	              sal_info, 0, 0, 0, 0);
-	if (isrv.status)
-		return 0;
-
-	return isrv.v0;
-}
-
-/*
- * Get the maximum size of the information logged by SAL with respect to the machine state
- * at the time of MCAs, INITs, CMCs, or CPEs.
- */
-static inline u64
-ia64_sal_get_state_info_size (u64 sal_info_type)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_REENTRANT(isrv, SAL_GET_STATE_INFO_SIZE, sal_info_type, 0,
-	              0, 0, 0, 0, 0);
-	if (isrv.status)
-		return 0;
-	return isrv.v0;
-}
-
-/*
- * Causes the processor to go into a spin loop within SAL where SAL awaits a wakeup from
- * the monarch processor.  Must not lock, because it will not return on any cpu until the
- * monarch processor sends a wake up.
- */
-static inline s64
-ia64_sal_mc_rendez (void)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL_NOLOCK(isrv, SAL_MC_RENDEZ, 0, 0, 0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Allow the OS to specify the interrupt number to be used by SAL to interrupt OS during
- * the machine check rendezvous sequence as well as the mechanism to wake up the
- * non-monarch processor at the end of machine check processing.
- * Returns the complete ia64_sal_retval because some calls return more than just a status
- * value.
- */
-static inline struct ia64_sal_retval
-ia64_sal_mc_set_params (u64 param_type, u64 i_or_m, u64 i_or_m_val, u64 timeout, u64 rz_always)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_MC_SET_PARAMS, param_type, i_or_m, i_or_m_val,
-		 timeout, rz_always, 0, 0);
-	return isrv;
-}
-
-/* Read from PCI configuration space */
-static inline s64
-ia64_sal_pci_config_read (u64 pci_config_addr, int type, u64 size, u64 *value)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_PCI_CONFIG_READ, pci_config_addr, size, type, 0, 0, 0, 0);
-	if (value)
-		*value = isrv.v0;
-	return isrv.status;
-}
-
-/* Write to PCI configuration space */
-static inline s64
-ia64_sal_pci_config_write (u64 pci_config_addr, int type, u64 size, u64 value)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_PCI_CONFIG_WRITE, pci_config_addr, size, value,
-	         type, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Register physical addresses of locations needed by SAL when SAL procedures are invoked
- * in virtual mode.
- */
-static inline s64
-ia64_sal_register_physical_addr (u64 phys_entry, u64 phys_addr)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_REGISTER_PHYSICAL_ADDR, phys_entry, phys_addr,
-	         0, 0, 0, 0, 0);
-	return isrv.status;
-}
-
-/*
- * Register software dependent code locations within SAL. These locations are handlers or
- * entry points where SAL will pass control for the specified event. These event handlers
- * are for the bott rendezvous, MCAs and INIT scenarios.
- */
-static inline s64
-ia64_sal_set_vectors (u64 vector_type,
-		      u64 handler_addr1, u64 gp1, u64 handler_len1,
-		      u64 handler_addr2, u64 gp2, u64 handler_len2)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_SET_VECTORS, vector_type,
-			handler_addr1, gp1, handler_len1,
-			handler_addr2, gp2, handler_len2);
-
-	return isrv.status;
-}
-
-/* Update the contents of PAL block in the non-volatile storage device */
-static inline s64
-ia64_sal_update_pal (u64 param_buf, u64 scratch_buf, u64 scratch_buf_size,
-		     u64 *error_code, u64 *scratch_buf_size_needed)
-{
-	struct ia64_sal_retval isrv;
-	SAL_CALL(isrv, SAL_UPDATE_PAL, param_buf, scratch_buf, scratch_buf_size,
-	         0, 0, 0, 0);
-	if (error_code)
-		*error_code = isrv.v0;
-	if (scratch_buf_size_needed)
-		*scratch_buf_size_needed = isrv.v1;
-	return isrv.status;
-}
-
-/* Get physical processor die mapping in the platform. */
-static inline s64
-ia64_sal_physical_id_info(u16 *splid)
-{
-	struct ia64_sal_retval isrv;
-
-	if (sal_revision < SAL_VERSION_CODE(3,2))
-		return -1;
-
-	SAL_CALL(isrv, SAL_PHYSICAL_ID_INFO, 0, 0, 0, 0, 0, 0, 0);
-	if (splid)
-		*splid = isrv.v0;
-	return isrv.status;
-}
-
-extern unsigned long sal_platform_features;
-
-extern int (*salinfo_platform_oemdata)(const u8 *, u8 **, u64 *);
-
-struct sal_ret_values {
-	long r8; long r9; long r10; long r11;
-};
-
-#define IA64_SAL_OEMFUNC_MIN		0x02000000
-#define IA64_SAL_OEMFUNC_MAX		0x03ffffff
-
-extern int ia64_sal_oemcall(struct ia64_sal_retval *, u64, u64, u64, u64, u64,
-			    u64, u64, u64);
-extern int ia64_sal_oemcall_nolock(struct ia64_sal_retval *, u64, u64, u64,
-				   u64, u64, u64, u64, u64);
-extern int ia64_sal_oemcall_reentrant(struct ia64_sal_retval *, u64, u64, u64,
-				      u64, u64, u64, u64, u64);
-extern long
-ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
-		    unsigned long *drift_info);
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * System Abstraction Layer Specification
- * Section 3.2.5.1: OS_BOOT_RENDEZ to SAL return State.
- * Note: region regs are stored first in head.S _start. Hence they must
- * stay up front.
- */
-struct sal_to_os_boot {
-	u64 rr[8];		/* Region Registers */
-	u64 br[6];		/* br0:
-				 * return addr into SAL boot rendez routine */
-	u64 gr1;		/* SAL:GP */
-	u64 gr12;		/* SAL:SP */
-	u64 gr13;		/* SAL: Task Pointer */
-	u64 fpsr;
-	u64 pfs;
-	u64 rnat;
-	u64 unat;
-	u64 bspstore;
-	u64 dcr;		/* Default Control Register */
-	u64 iva;
-	u64 pta;
-	u64 itv;
-	u64 pmv;
-	u64 cmcv;
-	u64 lrr[2];
-	u64 gr[4];
-	u64 pr;			/* Predicate registers */
-	u64 lc;			/* Loop Count */
-	struct ia64_fpreg fp[20];
-};
-
-/*
- * Global array allocated for NR_CPUS at boot time
- */
-extern struct sal_to_os_boot sal_boot_rendez_state[NR_CPUS];
-
-extern void ia64_jump_to_sal(struct sal_to_os_boot *);
-#endif
-
-extern void ia64_sal_handler_init(void *entry_point, void *gpval);
-
-#define PALO_MAX_TLB_PURGES	0xFFFF
-#define PALO_SIG	"PALO"
-
-struct palo_table {
-	u8  signature[4];	/* Should be "PALO" */
-	u32 length;
-	u8  minor_revision;
-	u8  major_revision;
-	u8  checksum;
-	u8  reserved1[5];
-	u16 max_tlb_purges;
-	u8  reserved2[6];
-};
-
-#define NPTCG_FROM_PAL			0
-#define NPTCG_FROM_PALO			1
-#define NPTCG_FROM_KERNEL_PARAMETER	2
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_IA64_SAL_H */
diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h
deleted file mode 100644
index 8e0875cf6071..000000000000
--- a/arch/ia64/include/asm/sections.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SECTIONS_H
-#define _ASM_IA64_SECTIONS_H
-
-/*
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/elf.h>
-#include <linux/uaccess.h>
-
-typedef struct fdesc func_desc_t;
-
-#include <asm-generic/sections.h>
-
-extern char __phys_per_cpu_start[];
-#ifdef	CONFIG_SMP
-extern char __cpu0_per_cpu[];
-#endif
-extern char __start___vtop_patchlist[], __end___vtop_patchlist[];
-extern char __start___rse_patchlist[], __end___rse_patchlist[];
-extern char __start___mckinley_e9_bundles[], __end___mckinley_e9_bundles[];
-extern char __start___phys_stack_reg_patchlist[], __end___phys_stack_reg_patchlist[];
-extern char __start_gate_section[];
-extern char __start_gate_mckinley_e9_patchlist[], __end_gate_mckinley_e9_patchlist[];
-extern char __start_gate_vtop_patchlist[], __end_gate_vtop_patchlist[];
-extern char __start_gate_fsyscall_patchlist[], __end_gate_fsyscall_patchlist[];
-extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_bubble_down_patchlist[];
-extern char __start_unwind[], __end_unwind[];
-extern char __start_ivt_text[], __end_ivt_text[];
-
-#endif /* _ASM_IA64_SECTIONS_H */
diff --git a/arch/ia64/include/asm/serial.h b/arch/ia64/include/asm/serial.h
deleted file mode 100644
index 068be11583df..000000000000
--- a/arch/ia64/include/asm/serial.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Derived from the i386 version.
- */
-
-/*
- * This assumes you have a 1.8432 MHz clock for your UART.
- *
- * It'd be nice if someone built a serial card with a 24.576 MHz
- * clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
- */
-#define BASE_BAUD ( 1843200 / 16 )
-
-/*
- * All legacy serial ports should be enumerated via ACPI namespace, so
- * we need not list them here.
- */
diff --git a/arch/ia64/include/asm/shmparam.h b/arch/ia64/include/asm/shmparam.h
deleted file mode 100644
index 43bd8324ab71..000000000000
--- a/arch/ia64/include/asm/shmparam.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SHMPARAM_H
-#define _ASM_IA64_SHMPARAM_H
-
-/*
- * SHMLBA controls minimum alignment at which shared memory segments
- * get attached.  The IA-64 architecture says that there may be a
- * performance degradation when there are virtual aliases within 1MB.
- * To reduce the chance of this, we set SHMLBA to 1MB. --davidm 00/12/20
- */
-#define	SHMLBA	(1024*1024)
-
-#endif /* _ASM_IA64_SHMPARAM_H */
diff --git a/arch/ia64/include/asm/signal.h b/arch/ia64/include/asm/signal.h
deleted file mode 100644
index 80f067f9b3ce..000000000000
--- a/arch/ia64/include/asm/signal.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Modified 1998-2001, 2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- *
- * Unfortunately, this file is being included by bits/signal.h in
- * glibc-2.x.  Hence the #ifdef __KERNEL__ ugliness.
- */
-#ifndef _ASM_IA64_SIGNAL_H
-#define _ASM_IA64_SIGNAL_H
-
-#include <uapi/asm/signal.h>
-
-
-#define _NSIG		64
-#define _NSIG_BPW	64
-#define _NSIG_WORDS	(_NSIG / _NSIG_BPW)
-
-# ifndef __ASSEMBLY__
-
-/* Most things should be clean enough to redefine this at will, if care
-   is taken to make libc match.  */
-
-typedef unsigned long old_sigset_t;
-
-typedef struct {
-	unsigned long sig[_NSIG_WORDS];
-} sigset_t;
-
-#  include <asm/sigcontext.h>
-
-# endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_SIGNAL_H */
diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
deleted file mode 100644
index aa92234c0142..000000000000
--- a/arch/ia64/include/asm/smp.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * SMP Support
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * (c) Copyright 2001-2003, 2005 Hewlett-Packard Development Company, L.P.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-#ifndef _ASM_IA64_SMP_H
-#define _ASM_IA64_SMP_H
-
-#include <linux/init.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/cpumask.h>
-#include <linux/bitops.h>
-#include <linux/irqreturn.h>
-
-#include <asm/param.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-
-static inline unsigned int
-ia64_get_lid (void)
-{
-	union {
-		struct {
-			unsigned long reserved : 16;
-			unsigned long eid : 8;
-			unsigned long id : 8;
-			unsigned long ignored : 32;
-		} f;
-		unsigned long bits;
-	} lid;
-
-	lid.bits = ia64_getreg(_IA64_REG_CR_LID);
-	return lid.f.id << 8 | lid.f.eid;
-}
-
-#define hard_smp_processor_id()		ia64_get_lid()
-
-#ifdef CONFIG_SMP
-
-#define raw_smp_processor_id() (current_thread_info()->cpu)
-
-extern struct smp_boot_data {
-	int cpu_count;
-	int cpu_phys_id[NR_CPUS];
-} smp_boot_data __initdata;
-
-extern char no_int_routing;
-
-extern cpumask_t cpu_core_map[NR_CPUS];
-DECLARE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
-extern int smp_num_siblings;
-extern void __iomem *ipi_base_addr;
-
-extern volatile int ia64_cpu_to_sapicid[];
-#define cpu_physical_id(i)	ia64_cpu_to_sapicid[i]
-
-extern unsigned long ap_wakeup_vector;
-
-/*
- * Function to map hard smp processor id to logical id.  Slow, so don't use this in
- * performance-critical code.
- */
-static inline int
-cpu_logical_id (int cpuid)
-{
-	int i;
-
-	for (i = 0; i < NR_CPUS; ++i)
-		if (cpu_physical_id(i) == cpuid)
-			break;
-	return i;
-}
-
-/* Upping and downing of CPUs */
-extern int __cpu_disable (void);
-extern void __cpu_die (unsigned int cpu);
-extern void cpu_die (void) __attribute__ ((noreturn));
-extern void __init smp_build_cpu_map(void);
-
-extern void __init init_smp_config (void);
-extern void smp_do_timer (struct pt_regs *regs);
-
-extern irqreturn_t handle_IPI(int irq, void *dev_id);
-extern void smp_send_reschedule (int cpu);
-extern void identify_siblings (struct cpuinfo_ia64 *);
-extern int is_multithreading_enabled(void);
-
-extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
-
-#else /* CONFIG_SMP */
-
-#define cpu_logical_id(i)		0
-#define cpu_physical_id(i)		ia64_get_lid()
-
-#endif /* CONFIG_SMP */
-#endif /* _ASM_IA64_SMP_H */
diff --git a/arch/ia64/include/asm/sn/intr.h b/arch/ia64/include/asm/sn/intr.h
deleted file mode 100644
index 3885a77b21df..000000000000
--- a/arch/ia64/include/asm/sn/intr.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1992 - 1997, 2000-2006 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef _ASM_IA64_SN_INTR_H
-#define _ASM_IA64_SN_INTR_H
-
-#define SGI_XPC_ACTIVATE	0x30
-#define SGI_XPC_NOTIFY		0xe7
-
-#endif /* _ASM_IA64_SN_INTR_H */
diff --git a/arch/ia64/include/asm/sn/sn_sal.h b/arch/ia64/include/asm/sn/sn_sal.h
deleted file mode 100644
index d437aa43343b..000000000000
--- a/arch/ia64/include/asm/sn/sn_sal.h
+++ /dev/null
@@ -1,124 +0,0 @@
-#ifndef _ASM_IA64_SN_SN_SAL_H
-#define _ASM_IA64_SN_SN_SAL_H
-
-/*
- * System Abstraction Layer definitions for IA64
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.  All rights reserved.
- */
-
-#include <linux/types.h>
-#include <asm/sal.h>
-
-// SGI Specific Calls
-#define  SN_SAL_GET_PARTITION_ADDR		   0x02000009
-#define  SN_SAL_MEMPROTECT                         0x0200003e
-
-#define  SN_SAL_WATCHLIST_ALLOC			   0x02000070
-#define  SN_SAL_WATCHLIST_FREE			   0x02000071
-
-/*
- * SAL Error Codes
- */
-#define SALRET_MORE_PASSES	1
-#define SALRET_OK		0
-#define SALRET_NOT_IMPLEMENTED	(-1)
-#define SALRET_INVALID_ARG	(-2)
-#define SALRET_ERROR		(-3)
-
-/*
- * Returns the physical address of the partition's reserved page through
- * an iterative number of calls.
- *
- * On first call, 'cookie' and 'len' should be set to 0, and 'addr'
- * set to the nasid of the partition whose reserved page's address is
- * being sought.
- * On subsequent calls, pass the values, that were passed back on the
- * previous call.
- *
- * While the return status equals SALRET_MORE_PASSES, keep calling
- * this function after first copying 'len' bytes starting at 'addr'
- * into 'buf'. Once the return status equals SALRET_OK, 'addr' will
- * be the physical address of the partition's reserved page. If the
- * return status equals neither of these, an error as occurred.
- */
-static inline s64
-sn_partition_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
-{
-	struct ia64_sal_retval rv;
-	ia64_sal_oemcall_reentrant(&rv, SN_SAL_GET_PARTITION_ADDR, *cookie,
-				   *addr, buf, *len, 0, 0, 0);
-	*cookie = rv.v0;
-	*addr = rv.v1;
-	*len = rv.v2;
-	return rv.status;
-}
-
-/*
- * Change memory access protections for a physical address range.
- * nasid_array is not used on Altix, but may be in future architectures.
- * Available memory protection access classes are defined after the function.
- */
-static inline int
-sn_change_memprotect(u64 paddr, u64 len, u64 perms, u64 *nasid_array)
-{
-	struct ia64_sal_retval ret_stuff;
-
-	ia64_sal_oemcall_nolock(&ret_stuff, SN_SAL_MEMPROTECT, paddr, len,
-				(u64)nasid_array, perms, 0, 0, 0);
-	return ret_stuff.status;
-}
-#define SN_MEMPROT_ACCESS_CLASS_0		0x14a080
-#define SN_MEMPROT_ACCESS_CLASS_1		0x2520c2
-#define SN_MEMPROT_ACCESS_CLASS_2		0x14a1ca
-#define SN_MEMPROT_ACCESS_CLASS_3		0x14a290
-#define SN_MEMPROT_ACCESS_CLASS_6		0x084080
-#define SN_MEMPROT_ACCESS_CLASS_7		0x021080
-
-union sn_watchlist_u {
-	u64     val;
-	struct {
-		u64	blade	: 16,
-			size	: 32,
-			filler	: 16;
-	};
-};
-
-static inline int
-sn_mq_watchlist_alloc(int blade, void *mq, unsigned int mq_size,
-				unsigned long *intr_mmr_offset)
-{
-	struct ia64_sal_retval rv;
-	unsigned long addr;
-	union sn_watchlist_u size_blade;
-	int watchlist;
-
-	addr = (unsigned long)mq;
-	size_blade.size = mq_size;
-	size_blade.blade = blade;
-
-	/*
-	 * bios returns watchlist number or negative error number.
-	 */
-	ia64_sal_oemcall_nolock(&rv, SN_SAL_WATCHLIST_ALLOC, addr,
-			size_blade.val, (u64)intr_mmr_offset,
-			(u64)&watchlist, 0, 0, 0);
-	if (rv.status < 0)
-		return rv.status;
-
-	return watchlist;
-}
-
-static inline int
-sn_mq_watchlist_free(int blade, int watchlist_num)
-{
-	struct ia64_sal_retval rv;
-	ia64_sal_oemcall_nolock(&rv, SN_SAL_WATCHLIST_FREE, blade,
-			watchlist_num, 0, 0, 0, 0, 0);
-	return rv.status;
-}
-#endif /* _ASM_IA64_SN_SN_SAL_H */
diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h
deleted file mode 100644
index a58f8b466d96..000000000000
--- a/arch/ia64/include/asm/sparsemem.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SPARSEMEM_H
-#define _ASM_IA64_SPARSEMEM_H
-
-#ifdef CONFIG_SPARSEMEM
-#include <asm/page.h>
-/*
- * SECTION_SIZE_BITS            2^N: how big each section will be
- * MAX_PHYSMEM_BITS             2^N: how much memory we can have in that space
- */
-
-#define SECTION_SIZE_BITS	(30)
-#define MAX_PHYSMEM_BITS	(50)
-#ifdef CONFIG_ARCH_FORCE_MAX_ORDER
-#if (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT > SECTION_SIZE_BITS)
-#undef SECTION_SIZE_BITS
-#define SECTION_SIZE_BITS (CONFIG_ARCH_FORCE_MAX_ORDER + PAGE_SHIFT)
-#endif
-#endif
-
-#endif /* CONFIG_SPARSEMEM */
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 addr);
-#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-#endif
-
-#endif /* _ASM_IA64_SPARSEMEM_H */
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
deleted file mode 100644
index 0e5c1ad3239c..000000000000
--- a/arch/ia64/include/asm/spinlock.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SPINLOCK_H
-#define _ASM_IA64_SPINLOCK_H
-
-/*
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- *
- * This file is used for SMP configurations only.
- */
-
-#include <linux/compiler.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-
-#include <linux/atomic.h>
-#include <asm/intrinsics.h>
-#include <asm/barrier.h>
-#include <asm/processor.h>
-
-#define arch_spin_lock_init(x)			((x)->lock = 0)
-
-/*
- * Ticket locks are conceptually two parts, one indicating the current head of
- * the queue, and the other indicating the current tail. The lock is acquired
- * by atomically noting the tail and incrementing it by one (thus adding
- * ourself to the queue and noting our position), then waiting until the head
- * becomes equal to the initial value of the tail.
- * The pad bits in the middle are used to prevent the next_ticket number
- * overflowing into the now_serving number.
- *
- *   31             17  16    15  14                    0
- *  +----------------------------------------------------+
- *  |  now_serving     | padding |   next_ticket         |
- *  +----------------------------------------------------+
- */
-
-#define TICKET_SHIFT	17
-#define TICKET_BITS	15
-#define	TICKET_MASK	((1 << TICKET_BITS) - 1)
-
-static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
-{
-	int	*p = (int *)&lock->lock, ticket, serve;
-
-	ticket = ia64_fetchadd(1, p, acq);
-
-	if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
-		return;
-
-	ia64_invala();
-
-	for (;;) {
-		asm volatile ("ld4.c.nc %0=[%1]" : "=r"(serve) : "r"(p) : "memory");
-
-		if (!(((serve >> TICKET_SHIFT) ^ ticket) & TICKET_MASK))
-			return;
-		cpu_relax();
-	}
-}
-
-static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
-{
-	int tmp = READ_ONCE(lock->lock);
-
-	if (!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK))
-		return ia64_cmpxchg(acq, &lock->lock, tmp, tmp + 1, sizeof (tmp)) == tmp;
-	return 0;
-}
-
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
-{
-	unsigned short	*p = (unsigned short *)&lock->lock + 1, tmp;
-
-	/* This could be optimised with ARCH_HAS_MMIOWB */
-	mmiowb();
-	asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p));
-	WRITE_ONCE(*p, (tmp + 2) & ~1);
-}
-
-static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
-{
-	long tmp = READ_ONCE(lock->lock);
-
-	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK);
-}
-
-static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
-{
-	long tmp = READ_ONCE(lock->lock);
-
-	return ((tmp - (tmp >> TICKET_SHIFT)) & TICKET_MASK) > 1;
-}
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return !(((lock.lock >> TICKET_SHIFT) ^ lock.lock) & TICKET_MASK);
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
-	return __ticket_spin_is_locked(lock);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
-	return __ticket_spin_is_contended(lock);
-}
-#define arch_spin_is_contended	arch_spin_is_contended
-
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	__ticket_spin_lock(lock);
-}
-
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	return __ticket_spin_trylock(lock);
-}
-
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	__ticket_spin_unlock(lock);
-}
-
-#ifdef ASM_SUPPORTED
-
-static __always_inline void
-arch_read_lock(arch_rwlock_t *lock)
-{
-	unsigned long flags = 0;
-
-	__asm__ __volatile__ (
-		"tbit.nz p6, p0 = %1,%2\n"
-		"br.few 3f\n"
-		"1:\n"
-		"fetchadd4.rel r2 = [%0], -1;;\n"
-		"(p6) ssm psr.i\n"
-		"2:\n"
-		"hint @pause\n"
-		"ld4 r2 = [%0];;\n"
-		"cmp4.lt p7,p0 = r2, r0\n"
-		"(p7) br.cond.spnt.few 2b\n"
-		"(p6) rsm psr.i\n"
-		";;\n"
-		"3:\n"
-		"fetchadd4.acq r2 = [%0], 1;;\n"
-		"cmp4.lt p7,p0 = r2, r0\n"
-		"(p7) br.cond.spnt.few 1b\n"
-		: : "r"(lock), "r"(flags), "i"(IA64_PSR_I_BIT)
-		: "p6", "p7", "r2", "memory");
-}
-
-#else /* !ASM_SUPPORTED */
-
-#define arch_read_lock(rw)								\
-do {											\
-	arch_rwlock_t *__read_lock_ptr = (rw);						\
-											\
-	while (unlikely(ia64_fetchadd(1, (int *) __read_lock_ptr, acq) < 0)) {		\
-		ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);			\
-		while (*(volatile int *)__read_lock_ptr < 0)				\
-			cpu_relax();							\
-	}										\
-} while (0)
-
-#endif /* !ASM_SUPPORTED */
-
-#define arch_read_unlock(rw)					\
-do {								\
-	arch_rwlock_t *__read_lock_ptr = (rw);			\
-	ia64_fetchadd(-1, (int *) __read_lock_ptr, rel);	\
-} while (0)
-
-#ifdef ASM_SUPPORTED
-
-static __always_inline void
-arch_write_lock(arch_rwlock_t *lock)
-{
-	unsigned long flags = 0;
-
-	__asm__ __volatile__ (
-		"tbit.nz p6, p0 = %1, %2\n"
-		"mov ar.ccv = r0\n"
-		"dep r29 = -1, r0, 31, 1\n"
-		"br.few 3f;;\n"
-		"1:\n"
-		"(p6) ssm psr.i\n"
-		"2:\n"
-		"hint @pause\n"
-		"ld4 r2 = [%0];;\n"
-		"cmp4.eq p0,p7 = r0, r2\n"
-		"(p7) br.cond.spnt.few 2b\n"
-		"(p6) rsm psr.i\n"
-		";;\n"
-		"3:\n"
-		"cmpxchg4.acq r2 = [%0], r29, ar.ccv;;\n"
-		"cmp4.eq p0,p7 = r0, r2\n"
-		"(p7) br.cond.spnt.few 1b;;\n"
-		: : "r"(lock), "r"(flags), "i"(IA64_PSR_I_BIT)
-		: "ar.ccv", "p6", "p7", "r2", "r29", "memory");
-}
-
-#define arch_write_trylock(rw)							\
-({										\
-	register long result;							\
-										\
-	__asm__ __volatile__ (							\
-		"mov ar.ccv = r0\n"						\
-		"dep r29 = -1, r0, 31, 1;;\n"					\
-		"cmpxchg4.acq %0 = [%1], r29, ar.ccv\n"				\
-		: "=r"(result) : "r"(rw) : "ar.ccv", "r29", "memory");		\
-	(result == 0);								\
-})
-
-static inline void arch_write_unlock(arch_rwlock_t *x)
-{
-	u8 *y = (u8 *)x;
-	barrier();
-	asm volatile ("st1.rel.nta [%0] = r0\n\t" :: "r"(y+3) : "memory" );
-}
-
-#else /* !ASM_SUPPORTED */
-
-#define arch_write_lock(l)								\
-({											\
-	__u64 ia64_val, ia64_set_val = ia64_dep_mi(-1, 0, 31, 1);			\
-	__u32 *ia64_write_lock_ptr = (__u32 *) (l);					\
-	do {										\
-		while (*ia64_write_lock_ptr)						\
-			ia64_barrier();							\
-		ia64_val = ia64_cmpxchg4_acq(ia64_write_lock_ptr, ia64_set_val, 0);	\
-	} while (ia64_val);								\
-})
-
-#define arch_write_trylock(rw)						\
-({									\
-	__u64 ia64_val;							\
-	__u64 ia64_set_val = ia64_dep_mi(-1, 0, 31,1);			\
-	ia64_val = ia64_cmpxchg4_acq((__u32 *)(rw), ia64_set_val, 0);	\
-	(ia64_val == 0);						\
-})
-
-static inline void arch_write_unlock(arch_rwlock_t *x)
-{
-	barrier();
-	x->write_lock = 0;
-}
-
-#endif /* !ASM_SUPPORTED */
-
-static inline int arch_read_trylock(arch_rwlock_t *x)
-{
-	union {
-		arch_rwlock_t lock;
-		__u32 word;
-	} old, new;
-	old.lock = new.lock = *x;
-	old.lock.write_lock = new.lock.write_lock = 0;
-	++new.lock.read_counter;
-	return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word;
-}
-
-#endif /*  _ASM_IA64_SPINLOCK_H */
diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h
deleted file mode 100644
index 14b8a161c165..000000000000
--- a/arch/ia64/include/asm/spinlock_types.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_SPINLOCK_TYPES_H
-#define _ASM_IA64_SPINLOCK_TYPES_H
-
-#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
-# error "please don't include this file directly"
-#endif
-
-typedef struct {
-	volatile unsigned int lock;
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
-
-typedef struct {
-	volatile unsigned int read_counter	: 31;
-	volatile unsigned int write_lock	:  1;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ 0, 0 }
-
-#endif
diff --git a/arch/ia64/include/asm/string.h b/arch/ia64/include/asm/string.h
deleted file mode 100644
index 8b84df0dbfad..000000000000
--- a/arch/ia64/include/asm/string.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_STRING_H
-#define _ASM_IA64_STRING_H
-
-/*
- * Here is where we want to put optimized versions of the string
- * routines.
- *
- * Copyright (C) 1998-2000, 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#define __HAVE_ARCH_STRLEN	1 /* see arch/ia64/lib/strlen.S */
-#define __HAVE_ARCH_MEMSET	1 /* see arch/ia64/lib/memset.S */
-#define __HAVE_ARCH_MEMCPY	1 /* see arch/ia64/lib/memcpy.S */
-
-extern __kernel_size_t strlen (const char *);
-extern void *memcpy (void *, const void *, __kernel_size_t);
-extern void *memset (void *, int, __kernel_size_t);
-
-#endif /* _ASM_IA64_STRING_H */
diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h
deleted file mode 100644
index a5a4e09468fa..000000000000
--- a/arch/ia64/include/asm/switch_to.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Low-level task switching. This is based on information published in
- * the Processor Abstraction Layer and the System Abstraction Layer
- * manual.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-#ifndef _ASM_IA64_SWITCH_TO_H
-#define _ASM_IA64_SWITCH_TO_H
-
-#include <linux/percpu.h>
-
-struct task_struct;
-
-/*
- * Context switch from one thread to another.  If the two threads have
- * different address spaces, schedule() has already taken care of
- * switching to the new address space by calling switch_mm().
- *
- * Disabling access to the fph partition and the debug-register
- * context switch MUST be done before calling ia64_switch_to() since a
- * newly created thread returns directly to
- * ia64_ret_from_syscall_clear_r8.
- */
-extern struct task_struct *ia64_switch_to (void *next_task);
-
-extern void ia64_save_extra (struct task_struct *task);
-extern void ia64_load_extra (struct task_struct *task);
-
-#define IA64_HAS_EXTRA_STATE(t)							\
-	((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID))
-
-#define __switch_to(prev,next,last) do {							 \
-	if (IA64_HAS_EXTRA_STATE(prev))								 \
-		ia64_save_extra(prev);								 \
-	if (IA64_HAS_EXTRA_STATE(next))								 \
-		ia64_load_extra(next);								 \
-	ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next);			 \
-	(last) = ia64_switch_to((next));							 \
-} while (0)
-
-#ifdef CONFIG_SMP
-/*
- * In the SMP case, we save the fph state when context-switching away from a thread that
- * modified fph.  This way, when the thread gets scheduled on another CPU, the CPU can
- * pick up the state from task->thread.fph, avoiding the complication of having to fetch
- * the latest fph state from another CPU.  In other words: eager save, lazy restore.
- */
-# define switch_to(prev,next,last) do {						\
-	if (ia64_psr(task_pt_regs(prev))->mfh && ia64_is_local_fpu_owner(prev)) {				\
-		ia64_psr(task_pt_regs(prev))->mfh = 0;			\
-		(prev)->thread.flags |= IA64_THREAD_FPH_VALID;			\
-		__ia64_save_fpu((prev)->thread.fph);				\
-	}									\
-	__switch_to(prev, next, last);						\
-	/* "next" in old context is "current" in new context */			\
-	if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) &&	       \
-		     (task_cpu(current) !=				       \
-		      		      task_thread_info(current)->last_cpu))) { \
-		task_thread_info(current)->last_cpu = task_cpu(current);       \
-	}								       \
-} while (0)
-#else
-# define switch_to(prev,next,last)	__switch_to(prev, next, last)
-#endif
-
-#endif /* _ASM_IA64_SWITCH_TO_H */
diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h
deleted file mode 100644
index 2b02a3fb862a..000000000000
--- a/arch/ia64/include/asm/syscall.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Access to user system call parameters and results
- *
- * Copyright (C) 2008 Intel Corp.  Shaohua Li <shaohua.li@intel.com>
- *
- * See asm-generic/syscall.h for descriptions of what we must do here.
- */
-
-#ifndef _ASM_SYSCALL_H
-#define _ASM_SYSCALL_H	1
-
-#include <uapi/linux/audit.h>
-#include <linux/sched.h>
-#include <linux/err.h>
-
-static inline long syscall_get_nr(struct task_struct *task,
-				  struct pt_regs *regs)
-{
-	if ((long)regs->cr_ifs < 0) /* Not a syscall */
-		return -1;
-
-	return regs->r15;
-}
-
-static inline void syscall_rollback(struct task_struct *task,
-				    struct pt_regs *regs)
-{
-	/* do nothing */
-}
-
-static inline long syscall_get_error(struct task_struct *task,
-				     struct pt_regs *regs)
-{
-	return regs->r10 == -1 ? -regs->r8:0;
-}
-
-static inline long syscall_get_return_value(struct task_struct *task,
-					    struct pt_regs *regs)
-{
-	return regs->r8;
-}
-
-static inline void syscall_set_return_value(struct task_struct *task,
-					    struct pt_regs *regs,
-					    int error, long val)
-{
-	if (error) {
-		/* error < 0, but ia64 uses > 0 return value */
-		regs->r8 = -error;
-		regs->r10 = -1;
-	} else {
-		regs->r8 = val;
-		regs->r10 = 0;
-	}
-}
-
-extern void syscall_get_arguments(struct task_struct *task,
-	struct pt_regs *regs, unsigned long *args);
-
-static inline int syscall_get_arch(struct task_struct *task)
-{
-	return AUDIT_ARCH_IA64;
-}
-#endif	/* _ASM_SYSCALL_H */
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
deleted file mode 100644
index 21b257117e0a..000000000000
--- a/arch/ia64/include/asm/thread_info.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _ASM_IA64_THREAD_INFO_H
-#define _ASM_IA64_THREAD_INFO_H
-
-#ifndef ASM_OFFSETS_C
-#include <asm/asm-offsets.h>
-#endif
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-
-#define THREAD_SIZE			KERNEL_STACK_SIZE
-
-#ifndef __ASSEMBLY__
-
-/*
- * On IA-64, we want to keep the task structure and kernel stack together, so they can be
- * mapped by a single TLB entry and so they can be addressed by the "current" pointer
- * without having to do pointer masking.
- */
-struct thread_info {
-	struct task_struct *task;	/* XXX not really needed, except for dup_task_struct() */
-	__u32 flags;			/* thread_info flags (see TIF_*) */
-	__u32 cpu;			/* current CPU */
-	__u32 last_cpu;			/* Last CPU thread ran on */
-	__u32 status;			/* Thread synchronous flags */
-	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	__u64 utime;
-	__u64 stime;
-	__u64 gtime;
-	__u64 hardirq_time;
-	__u64 softirq_time;
-	__u64 idle_time;
-	__u64 ac_stamp;
-	__u64 ac_leave;
-	__u64 ac_stime;
-	__u64 ac_utime;
-#endif
-};
-
-#define INIT_THREAD_INFO(tsk)			\
-{						\
-	.task		= &tsk,			\
-	.flags		= 0,			\
-	.cpu		= 0,			\
-	.preempt_count	= INIT_PREEMPT_COUNT,	\
-}
-
-#ifndef ASM_OFFSETS_C
-/* how to get the thread information struct from C */
-#define current_thread_info()	((struct thread_info *) ((char *) current + IA64_TASK_SIZE))
-#define arch_alloc_thread_stack_node(tsk, node)	\
-		((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE))
-#define task_thread_info(tsk)	((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE))
-#else
-#define current_thread_info()	((struct thread_info *) 0)
-#define arch_alloc_thread_stack_node(tsk, node)	((unsigned long *) 0)
-#define task_thread_info(tsk)	((struct thread_info *) 0)
-#endif
-#define arch_free_thread_stack(tsk)	/* nothing */
-#define task_stack_page(tsk)	((void *)(tsk))
-
-#define __HAVE_THREAD_FUNCTIONS
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#define setup_thread_stack(p, org)			\
-	*task_thread_info(p) = *task_thread_info(org);	\
-	task_thread_info(p)->ac_stime = 0;		\
-	task_thread_info(p)->ac_utime = 0;		\
-	task_thread_info(p)->task = (p);
-#else
-#define setup_thread_stack(p, org) \
-	*task_thread_info(p) = *task_thread_info(org); \
-	task_thread_info(p)->task = (p);
-#endif
-#define end_of_stack(p) (unsigned long *)((void *)(p) + IA64_RBS_OFFSET)
-
-#define alloc_task_struct_node(node)						\
-({										\
-	struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_COMP,	\
-					     KERNEL_STACK_SIZE_ORDER);		\
-	struct task_struct *ret = page ? page_address(page) : NULL;		\
-										\
-	ret;									\
-})
-#define free_task_struct(tsk)	free_pages((unsigned long) (tsk), KERNEL_STACK_SIZE_ORDER)
-
-#endif /* !__ASSEMBLY */
-
-/*
- * thread information flags
- * - these are process state flags that various assembly files may need to access
- * - pending work-to-be-done flags are in least-significant 16 bits, other flags
- *   in top 16 bits
- */
-#define TIF_SIGPENDING		0	/* signal pending */
-#define TIF_NEED_RESCHED	1	/* rescheduling necessary */
-#define TIF_SYSCALL_TRACE	2	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	3	/* syscall auditing active */
-#define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
-#define TIF_NOTIFY_SIGNAL	5	/* signal notification exist */
-#define TIF_NOTIFY_RESUME	6	/* resumption notification requested */
-#define TIF_MEMDIE		17	/* is terminating due to OOM killer */
-#define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
-#define TIF_DB_DISABLED		19	/* debug trap disabled for fsyscall */
-#define TIF_RESTORE_RSE		21	/* user RBS is newer than kernel RBS */
-#define TIF_POLLING_NRFLAG	22	/* idle is polling for TIF_NEED_RESCHED */
-
-#define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
-#define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
-#define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
-#define _TIF_SYSCALL_TRACEAUDIT	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP)
-#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
-#define _TIF_NOTIFY_SIGNAL	(1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
-#define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
-#define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
-#define _TIF_RESTORE_RSE	(1 << TIF_RESTORE_RSE)
-#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
-
-/* "work to do on user-return" bits */
-#define TIF_ALLWORK_MASK	(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
-				 _TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_NOTIFY_SIGNAL)
-/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
-#define TIF_WORK_MASK		(TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
-
-#endif /* _ASM_IA64_THREAD_INFO_H */
diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h
deleted file mode 100644
index 7ccc077a60be..000000000000
--- a/arch/ia64/include/asm/timex.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_TIMEX_H
-#define _ASM_IA64_TIMEX_H
-
-/*
- * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * 2001/01/18 davidm	Removed CLOCK_TICK_RATE.  It makes no sense on IA-64.
- *			Also removed cacheflush_time as it's entirely unused.
- */
-
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-
-typedef unsigned long cycles_t;
-
-extern void (*ia64_udelay)(unsigned long usecs);
-
-/*
- * For performance reasons, we don't want to define CLOCK_TICK_TRATE as
- * local_cpu_data->itc_rate.  Fortunately, we don't have to, either: according to George
- * Anzinger, 1/CLOCK_TICK_RATE is taken as the resolution of the timer clock.  The time
- * calculation assumes that you will use enough of these so that your tick size <= 1/HZ.
- * If the calculation shows that your CLOCK_TICK_RATE can not supply exactly 1/HZ ticks,
- * the actual value is calculated and used to update the wall clock each jiffie.  Setting
- * the CLOCK_TICK_RATE to x*HZ insures that the calculation will find no errors.  Hence we
- * pick a multiple of HZ which gives us a (totally virtual) CLOCK_TICK_RATE of about
- * 100MHz.
- */
-#define CLOCK_TICK_RATE		(HZ * 100000UL)
-
-static inline cycles_t
-get_cycles (void)
-{
-	cycles_t ret;
-
-	ret = ia64_getreg(_IA64_REG_AR_ITC);
-	return ret;
-}
-#define get_cycles get_cycles
-
-extern void ia64_cpu_local_tick (void);
-extern unsigned long long ia64_native_sched_clock (void);
-
-#endif /* _ASM_IA64_TIMEX_H */
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
deleted file mode 100644
index a15fe0809aae..000000000000
--- a/arch/ia64/include/asm/tlb.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_TLB_H
-#define _ASM_IA64_TLB_H
-/*
- * Based on <asm-generic/tlb.h>.
- *
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * Removing a translation from a page table (including TLB-shootdown) is a four-step
- * procedure:
- *
- *	(1) Flush (virtual) caches --- ensures virtual memory is coherent with kernel memory
- *	    (this is a no-op on ia64).
- *	(2) Clear the relevant portions of the page-table
- *	(3) Flush the TLBs --- ensures that stale content is gone from CPU TLBs
- *	(4) Release the pages that were freed up in step (2).
- *
- * Note that the ordering of these steps is crucial to avoid races on MP machines.
- *
- * The Linux kernel defines several platform-specific hooks for TLB-shootdown.  When
- * unmapping a portion of the virtual address space, these hooks are called according to
- * the following template:
- *
- *	tlb <- tlb_gather_mmu(mm);			// start unmap for address space MM
- *	{
- *	  for each vma that needs a shootdown do {
- *	    tlb_start_vma(tlb, vma);
- *	      for each page-table-entry PTE that needs to be removed do {
- *		tlb_remove_tlb_entry(tlb, pte, address);
- *		if (pte refers to a normal page) {
- *		  tlb_remove_page(tlb, page);
- *		}
- *	      }
- *	    tlb_end_vma(tlb, vma);
- *	  }
- *	}
- *	tlb_finish_mmu(tlb);				// finish unmap for address space MM
- */
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-
-#include <asm-generic/tlb.h>
-
-#endif /* _ASM_IA64_TLB_H */
diff --git a/arch/ia64/include/asm/tlbflush.h b/arch/ia64/include/asm/tlbflush.h
deleted file mode 100644
index ceac10c4d6e2..000000000000
--- a/arch/ia64/include/asm/tlbflush.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_TLBFLUSH_H
-#define _ASM_IA64_TLBFLUSH_H
-
-/*
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <linux/mm.h>
-
-#include <asm/intrinsics.h>
-#include <asm/mmu_context.h>
-#include <asm/page.h>
-
-struct ia64_tr_entry {
-	u64 ifa;
-	u64 itir;
-	u64 pte;
-	u64 rr;
-}; /*Record for tr entry!*/
-
-extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size);
-extern void ia64_ptr_entry(u64 target_mask, int slot);
-extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS];
-
-/*
- region register macros
-*/
-#define RR_TO_VE(val)   (((val) >> 0) & 0x0000000000000001)
-#define RR_VE(val)     (((val) & 0x0000000000000001) << 0)
-#define RR_VE_MASK     0x0000000000000001L
-#define RR_VE_SHIFT    0
-#define RR_TO_PS(val)  (((val) >> 2) & 0x000000000000003f)
-#define RR_PS(val)     (((val) & 0x000000000000003f) << 2)
-#define RR_PS_MASK     0x00000000000000fcL
-#define RR_PS_SHIFT    2
-#define RR_RID_MASK    0x00000000ffffff00L
-#define RR_TO_RID(val)         ((val >> 8) & 0xffffff)
-
-/*
- * Now for some TLB flushing routines.  This is the kind of stuff that
- * can be very expensive, so try to avoid them whenever possible.
- */
-extern void setup_ptcg_sem(int max_purges, int from_palo);
-
-/*
- * Flush everything (kernel mapping may also have changed due to
- * vmalloc/vfree).
- */
-extern void local_flush_tlb_all (void);
-
-#ifdef CONFIG_SMP
-  extern void smp_flush_tlb_all (void);
-  extern void smp_flush_tlb_mm (struct mm_struct *mm);
-  extern void smp_flush_tlb_cpumask (cpumask_t xcpumask);
-# define flush_tlb_all()	smp_flush_tlb_all()
-#else
-# define flush_tlb_all()	local_flush_tlb_all()
-# define smp_flush_tlb_cpumask(m) local_flush_tlb_all()
-#endif
-
-static inline void
-local_finish_flush_tlb_mm (struct mm_struct *mm)
-{
-	if (mm == current->active_mm)
-		activate_context(mm);
-}
-
-/*
- * Flush a specified user mapping.  This is called, e.g., as a result of fork() and
- * exit().  fork() ends up here because the copy-on-write mechanism needs to write-protect
- * the PTEs of the parent task.
- */
-static inline void
-flush_tlb_mm (struct mm_struct *mm)
-{
-	if (!mm)
-		return;
-
-	set_bit(mm->context, ia64_ctx.flushmap);
-	mm->context = 0;
-
-	if (atomic_read(&mm->mm_users) == 0)
-		return;		/* happens as a result of exit_mmap() */
-
-#ifdef CONFIG_SMP
-	smp_flush_tlb_mm(mm);
-#else
-	local_finish_flush_tlb_mm(mm);
-#endif
-}
-
-extern void flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end);
-
-/*
- * Page-granular tlb flush.
- */
-static inline void
-flush_tlb_page (struct vm_area_struct *vma, unsigned long addr)
-{
-#ifdef CONFIG_SMP
-	flush_tlb_range(vma, (addr & PAGE_MASK), (addr & PAGE_MASK) + PAGE_SIZE);
-#else
-	if (vma->vm_mm == current->active_mm)
-		ia64_ptcl(addr, (PAGE_SHIFT << 2));
-	else
-		vma->vm_mm->context = 0;
-#endif
-}
-
-/*
- * Flush the local TLB. Invoked from another cpu using an IPI.
- */
-#ifdef CONFIG_SMP
-void smp_local_flush_tlb(void);
-#else
-#define smp_local_flush_tlb()
-#endif
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-					  unsigned long end)
-{
-	flush_tlb_all();	/* XXX fix me */
-}
-
-#endif /* _ASM_IA64_TLBFLUSH_H */
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
deleted file mode 100644
index 43567240b0d6..000000000000
--- a/arch/ia64/include/asm/topology.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2002, Erich Focht, NEC
- *
- * All rights reserved.
- */
-#ifndef _ASM_IA64_TOPOLOGY_H
-#define _ASM_IA64_TOPOLOGY_H
-
-#include <asm/acpi.h>
-#include <asm/numa.h>
-#include <asm/smp.h>
-
-#ifdef CONFIG_NUMA
-
-/* Nodes w/o CPUs are preferred for memory allocations, see build_zonelists */
-#define PENALTY_FOR_NODE_WITH_CPUS 255
-
-/*
- * Nodes within this distance are eligible for reclaim by zone_reclaim() when
- * zone_reclaim_mode is enabled.
- */
-#define RECLAIM_DISTANCE 15
-
-/*
- * Returns a bitmask of CPUs on Node 'node'.
- */
-#define cpumask_of_node(node) ((node) == -1 ?				\
-			       cpu_all_mask :				\
-			       &node_to_cpu_mask[node])
-
-/*
- * Determines the node for a given pci bus
- */
-#define pcibus_to_node(bus) PCI_CONTROLLER(bus)->node
-
-void build_cpu_to_node_map(void);
-
-#endif /* CONFIG_NUMA */
-
-#ifdef CONFIG_SMP
-#define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
-#define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
-#define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
-#endif
-
-extern void arch_fix_phys_package_id(int num, u32 slot);
-
-#define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
-				 cpu_all_mask :				\
-				 cpumask_of_node(pcibus_to_node(bus)))
-
-#include <asm-generic/topology.h>
-
-#endif /* _ASM_IA64_TOPOLOGY_H */
diff --git a/arch/ia64/include/asm/types.h b/arch/ia64/include/asm/types.h
deleted file mode 100644
index 5ddc7703de99..000000000000
--- a/arch/ia64/include/asm/types.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file is never included by application software unless explicitly
- * requested (e.g., via linux/types.h) in which case the application is
- * Linux specific so (user-) name space pollution is not a major issue.
- * However, for interoperability, libraries still need to be careful to
- * avoid naming clashes.
- *
- * Based on <asm-alpha/types.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _ASM_IA64_TYPES_H
-#define _ASM_IA64_TYPES_H
-
-#include <asm-generic/int-ll64.h>
-#include <uapi/asm/types.h>
-
-#ifdef __ASSEMBLY__
-#else
-/*
- * These aren't exported outside the kernel to avoid name space clashes
- */
-
-struct fnptr {
-	unsigned long ip;
-	unsigned long gp;
-};
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_TYPES_H */
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
deleted file mode 100644
index 60adadeb3e9e..000000000000
--- a/arch/ia64/include/asm/uaccess.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_UACCESS_H
-#define _ASM_IA64_UACCESS_H
-
-/*
- * This file defines various macros to transfer memory areas across
- * the user/kernel boundary.  This needs to be done carefully because
- * this code is executed in kernel mode and uses user-specified
- * addresses.  Thus, we need to be careful not to let the user to
- * trick us into accessing kernel memory that would normally be
- * inaccessible.  This code is also fairly performance sensitive,
- * so we want to spend as little time doing safety checks as
- * possible.
- *
- * To make matters a bit more interesting, these macros sometimes also
- * called from within the kernel itself, in which case the address
- * validity check must be skipped.  The get_fs() macro tells us what
- * to do: if get_fs()==USER_DS, checking is performed, if
- * get_fs()==KERNEL_DS, checking is bypassed.
- *
- * Note that even if the memory area specified by the user is in a
- * valid address range, it is still possible that we'll get a page
- * fault while accessing it.  This is handled by filling out an
- * exception handler fixup entry for each instruction that has the
- * potential to fault.  When such a fault occurs, the page fault
- * handler checks to see whether the faulting instruction has a fixup
- * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
- * then resumes execution at the continuation point.
- *
- * Based on <asm-alpha/uaccess.h>.
- *
- * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/page-flags.h>
-
-#include <asm/intrinsics.h>
-#include <linux/pgtable.h>
-#include <asm/io.h>
-#include <asm/extable.h>
-
-/*
- * When accessing user memory, we need to make sure the entire area really is
- * in user-level space.  We also need to make sure that the address doesn't
- * point inside the virtually mapped linear page table.
- */
-static inline int __access_ok(const void __user *p, unsigned long size)
-{
-	unsigned long limit = TASK_SIZE;
-	unsigned long addr = (unsigned long)p;
-
-	return likely((size <= limit) && (addr <= (limit - size)) &&
-		 likely(REGION_OFFSET(addr) < RGN_MAP_LIMIT));
-}
-#define __access_ok __access_ok
-#include <asm-generic/access_ok.h>
-
-/*
- * These are the main single-value transfer routines.  They automatically
- * use the right size if we just have the right pointer type.
- *
- * Careful to not
- * (a) re-use the arguments for side effects (sizeof/typeof is ok)
- * (b) require any knowledge of processes at this stage
- */
-#define put_user(x, ptr)	__put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define get_user(x, ptr)	__get_user_check((x), (ptr), sizeof(*(ptr)))
-
-/*
- * The "__xxx" versions do not do address space checking, useful when
- * doing multiple accesses to the same area (the programmer has to do the
- * checks by hand with "access_ok()")
- */
-#define __put_user(x, ptr)	__put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define __get_user(x, ptr)	__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-#ifdef ASM_SUPPORTED
-  struct __large_struct { unsigned long buf[100]; };
-# define __m(x) (*(struct __large_struct __user *)(x))
-
-/* We need to declare the __ex_table section before we can use it in .xdata.  */
-asm (".section \"__ex_table\", \"a\"\n\t.previous");
-
-# define __get_user_size(val, addr, n, err)							\
-do {												\
-	register long __gu_r8 asm ("r8") = 0;							\
-	register long __gu_r9 asm ("r9");							\
-	asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"	\
-	     "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"						\
-	     "[1:]"										\
-	     : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));			\
-	(err) = __gu_r8;									\
-	(val) = __gu_r9;									\
-} while (0)
-
-/*
- * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it.  This
- * is because they do not write to any memory gcc knows about, so there are no aliasing
- * issues.
- */
-# define __put_user_size(val, addr, n, err)							\
-do {												\
-	register long __pu_r8 asm ("r8") = 0;							\
-	asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n"	\
-		      "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"					\
-		      "[1:]"									\
-		      : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));		\
-	(err) = __pu_r8;									\
-} while (0)
-
-#else /* !ASM_SUPPORTED */
-# define RELOC_TYPE	2	/* ip-rel */
-# define __get_user_size(val, addr, n, err)				\
-do {									\
-	__ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);	\
-	(err) = ia64_getreg(_IA64_REG_R8);				\
-	(val) = ia64_getreg(_IA64_REG_R9);				\
-} while (0)
-# define __put_user_size(val, addr, n, err)				\
-do {									\
-	__st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE,	\
-		  (__force unsigned long) (val));			\
-	(err) = ia64_getreg(_IA64_REG_R8);				\
-} while (0)
-#endif /* !ASM_SUPPORTED */
-
-extern void __get_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
- * using r8/r9.
- */
-#define __do_get_user(check, x, ptr, size)						\
-({											\
-	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);				\
-	__typeof__ (size) __gu_size = (size);						\
-	long __gu_err = -EFAULT;							\
-	unsigned long __gu_val = 0;							\
-	if (!check || __access_ok(__gu_ptr, size))					\
-		switch (__gu_size) {							\
-		      case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;	\
-		      case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;	\
-		      case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;	\
-		      case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;	\
-		      default: __get_user_unknown(); break;				\
-		}									\
-	(x) = (__force __typeof__(*(__gu_ptr))) __gu_val;				\
-	__gu_err;									\
-})
-
-#define __get_user_nocheck(x, ptr, size)	__do_get_user(0, x, ptr, size)
-#define __get_user_check(x, ptr, size)	__do_get_user(1, x, ptr, size)
-
-extern void __put_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
- */
-#define __do_put_user(check, x, ptr, size)						\
-({											\
-	__typeof__ (x) __pu_x = (x);							\
-	__typeof__ (*(ptr)) __user *__pu_ptr = (ptr);					\
-	__typeof__ (size) __pu_size = (size);						\
-	long __pu_err = -EFAULT;							\
-											\
-	if (!check || __access_ok(__pu_ptr, __pu_size))					\
-		switch (__pu_size) {							\
-		      case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;	\
-		      case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;	\
-		      case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;	\
-		      case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;	\
-		      default: __put_user_unknown(); break;				\
-		}									\
-	__pu_err;									\
-})
-
-#define __put_user_nocheck(x, ptr, size)	__do_put_user(0, x, ptr, size)
-#define __put_user_check(x, ptr, size)	__do_put_user(1, x, ptr, size)
-
-/*
- * Complex access routines
- */
-extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
-					       unsigned long count);
-
-static inline unsigned long
-raw_copy_to_user(void __user *to, const void *from, unsigned long count)
-{
-	return __copy_user(to, (__force void __user *) from, count);
-}
-
-static inline unsigned long
-raw_copy_from_user(void *to, const void __user *from, unsigned long count)
-{
-	return __copy_user((__force void __user *) to, from, count);
-}
-
-#define INLINE_COPY_FROM_USER
-#define INLINE_COPY_TO_USER
-
-extern unsigned long __do_clear_user (void __user *, unsigned long);
-
-#define __clear_user(to, n)		__do_clear_user(to, n)
-
-#define clear_user(to, n)					\
-({								\
-	unsigned long __cu_len = (n);				\
-	if (__access_ok(to, __cu_len))				\
-		__cu_len = __do_clear_user(to, __cu_len);	\
-	__cu_len;						\
-})
-
-
-/*
- * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
- * strlen.
- */
-extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
-
-#define strncpy_from_user(to, from, n)					\
-({									\
-	const char __user * __sfu_from = (from);			\
-	long __sfu_ret = -EFAULT;					\
-	if (__access_ok(__sfu_from, 0))					\
-		__sfu_ret = __strncpy_from_user((to), __sfu_from, (n));	\
-	__sfu_ret;							\
-})
-
-/*
- * Returns: 0 if exception before NUL or reaching the supplied limit
- * (N), a value greater than N if the limit would be exceeded, else
- * strlen.
- */
-extern unsigned long __strnlen_user (const char __user *, long);
-
-#define strnlen_user(str, len)					\
-({								\
-	const char __user *__su_str = (str);			\
-	unsigned long __su_ret = 0;				\
-	if (__access_ok(__su_str, 0))				\
-		__su_ret = __strnlen_user(__su_str, len);	\
-	__su_ret;						\
-})
-
-#define ARCH_HAS_TRANSLATE_MEM_PTR	1
-static __inline__ void *
-xlate_dev_mem_ptr(phys_addr_t p)
-{
-	struct page *page;
-	void *ptr;
-
-	page = pfn_to_page(p >> PAGE_SHIFT);
-	if (PageUncached(page))
-		ptr = (void *)p + __IA64_UNCACHED_OFFSET;
-	else
-		ptr = __va(p);
-
-	return ptr;
-}
-
-#endif /* _ASM_IA64_UACCESS_H */
diff --git a/arch/ia64/include/asm/uncached.h b/arch/ia64/include/asm/uncached.h
deleted file mode 100644
index 98f447fc77b7..000000000000
--- a/arch/ia64/include/asm/uncached.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2001-2008 Silicon Graphics, Inc.  All rights reserved.
- *
- * Prototypes for the uncached page allocator
- */
-
-extern unsigned long uncached_alloc_page(int starting_nid, int n_pages);
-extern void uncached_free_page(unsigned long uc_addr, int n_pages);
diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h
deleted file mode 100644
index 9ba6110b10b9..000000000000
--- a/arch/ia64/include/asm/unistd.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * IA-64 Linux syscall numbers and inline-functions.
- *
- * Copyright (C) 1998-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _ASM_IA64_UNISTD_H
-#define _ASM_IA64_UNISTD_H
-
-#include <uapi/asm/unistd.h>
-
-#define NR_syscalls		__NR_syscalls /* length of syscall table */
-
-#define __ARCH_WANT_NEW_STAT
-#define __ARCH_WANT_SYS_UTIME
-
-#if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
-
-#include <linux/types.h>
-#include <linux/linkage.h>
-#include <linux/compiler.h>
-
-extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr);
-
-asmlinkage unsigned long sys_mmap(
-				unsigned long addr, unsigned long len,
-				int prot, int flags,
-				int fd, long off);
-asmlinkage unsigned long sys_mmap2(
-				unsigned long addr, unsigned long len,
-				int prot, int flags,
-				int fd, long pgoff);
-struct pt_regs;
-asmlinkage long sys_ia64_pipe(void);
-
-#endif /* !__ASSEMBLY__ */
-#endif /* _ASM_IA64_UNISTD_H */
diff --git a/arch/ia64/include/asm/unwind.h b/arch/ia64/include/asm/unwind.h
deleted file mode 100644
index c5bd4b3e3a36..000000000000
--- a/arch/ia64/include/asm/unwind.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_UNWIND_H
-#define _ASM_IA64_UNWIND_H
-
-/*
- * Copyright (C) 1999-2000, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * A simple API for unwinding kernel stacks.  This is used for
- * debugging and error reporting purposes.  The kernel doesn't need
- * full-blown stack unwinding with all the bells and whitles, so there
- * is not much point in implementing the full IA-64 unwind API (though
- * it would of course be possible to implement the kernel API on top
- * of it).
- */
-
-struct task_struct;	/* forward declaration */
-struct switch_stack;	/* forward declaration */
-
-enum unw_application_register {
-	UNW_AR_BSP,
-	UNW_AR_BSPSTORE,
-	UNW_AR_PFS,
-	UNW_AR_RNAT,
-	UNW_AR_UNAT,
-	UNW_AR_LC,
-	UNW_AR_EC,
-	UNW_AR_FPSR,
-	UNW_AR_RSC,
-	UNW_AR_CCV,
-	UNW_AR_CSD,
-	UNW_AR_SSD
-};
-
-/*
- * The following declarations are private to the unwind
- * implementation:
- */
-
-struct unw_stack {
-	unsigned long limit;
-	unsigned long top;
-};
-
-#define UNW_FLAG_INTERRUPT_FRAME	(1UL << 0)
-
-/*
- * No user of this module should every access this structure directly
- * as it is subject to change.  It is declared here solely so we can
- * use automatic variables.
- */
-struct unw_frame_info {
-	struct unw_stack regstk;
-	struct unw_stack memstk;
-	unsigned int flags;
-	short hint;
-	short prev_script;
-
-	/* current frame info: */
-	unsigned long bsp;		/* backing store pointer value */
-	unsigned long sp;		/* stack pointer value */
-	unsigned long psp;		/* previous sp value */
-	unsigned long ip;		/* instruction pointer value */
-	unsigned long pr;		/* current predicate values */
-	unsigned long *cfm_loc;		/* cfm save location (or NULL) */
-	unsigned long pt;		/* struct pt_regs location */
-
-	struct task_struct *task;
-	struct switch_stack *sw;
-
-	/* preserved state: */
-	unsigned long *bsp_loc;		/* previous bsp save location */
-	unsigned long *bspstore_loc;
-	unsigned long *pfs_loc;
-	unsigned long *rnat_loc;
-	unsigned long *rp_loc;
-	unsigned long *pri_unat_loc;
-	unsigned long *unat_loc;
-	unsigned long *pr_loc;
-	unsigned long *lc_loc;
-	unsigned long *fpsr_loc;
-	struct unw_ireg {
-		unsigned long *loc;
-		struct unw_ireg_nat {
-			unsigned long type : 3;		/* enum unw_nat_type */
-			signed long off : 61;		/* NaT word is at loc+nat.off */
-		} nat;
-	} r4, r5, r6, r7;
-	unsigned long *b1_loc, *b2_loc, *b3_loc, *b4_loc, *b5_loc;
-	struct ia64_fpreg *f2_loc, *f3_loc, *f4_loc, *f5_loc, *fr_loc[16];
-};
-
-/*
- * The official API follows below:
- */
-
-struct unw_table_entry {
-	u64 start_offset;
-	u64 end_offset;
-	u64 info_offset;
-};
-
-/*
- * Initialize unwind support.
- */
-extern void unw_init (void);
-
-extern void *unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp,
-				   const void *table_start, const void *table_end);
-
-extern void unw_remove_unwind_table (void *handle);
-
-/*
- * Prepare to unwind blocked task t.
- */
-extern void unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t);
-
-extern void unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t,
-				 struct switch_stack *sw);
-
-/*
- * Prepare to unwind the currently running thread.
- */
-extern void unw_init_running (void (*callback)(struct unw_frame_info *info, void *arg), void *arg);
-
-/*
- * Unwind to previous to frame.  Returns 0 if successful, negative
- * number in case of an error.
- */
-extern int unw_unwind (struct unw_frame_info *info);
-
-/*
- * Unwind until the return pointer is in user-land (or until an error
- * occurs).  Returns 0 if successful, negative number in case of
- * error.
- */
-extern int unw_unwind_to_user (struct unw_frame_info *info);
-
-#define unw_is_intr_frame(info)	(((info)->flags & UNW_FLAG_INTERRUPT_FRAME) != 0)
-
-static inline int
-unw_get_ip (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->ip;
-	return 0;
-}
-
-static inline int
-unw_get_sp (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->sp;
-	return 0;
-}
-
-static inline int
-unw_get_psp (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->psp;
-	return 0;
-}
-
-static inline int
-unw_get_bsp (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = (info)->bsp;
-	return 0;
-}
-
-static inline int
-unw_get_cfm (struct unw_frame_info *info, unsigned long *valp)
-{
-	*valp = *(info)->cfm_loc;
-	return 0;
-}
-
-static inline int
-unw_set_cfm (struct unw_frame_info *info, unsigned long val)
-{
-	*(info)->cfm_loc = val;
-	return 0;
-}
-
-static inline int
-unw_get_rp (struct unw_frame_info *info, unsigned long *val)
-{
-	if (!info->rp_loc)
-		return -1;
-	*val = *info->rp_loc;
-	return 0;
-}
-
-extern int unw_access_gr (struct unw_frame_info *, int, unsigned long *, char *, int);
-extern int unw_access_br (struct unw_frame_info *, int, unsigned long *, int);
-extern int unw_access_fr (struct unw_frame_info *, int, struct ia64_fpreg *, int);
-extern int unw_access_ar (struct unw_frame_info *, int, unsigned long *, int);
-extern int unw_access_pr (struct unw_frame_info *, unsigned long *, int);
-
-static inline int
-unw_set_gr (struct unw_frame_info *i, int n, unsigned long v, char nat)
-{
-	return unw_access_gr(i, n, &v, &nat, 1);
-}
-
-static inline int
-unw_set_br (struct unw_frame_info *i, int n, unsigned long v)
-{
-	return unw_access_br(i, n, &v, 1);
-}
-
-static inline int
-unw_set_fr (struct unw_frame_info *i, int n, struct ia64_fpreg v)
-{
-	return unw_access_fr(i, n, &v, 1);
-}
-
-static inline int
-unw_set_ar (struct unw_frame_info *i, int n, unsigned long v)
-{
-	return unw_access_ar(i, n, &v, 1);
-}
-
-static inline int
-unw_set_pr (struct unw_frame_info *i, unsigned long v)
-{
-	return unw_access_pr(i, &v, 1);
-}
-
-#define unw_get_gr(i,n,v,nat)	unw_access_gr(i,n,v,nat,0)
-#define unw_get_br(i,n,v)	unw_access_br(i,n,v,0)
-#define unw_get_fr(i,n,v)	unw_access_fr(i,n,v,0)
-#define unw_get_ar(i,n,v)	unw_access_ar(i,n,v,0)
-#define unw_get_pr(i,v)		unw_access_pr(i,v,0)
-
-#endif /* _ASM_UNWIND_H */
diff --git a/arch/ia64/include/asm/user.h b/arch/ia64/include/asm/user.h
deleted file mode 100644
index ec03d3ab8715..000000000000
--- a/arch/ia64/include/asm/user.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_USER_H
-#define _ASM_IA64_USER_H
-
-/*
- * Core file format: The core file is written in such a way that gdb
- * can understand it and provide useful information to the user (under
- * linux we use the `trad-core' bfd).  The file contents are as
- * follows:
- *
- *  upage: 1 page consisting of a user struct that tells gdb
- *	what is present in the file.  Directly after this is a
- *	copy of the task_struct, which is currently not used by gdb,
- *	but it may come in handy at some point.  All of the registers
- *	are stored as part of the upage.  The upage should always be
- *	only one page long.
- *  data: The data segment follows next.  We use current->end_text to
- *	current->brk to pick up all of the user variables, plus any memory
- *	that may have been sbrk'ed.  No attempt is made to determine if a
- *	page is demand-zero or if a page is totally unused, we just cover
- *	the entire range.  All of the addresses are rounded in such a way
- *	that an integral number of pages is written.
- *  stack: We need the stack information in order to get a meaningful
- *	backtrace.  We need to write the data from usp to
- *	current->start_stack, so we round each of these in order to be able
- *	to write an integer number of pages.
- *
- * Modified 1998, 1999, 2001
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-#include <linux/ptrace.h>
-#include <linux/types.h>
-
-#include <asm/page.h>
-
-#define EF_SIZE		3072	/* XXX fix me */
-
-struct user {
-	unsigned long	regs[EF_SIZE/8+32];	/* integer and fp regs */
-	size_t		u_tsize;		/* text size (pages) */
-	size_t		u_dsize;		/* data size (pages) */
-	size_t		u_ssize;		/* stack size (pages) */
-	unsigned long	start_code;		/* text starting address */
-	unsigned long	start_data;		/* data starting address */
-	unsigned long	start_stack;		/* stack starting address */
-	long int	signal;			/* signal causing core dump */
-	unsigned long	u_ar0;			/* help gdb find registers */
-	unsigned long	magic;			/* identifies a core file */
-	char		u_comm[32];		/* user command name */
-};
-
-#endif /* _ASM_IA64_USER_H */
diff --git a/arch/ia64/include/asm/ustack.h b/arch/ia64/include/asm/ustack.h
deleted file mode 100644
index 112d40a0fec2..000000000000
--- a/arch/ia64/include/asm/ustack.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_USTACK_H
-#define _ASM_IA64_USTACK_H
-
-#include <asm/page.h>
-#include <uapi/asm/ustack.h>
-
-/* The absolute hard limit for stack size is 1/2 of the mappable space in the region */
-#define MAX_USER_STACK_SIZE	(RGN_MAP_LIMIT/2)
-#define STACK_TOP		(0x6000000000000000UL + RGN_MAP_LIMIT)
-#define STACK_TOP_MAX		STACK_TOP
-#endif /* _ASM_IA64_USTACK_H */
diff --git a/arch/ia64/include/asm/uv/uv.h b/arch/ia64/include/asm/uv/uv.h
deleted file mode 100644
index 48d4526bf4cd..000000000000
--- a/arch/ia64/include/asm/uv/uv.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_UV_UV_H
-#define _ASM_IA64_UV_UV_H
-
-#ifdef CONFIG_IA64_SGI_UV
-extern bool ia64_is_uv;
-
-static inline int is_uv_system(void)
-{
-	return ia64_is_uv;
-}
-
-void __init uv_probe_system_type(void);
-void __init uv_setup(char **cmdline_p);
-#else /* CONFIG_IA64_SGI_UV */
-static inline int is_uv_system(void)
-{
-	return false;
-}
-
-static inline void __init uv_probe_system_type(void)
-{
-}
-
-static inline void __init uv_setup(char **cmdline_p)
-{
-}
-#endif /* CONFIG_IA64_SGI_UV */
-
-#endif	/* _ASM_IA64_UV_UV_H */
diff --git a/arch/ia64/include/asm/uv/uv_hub.h b/arch/ia64/include/asm/uv/uv_hub.h
deleted file mode 100644
index 809ddb6896db..000000000000
--- a/arch/ia64/include/asm/uv/uv_hub.h
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV architectural definitions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef __ASM_IA64_UV_HUB_H__
-#define __ASM_IA64_UV_HUB_H__
-
-#include <linux/numa.h>
-#include <linux/percpu.h>
-#include <asm/types.h>
-#include <asm/percpu.h>
-
-
-/*
- * Addressing Terminology
- *
- *	M       - The low M bits of a physical address represent the offset
- *		  into the blade local memory. RAM memory on a blade is physically
- *		  contiguous (although various IO spaces may punch holes in
- *		  it)..
- *
- * 	N	- Number of bits in the node portion of a socket physical
- * 		  address.
- *
- * 	NASID   - network ID of a router, Mbrick or Cbrick. Nasid values of
- * 	 	  routers always have low bit of 1, C/MBricks have low bit
- * 		  equal to 0. Most addressing macros that target UV hub chips
- * 		  right shift the NASID by 1 to exclude the always-zero bit.
- * 		  NASIDs contain up to 15 bits.
- *
- *	GNODE   - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
- *		  of nasids.
- *
- * 	PNODE   - the low N bits of the GNODE. The PNODE is the most useful variant
- * 		  of the nasid for socket usage.
- *
- *
- *  NumaLink Global Physical Address Format:
- *  +--------------------------------+---------------------+
- *  |00..000|      GNODE             |      NodeOffset     |
- *  +--------------------------------+---------------------+
- *          |<-------53 - M bits --->|<--------M bits ----->
- *
- *	M - number of node offset bits (35 .. 40)
- *
- *
- *  Memory/UV-HUB Processor Socket Address Format:
- *  +----------------+---------------+---------------------+
- *  |00..000000000000|   PNODE       |      NodeOffset     |
- *  +----------------+---------------+---------------------+
- *                   <--- N bits --->|<--------M bits ----->
- *
- *	M - number of node offset bits (35 .. 40)
- *	N - number of PNODE bits (0 .. 10)
- *
- *		Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
- *		The actual values are configuration dependent and are set at
- *		boot time. M & N values are set by the hardware/BIOS at boot.
- */
-
-
-/*
- * Maximum number of bricks in all partitions and in all coherency domains.
- * This is the total number of bricks accessible in the numalink fabric. It
- * includes all C & M bricks. Routers are NOT included.
- *
- * This value is also the value of the maximum number of non-router NASIDs
- * in the numalink fabric.
- *
- * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
- */
-#define UV_MAX_NUMALINK_BLADES	16384
-
-/*
- * Maximum number of C/Mbricks within a software SSI (hardware may support
- * more).
- */
-#define UV_MAX_SSI_BLADES	1
-
-/*
- * The largest possible NASID of a C or M brick (+ 2)
- */
-#define UV_MAX_NASID_VALUE	(UV_MAX_NUMALINK_NODES * 2)
-
-/*
- * The following defines attributes of the HUB chip. These attributes are
- * frequently referenced and are kept in the per-cpu data areas of each cpu.
- * They are kept together in a struct to minimize cache misses.
- */
-struct uv_hub_info_s {
-	unsigned long	global_mmr_base;
-	unsigned long	gpa_mask;
-	unsigned long	gnode_upper;
-	unsigned long	lowmem_remap_top;
-	unsigned long	lowmem_remap_base;
-	unsigned short	pnode;
-	unsigned short	pnode_mask;
-	unsigned short	coherency_domain_number;
-	unsigned short	numa_blade_id;
-	unsigned char	blade_processor_id;
-	unsigned char	m_val;
-	unsigned char	n_val;
-};
-DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-#define uv_hub_info 		this_cpu_ptr(&__uv_hub_info)
-#define uv_cpu_hub_info(cpu)	(&per_cpu(__uv_hub_info, cpu))
-
-/*
- * Local & Global MMR space macros.
- * 	Note: macros are intended to be used ONLY by inline functions
- * 	in this file - not by other kernel code.
- * 		n -  NASID (full 15-bit global nasid)
- * 		g -  GNODE (full 15-bit global nasid, right shifted 1)
- * 		p -  PNODE (local part of nsids, right shifted 1)
- */
-#define UV_NASID_TO_PNODE(n)		(((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p)		(((p) << 1) | uv_hub_info->gnode_upper)
-
-#define UV_LOCAL_MMR_BASE		0xf4000000UL
-#define UV_GLOBAL_MMR32_BASE		0xf8000000UL
-#define UV_GLOBAL_MMR64_BASE		(uv_hub_info->global_mmr_base)
-
-#define UV_GLOBAL_MMR32_PNODE_SHIFT	15
-#define UV_GLOBAL_MMR64_PNODE_SHIFT	26
-
-#define UV_GLOBAL_MMR32_PNODE_BITS(p)	((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
-
-#define UV_GLOBAL_MMR64_PNODE_BITS(p)					\
-	((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
-
-/*
- * Macros for converting between kernel virtual addresses, socket local physical
- * addresses, and UV global physical addresses.
- * 	Note: use the standard __pa() & __va() macros for converting
- * 	      between socket virtual and socket physical addresses.
- */
-
-/* socket phys RAM --> UV global physical address */
-static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
-{
-	if (paddr < uv_hub_info->lowmem_remap_top)
-		paddr += uv_hub_info->lowmem_remap_base;
-	return paddr | uv_hub_info->gnode_upper;
-}
-
-
-/* socket virtual --> UV global physical address */
-static inline unsigned long uv_gpa(void *v)
-{
-	return __pa(v) | uv_hub_info->gnode_upper;
-}
-
-/* socket virtual --> UV global physical address */
-static inline void *uv_vgpa(void *v)
-{
-	return (void *)uv_gpa(v);
-}
-
-/* UV global physical address --> socket virtual */
-static inline void *uv_va(unsigned long gpa)
-{
-	return __va(gpa & uv_hub_info->gpa_mask);
-}
-
-/* pnode, offset --> socket virtual */
-static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
-{
-	return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
-}
-
-
-/*
- * Access global MMRs using the low memory MMR32 space. This region supports
- * faster MMR access but not all MMRs are accessible in this space.
- */
-static inline unsigned long *uv_global_mmr32_address(int pnode,
-				unsigned long offset)
-{
-	return __va(UV_GLOBAL_MMR32_BASE |
-		       UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
-}
-
-static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
-				 unsigned long val)
-{
-	*uv_global_mmr32_address(pnode, offset) = val;
-}
-
-static inline unsigned long uv_read_global_mmr32(int pnode,
-						 unsigned long offset)
-{
-	return *uv_global_mmr32_address(pnode, offset);
-}
-
-/*
- * Access Global MMR space using the MMR space located at the top of physical
- * memory.
- */
-static inline unsigned long *uv_global_mmr64_address(int pnode,
-				unsigned long offset)
-{
-	return __va(UV_GLOBAL_MMR64_BASE |
-		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
-}
-
-static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
-				unsigned long val)
-{
-	*uv_global_mmr64_address(pnode, offset) = val;
-}
-
-static inline unsigned long uv_read_global_mmr64(int pnode,
-						 unsigned long offset)
-{
-	return *uv_global_mmr64_address(pnode, offset);
-}
-
-/*
- * Access hub local MMRs. Faster than using global space but only local MMRs
- * are accessible.
- */
-static inline unsigned long *uv_local_mmr_address(unsigned long offset)
-{
-	return __va(UV_LOCAL_MMR_BASE | offset);
-}
-
-static inline unsigned long uv_read_local_mmr(unsigned long offset)
-{
-	return *uv_local_mmr_address(offset);
-}
-
-static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
-{
-	*uv_local_mmr_address(offset) = val;
-}
-
-/*
- * Structures and definitions for converting between cpu, node, pnode, and blade
- * numbers.
- */
-
-/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
-static inline int uv_blade_processor_id(void)
-{
-	return smp_processor_id();
-}
-
-/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
-static inline int uv_numa_blade_id(void)
-{
-	return 0;
-}
-
-/* Convert a cpu number to the UV blade number */
-static inline int uv_cpu_to_blade_id(int cpu)
-{
-	return 0;
-}
-
-/* Convert linux node number to the UV blade number */
-static inline int uv_node_to_blade_id(int nid)
-{
-	return 0;
-}
-
-/* Convert a blade id to the PNODE of the blade */
-static inline int uv_blade_to_pnode(int bid)
-{
-	return 0;
-}
-
-/* Determine the number of possible cpus on a blade */
-static inline int uv_blade_nr_possible_cpus(int bid)
-{
-	return num_possible_cpus();
-}
-
-/* Determine the number of online cpus on a blade */
-static inline int uv_blade_nr_online_cpus(int bid)
-{
-	return num_online_cpus();
-}
-
-/* Convert a cpu id to the PNODE of the blade containing the cpu */
-static inline int uv_cpu_to_pnode(int cpu)
-{
-	return 0;
-}
-
-/* Convert a linux node number to the PNODE of the blade */
-static inline int uv_node_to_pnode(int nid)
-{
-	return 0;
-}
-
-/* Maximum possible number of blades */
-static inline int uv_num_possible_blades(void)
-{
-	return 1;
-}
-
-static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
-{
-	/* not currently needed on ia64 */
-}
-
-
-#endif /* __ASM_IA64_UV_HUB__ */
-
diff --git a/arch/ia64/include/asm/uv/uv_mmrs.h b/arch/ia64/include/asm/uv/uv_mmrs.h
deleted file mode 100644
index fe0b8f05e1a8..000000000000
--- a/arch/ia64/include/asm/uv/uv_mmrs.h
+++ /dev/null
@@ -1,825 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV MMR definitions
- *
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#ifndef _ASM_IA64_UV_UV_MMRS_H
-#define _ASM_IA64_UV_UV_MMRS_H
-
-#define UV_MMR_ENABLE		(1UL << 63)
-
-/* ========================================================================= */
-/*                           UVH_BAU_DATA_CONFIG                             */
-/* ========================================================================= */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x0438
-
-#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
-#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
-#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
-#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
-#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_BAU_DATA_CONFIG_P_SHFT 13
-#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_BAU_DATA_CONFIG_T_SHFT 15
-#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_BAU_DATA_CONFIG_M_SHFT 16
-#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
-#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_bau_data_config_u {
-    unsigned long	v;
-    struct uvh_bau_data_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_EVENT_OCCURRED0                             */
-/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x005e8
-
-#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
-union uvh_event_occurred0_u {
-    unsigned long	v;
-    struct uvh_event_occurred0_s {
-	unsigned long	lb_hcerr             :  1;  /* RW, W1C */
-	unsigned long	gr0_hcerr            :  1;  /* RW, W1C */
-	unsigned long	gr1_hcerr            :  1;  /* RW, W1C */
-	unsigned long	lh_hcerr             :  1;  /* RW, W1C */
-	unsigned long	rh_hcerr             :  1;  /* RW, W1C */
-	unsigned long	xn_hcerr             :  1;  /* RW, W1C */
-	unsigned long	si_hcerr             :  1;  /* RW, W1C */
-	unsigned long	lb_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	gr0_aoerr0           :  1;  /* RW, W1C */
-	unsigned long	gr1_aoerr0           :  1;  /* RW, W1C */
-	unsigned long	lh_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	rh_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	xn_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	si_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	lb_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	gr0_aoerr1           :  1;  /* RW, W1C */
-	unsigned long	gr1_aoerr1           :  1;  /* RW, W1C */
-	unsigned long	lh_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	rh_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	xn_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	si_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	rh_vpi_int           :  1;  /* RW, W1C */
-	unsigned long	system_shutdown_int  :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_0         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_1         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_2         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_3         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_4         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_5         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_6         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_7         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_8         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_9         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_10        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_11        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_12        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_13        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_14        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_15        :  1;  /* RW, W1C */
-	unsigned long	l1_nmi_int           :  1;  /* RW, W1C */
-	unsigned long	stop_clock           :  1;  /* RW, W1C */
-	unsigned long	asic_to_l1           :  1;  /* RW, W1C */
-	unsigned long	l1_to_asic           :  1;  /* RW, W1C */
-	unsigned long	ltc_int              :  1;  /* RW, W1C */
-	unsigned long	la_seq_trigger       :  1;  /* RW, W1C */
-	unsigned long	ipi_int              :  1;  /* RW, W1C */
-	unsigned long	extio_int0           :  1;  /* RW, W1C */
-	unsigned long	extio_int1           :  1;  /* RW, W1C */
-	unsigned long	extio_int2           :  1;  /* RW, W1C */
-	unsigned long	extio_int3           :  1;  /* RW, W1C */
-	unsigned long	profile_int          :  1;  /* RW, W1C */
-	unsigned long	rtc0                 :  1;  /* RW, W1C */
-	unsigned long	rtc1                 :  1;  /* RW, W1C */
-	unsigned long	rtc2                 :  1;  /* RW, W1C */
-	unsigned long	rtc3                 :  1;  /* RW, W1C */
-	unsigned long	bau_data             :  1;  /* RW, W1C */
-	unsigned long	power_management_req :  1;  /* RW, W1C */
-	unsigned long	rsvd_57_63           :  7;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                        UVH_EVENT_OCCURRED0_ALIAS                          */
-/* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
-
-/* ========================================================================= */
-/*                         UVH_GR0_TLB_INT0_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
-
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr0_tlb_int0_config_u {
-    unsigned long	v;
-    struct uvh_gr0_tlb_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                         UVH_GR0_TLB_INT1_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
-
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr0_tlb_int1_config_u {
-    unsigned long	v;
-    struct uvh_gr0_tlb_int1_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                         UVH_GR1_TLB_INT0_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
-
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr1_tlb_int0_config_u {
-    unsigned long	v;
-    struct uvh_gr1_tlb_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                         UVH_GR1_TLB_INT1_CONFIG                           */
-/* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
-
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_gr1_tlb_int1_config_u {
-    unsigned long	v;
-    struct uvh_gr1_tlb_int1_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_INT_CMPB                                */
-/* ========================================================================= */
-#define UVH_INT_CMPB 0x22080UL
-
-#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
-#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
-
-union uvh_int_cmpb_u {
-    unsigned long	v;
-    struct uvh_int_cmpb_s {
-	unsigned long	real_time_cmpb : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_INT_CMPC                                */
-/* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
-
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
-
-union uvh_int_cmpc_u {
-    unsigned long	v;
-    struct uvh_int_cmpc_s {
-	unsigned long	real_time_cmpc : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_INT_CMPD                                */
-/* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
-
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
-
-union uvh_int_cmpd_u {
-    unsigned long	v;
-    struct uvh_int_cmpd_s {
-	unsigned long	real_time_cmpd : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                               UVH_NODE_ID                                 */
-/* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
-
-#define UVH_NODE_ID_FORCE1_SHFT 0
-#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UVH_NODE_ID_MANUFACTURER_SHFT 1
-#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UVH_NODE_ID_PART_NUMBER_SHFT 12
-#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UVH_NODE_ID_REVISION_SHFT 28
-#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UVH_NODE_ID_NODE_ID_SHFT 32
-#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UVH_NODE_ID_NI_PORT_SHFT 56
-#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
-
-union uvh_node_id_u {
-    unsigned long	v;
-    struct uvh_node_id_s {
-	unsigned long	force1        :  1;  /* RO */
-	unsigned long	manufacturer  : 11;  /* RO */
-	unsigned long	part_number   : 16;  /* RO */
-	unsigned long	revision      :  4;  /* RO */
-	unsigned long	node_id       : 15;  /* RW */
-	unsigned long	rsvd_47       :  1;  /*    */
-	unsigned long	nodes_per_bit :  7;  /* RW */
-	unsigned long	rsvd_55       :  1;  /*    */
-	unsigned long	ni_port       :  4;  /* RO */
-	unsigned long	rsvd_60_63    :  4;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR                  */
-/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
-
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
-
-union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR                  */
-/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
-
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
-
-union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR                  */
-/* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
-
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
-
-union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
-
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_gru_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_gru_overlay_config_mmr_s {
-	unsigned long	rsvd_0_27: 28;  /*    */
-	unsigned long	base   : 18;  /* RW */
-	unsigned long	rsvd_46_47:  2;  /*    */
-	unsigned long	gr4    :  1;  /* RW */
-	unsigned long	rsvd_49_51:  3;  /*    */
-	unsigned long	n_gru  :  4;  /* RW */
-	unsigned long	rsvd_56_62:  7;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                    UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
-
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_mmr_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_mmr_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base     : 20;  /* RW */
-	unsigned long	dual_hub :  1;  /* RW */
-	unsigned long	rsvd_47_62: 16;  /*    */
-	unsigned long	enable   :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                                 UVH_RTC                                   */
-/* ========================================================================= */
-#define UVH_RTC 0x340000UL
-
-#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
-#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
-
-union uvh_rtc_u {
-    unsigned long	v;
-    struct uvh_rtc_s {
-	unsigned long	real_time_clock : 56;  /* RW */
-	unsigned long	rsvd_56_63      :  8;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC1_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
-
-#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC1_INT_CONFIG_P_SHFT 13
-#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC1_INT_CONFIG_T_SHFT 15
-#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC1_INT_CONFIG_M_SHFT 16
-#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc1_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc1_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC2_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC2_INT_CONFIG 0x61600UL
-
-#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC2_INT_CONFIG_P_SHFT 13
-#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC2_INT_CONFIG_T_SHFT 15
-#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC2_INT_CONFIG_M_SHFT 16
-#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc2_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc2_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC3_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC3_INT_CONFIG 0x61640UL
-
-#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC3_INT_CONFIG_P_SHFT 13
-#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC3_INT_CONFIG_T_SHFT 15
-#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC3_INT_CONFIG_M_SHFT 16
-#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc3_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc3_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                            UVH_RTC_INC_RATIO                              */
-/* ========================================================================= */
-#define UVH_RTC_INC_RATIO 0x350000UL
-
-#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
-#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
-#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
-#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
-
-union uvh_rtc_inc_ratio_u {
-    unsigned long	v;
-    struct uvh_rtc_inc_ratio_s {
-	unsigned long	fraction : 20;  /* RW */
-	unsigned long	ratio    :  3;  /* RW */
-	unsigned long	rsvd_23_63: 41;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_SI_ADDR_MAP_CONFIG                           */
-/* ========================================================================= */
-#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
-
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
-
-union uvh_si_addr_map_config_u {
-    unsigned long	v;
-    struct uvh_si_addr_map_config_s {
-	unsigned long	m_skt :  6;  /* RW */
-	unsigned long	rsvd_6_7:  2;  /*    */
-	unsigned long	n_skt :  4;  /* RW */
-	unsigned long	rsvd_12_63: 52;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS0_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
-
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias0_overlay_config_u {
-    unsigned long	v;
-    struct uvh_si_alias0_overlay_config_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS1_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
-
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias1_overlay_config_u {
-    unsigned long	v;
-    struct uvh_si_alias1_overlay_config_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS2_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
-
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias2_overlay_config_u {
-    unsigned long	v;
-    struct uvh_si_alias2_overlay_config_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
-};
-
-
-#endif /* _ASM_IA64_UV_UV_MMRS_H */
diff --git a/arch/ia64/include/asm/vermagic.h b/arch/ia64/include/asm/vermagic.h
deleted file mode 100644
index 29c7424f4c25..000000000000
--- a/arch/ia64/include/asm/vermagic.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#ifndef _ASM_VERMAGIC_H
-#define _ASM_VERMAGIC_H
-
-#include <linux/stringify.h>
-
-#define MODULE_ARCH_VERMAGIC	"ia64" \
-	"gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__)
-
-#endif /* _ASM_VERMAGIC_H */
diff --git a/arch/ia64/include/asm/vga.h b/arch/ia64/include/asm/vga.h
deleted file mode 100644
index 64ce0b971a0a..000000000000
--- a/arch/ia64/include/asm/vga.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *	Access to VGA videoram
- *
- *	(c) 1998 Martin Mares <mj@ucw.cz>
- *	(c) 1999 Asit Mallick <asit.k.mallick@intel.com>
- *	(c) 1999 Don Dugger <don.dugger@intel.com>
- */
-
-#ifndef __ASM_IA64_VGA_H_
-#define __ASM_IA64_VGA_H_
-
-/*
- * On the PC, we can just recalculate addresses and then access the
- * videoram directly without any black magic.
- */
-
-extern unsigned long vga_console_iobase;
-extern unsigned long vga_console_membase;
-
-#define VGA_MAP_MEM(x,s)	((unsigned long) ioremap(vga_console_membase + (x), s))
-
-#define vga_readb(x)	(*(x))
-#define vga_writeb(x,y)	(*(y) = (x))
-
-#endif /* __ASM_IA64_VGA_H_ */
diff --git a/arch/ia64/include/asm/vmalloc.h b/arch/ia64/include/asm/vmalloc.h
deleted file mode 100644
index a2b51141ad28..000000000000
--- a/arch/ia64/include/asm/vmalloc.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _ASM_IA64_VMALLOC_H
-#define _ASM_IA64_VMALLOC_H
-
-#endif /* _ASM_IA64_VMALLOC_H */
diff --git a/arch/ia64/include/asm/xor.h b/arch/ia64/include/asm/xor.h
deleted file mode 100644
index 6785f70d3208..000000000000
--- a/arch/ia64/include/asm/xor.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-
-extern void xor_ia64_2(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2);
-extern void xor_ia64_3(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2,
-		       const unsigned long * __restrict p3);
-extern void xor_ia64_4(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2,
-		       const unsigned long * __restrict p3,
-		       const unsigned long * __restrict p4);
-extern void xor_ia64_5(unsigned long bytes, unsigned long * __restrict p1,
-		       const unsigned long * __restrict p2,
-		       const unsigned long * __restrict p3,
-		       const unsigned long * __restrict p4,
-		       const unsigned long * __restrict p5);
-
-static struct xor_block_template xor_block_ia64 = {
-	.name =	"ia64",
-	.do_2 =	xor_ia64_2,
-	.do_3 =	xor_ia64_3,
-	.do_4 =	xor_ia64_4,
-	.do_5 =	xor_ia64_5,
-};
-
-#define XOR_TRY_TEMPLATES	xor_speed(&xor_block_ia64)
diff --git a/arch/ia64/include/asm/xtp.h b/arch/ia64/include/asm/xtp.h
deleted file mode 100644
index 5bf1d70ad860..000000000000
--- a/arch/ia64/include/asm/xtp.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_XTP_H
-#define _ASM_IA64_XTP_H
-
-#include <asm/io.h>
-
-#ifdef CONFIG_SMP
-
-#define XTP_OFFSET		0x1e0008
-
-#define SMP_IRQ_REDIRECTION	(1 << 0)
-#define SMP_IPI_REDIRECTION	(1 << 1)
-
-extern unsigned char smp_int_redirect;
-
-/*
- * XTP control functions:
- *	min_xtp   : route all interrupts to this CPU
- *	normal_xtp: nominal XTP value
- *	max_xtp   : never deliver interrupts to this CPU.
- */
-
-static inline void
-min_xtp (void)
-{
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		writeb(0x00, ipi_base_addr + XTP_OFFSET); /* XTP to min */
-}
-
-static inline void
-normal_xtp (void)
-{
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		writeb(0x08, ipi_base_addr + XTP_OFFSET); /* XTP normal */
-}
-
-static inline void
-max_xtp (void)
-{
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		writeb(0x0f, ipi_base_addr + XTP_OFFSET); /* Set XTP to max */
-}
-
-#endif /* CONFIG_SMP */
-
-#endif /* _ASM_IA64_XTP_Hy */
diff --git a/arch/ia64/include/uapi/asm/Kbuild b/arch/ia64/include/uapi/asm/Kbuild
deleted file mode 100644
index 3a1341e3535a..000000000000
--- a/arch/ia64/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generated-y += unistd_64.h
diff --git a/arch/ia64/include/uapi/asm/auxvec.h b/arch/ia64/include/uapi/asm/auxvec.h
deleted file mode 100644
index 09969a5d2e0a..000000000000
--- a/arch/ia64/include/uapi/asm/auxvec.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_AUXVEC_H
-#define _ASM_IA64_AUXVEC_H
-
-/*
- * Architecture-neutral AT_ values are in the range 0-17.  Leave some room for more of
- * them, start the architecture-specific ones at 32.
- */
-#define AT_SYSINFO	32
-#define AT_SYSINFO_EHDR	33
-
-#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */
-
-#endif /* _ASM_IA64_AUXVEC_H */
diff --git a/arch/ia64/include/uapi/asm/bitsperlong.h b/arch/ia64/include/uapi/asm/bitsperlong.h
deleted file mode 100644
index 1146d55563db..000000000000
--- a/arch/ia64/include/uapi/asm/bitsperlong.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __ASM_IA64_BITSPERLONG_H
-#define __ASM_IA64_BITSPERLONG_H
-
-#define __BITS_PER_LONG 64
-
-#include <asm-generic/bitsperlong.h>
-
-#endif /* __ASM_IA64_BITSPERLONG_H */
diff --git a/arch/ia64/include/uapi/asm/break.h b/arch/ia64/include/uapi/asm/break.h
deleted file mode 100644
index 4ca110f0a94b..000000000000
--- a/arch/ia64/include/uapi/asm/break.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_BREAK_H
-#define _ASM_IA64_BREAK_H
-
-/*
- * IA-64 Linux break numbers.
- *
- * Copyright (C) 1999 Hewlett-Packard Co
- * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-/*
- * OS-specific debug break numbers:
- */
-#define __IA64_BREAK_KDB		0x80100
-#define __IA64_BREAK_KPROBE		0x81000 /* .. 0x81fff */
-
-/*
- * OS-specific break numbers:
- */
-#define __IA64_BREAK_SYSCALL		0x100000
-
-#endif /* _ASM_IA64_BREAK_H */
diff --git a/arch/ia64/include/uapi/asm/byteorder.h b/arch/ia64/include/uapi/asm/byteorder.h
deleted file mode 100644
index f85d0faaaf34..000000000000
--- a/arch/ia64/include/uapi/asm/byteorder.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_BYTEORDER_H
-#define _ASM_IA64_BYTEORDER_H
-
-#include <linux/byteorder/little_endian.h>
-
-#endif /* _ASM_IA64_BYTEORDER_H */
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
deleted file mode 100644
index a59b5de6eec6..000000000000
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_IA64_CMPXCHG_H
-#define _UAPI_ASM_IA64_CMPXCHG_H
-
-/*
- * Compare/Exchange, forked from asm/intrinsics.h
- * which was:
- *
- *	Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-/* include compiler specific intrinsics */
-#include <asm/ia64regs.h>
-#include <asm/gcc_intrin.h>
-
-/*
- * This function doesn't exist, so you'll get a linker error if
- * something tries to do an invalid xchg().
- */
-extern void ia64_xchg_called_with_bad_pointer(void);
-
-#define __arch_xchg(x, ptr, size)					\
-({									\
-	unsigned long __xchg_result;					\
-									\
-	switch (size) {							\
-	case 1:								\
-		__xchg_result = ia64_xchg1((__u8 __force *)ptr, x);	\
-		break;							\
-									\
-	case 2:								\
-		__xchg_result = ia64_xchg2((__u16 __force *)ptr, x);	\
-		break;							\
-									\
-	case 4:								\
-		__xchg_result = ia64_xchg4((__u32 __force *)ptr, x);	\
-		break;							\
-									\
-	case 8:								\
-		__xchg_result = ia64_xchg8((__u64 __force *)ptr, x);	\
-		break;							\
-	default:							\
-		ia64_xchg_called_with_bad_pointer();			\
-	}								\
-	(__typeof__ (*(ptr)) __force) __xchg_result;			\
-})
-
-#ifndef __KERNEL__
-#define xchg(ptr, x)							\
-({(__typeof__(*(ptr))) __arch_xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
-#endif
-
-/*
- * Atomic compare and exchange.  Compare OLD with MEM, if identical,
- * store NEW in MEM.  Return the initial value in MEM.  Success is
- * indicated by comparing RETURN with OLD.
- */
-
-/*
- * This function doesn't exist, so you'll get a linker error
- * if something tries to do an invalid cmpxchg().
- */
-extern long ia64_cmpxchg_called_with_bad_pointer(void);
-
-#define ia64_cmpxchg(sem, ptr, old, new, size)				\
-({									\
-	__u64 _o_, _r_;							\
-									\
-	switch (size) {							\
-	case 1:								\
-		_o_ = (__u8) (long __force) (old);			\
-		break;							\
-	case 2:								\
-		_o_ = (__u16) (long __force) (old);			\
-		break;							\
-	case 4:								\
-		_o_ = (__u32) (long __force) (old);			\
-		break;							\
-	case 8:								\
-		_o_ = (__u64) (long __force) (old);			\
-		break;							\
-	default:							\
-		break;							\
-	}								\
-	switch (size) {							\
-	case 1:								\
-		_r_ = ia64_cmpxchg1_##sem((__u8 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	case 2:								\
-		_r_ = ia64_cmpxchg2_##sem((__u16 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	case 4:								\
-		_r_ = ia64_cmpxchg4_##sem((__u32 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	case 8:								\
-		_r_ = ia64_cmpxchg8_##sem((__u64 __force *) ptr, new, _o_);	\
-		break;							\
-									\
-	default:							\
-		_r_ = ia64_cmpxchg_called_with_bad_pointer();		\
-		break;							\
-	}								\
-	(__typeof__(old) __force) _r_;					\
-})
-
-#define cmpxchg_acq(ptr, o, n)	\
-	ia64_cmpxchg(acq, (ptr), (o), (n), sizeof(*(ptr)))
-#define cmpxchg_rel(ptr, o, n)	\
-	ia64_cmpxchg(rel, (ptr), (o), (n), sizeof(*(ptr)))
-
-/*
- * Worse still - early processor implementations actually just ignored
- * the acquire/release and did a full fence all the time.  Unfortunately
- * this meant a lot of badly written code that used .acq when they really
- * wanted .rel became legacy out in the wild - so when we made a cpu
- * that strictly did the .acq or .rel ... all that code started breaking - so
- * we had to back-pedal and keep the "legacy" behavior of a full fence :-(
- */
-
-#ifndef __KERNEL__
-/* for compatibility with other platforms: */
-#define cmpxchg(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))
-#define cmpxchg64(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))
-
-#define cmpxchg_local		cmpxchg
-#define cmpxchg64_local		cmpxchg64
-#endif
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _UAPI_ASM_IA64_CMPXCHG_H */
diff --git a/arch/ia64/include/uapi/asm/fcntl.h b/arch/ia64/include/uapi/asm/fcntl.h
deleted file mode 100644
index 7b95523efe5a..000000000000
--- a/arch/ia64/include/uapi/asm/fcntl.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_FCNTL_H
-#define _ASM_IA64_FCNTL_H
-/*
- * Modified 1998-2000
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co.
- */
-
-#define force_o_largefile()	\
-		(personality(current->personality) != PER_LINUX32)
-
-#include <linux/personality.h>
-#include <asm-generic/fcntl.h>
-
-#endif /* _ASM_IA64_FCNTL_H */
diff --git a/arch/ia64/include/uapi/asm/fpu.h b/arch/ia64/include/uapi/asm/fpu.h
deleted file mode 100644
index 0df392982ce8..000000000000
--- a/arch/ia64/include/uapi/asm/fpu.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_FPU_H
-#define _ASM_IA64_FPU_H
-
-/*
- * Copyright (C) 1998, 1999, 2002, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/types.h>
-
-/* floating point status register: */
-#define FPSR_TRAP_VD	(1 << 0)	/* invalid op trap disabled */
-#define FPSR_TRAP_DD	(1 << 1)	/* denormal trap disabled */
-#define FPSR_TRAP_ZD	(1 << 2)	/* zero-divide trap disabled */
-#define FPSR_TRAP_OD	(1 << 3)	/* overflow trap disabled */
-#define FPSR_TRAP_UD	(1 << 4)	/* underflow trap disabled */
-#define FPSR_TRAP_ID	(1 << 5)	/* inexact trap disabled */
-#define FPSR_S0(x)	((x) <<  6)
-#define FPSR_S1(x)	((x) << 19)
-#define FPSR_S2(x)	(__IA64_UL(x) << 32)
-#define FPSR_S3(x)	(__IA64_UL(x) << 45)
-
-/* floating-point status field controls: */
-#define FPSF_FTZ	(1 << 0)		/* flush-to-zero */
-#define FPSF_WRE	(1 << 1)		/* widest-range exponent */
-#define FPSF_PC(x)	(((x) & 0x3) << 2)	/* precision control */
-#define FPSF_RC(x)	(((x) & 0x3) << 4)	/* rounding control */
-#define FPSF_TD		(1 << 6)		/* trap disabled */
-
-/* floating-point status field flags: */
-#define FPSF_V		(1 <<  7)		/* invalid operation flag */
-#define FPSF_D		(1 <<  8)		/* denormal/unnormal operand flag */
-#define FPSF_Z		(1 <<  9)		/* zero divide (IEEE) flag */
-#define FPSF_O		(1 << 10)		/* overflow (IEEE) flag */
-#define FPSF_U		(1 << 11)		/* underflow (IEEE) flag */
-#define FPSF_I		(1 << 12)		/* inexact (IEEE) flag) */
-
-/* floating-point rounding control: */
-#define FPRC_NEAREST	0x0
-#define FPRC_NEGINF	0x1
-#define FPRC_POSINF	0x2
-#define FPRC_TRUNC	0x3
-
-#define FPSF_DEFAULT	(FPSF_PC (0x3) | FPSF_RC (FPRC_NEAREST))
-
-/* This default value is the same as HP-UX uses.  Don't change it
-   without a very good reason.  */
-#define FPSR_DEFAULT	(FPSR_TRAP_VD | FPSR_TRAP_DD | FPSR_TRAP_ZD	\
-			 | FPSR_TRAP_OD | FPSR_TRAP_UD | FPSR_TRAP_ID	\
-			 | FPSR_S0 (FPSF_DEFAULT)			\
-			 | FPSR_S1 (FPSF_DEFAULT | FPSF_TD | FPSF_WRE)	\
-			 | FPSR_S2 (FPSF_DEFAULT | FPSF_TD)		\
-			 | FPSR_S3 (FPSF_DEFAULT | FPSF_TD))
-
-# ifndef __ASSEMBLY__
-
-struct ia64_fpreg {
-	union {
-		unsigned long bits[2];
-		long double __dummy;	/* force 16-byte alignment */
-	} u;
-};
-
-# endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_IA64_FPU_H */
diff --git a/arch/ia64/include/uapi/asm/gcc_intrin.h b/arch/ia64/include/uapi/asm/gcc_intrin.h
deleted file mode 100644
index ecfa3eadb217..000000000000
--- a/arch/ia64/include/uapi/asm/gcc_intrin.h
+++ /dev/null
@@ -1,619 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *
- * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com>
- * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
- */
-#ifndef _UAPI_ASM_IA64_GCC_INTRIN_H
-#define _UAPI_ASM_IA64_GCC_INTRIN_H
-
-#include <linux/types.h>
-#include <linux/compiler.h>
-
-/* define this macro to get some asm stmts included in 'c' files */
-#define ASM_SUPPORTED
-
-/* Optimization barrier */
-/* The "volatile" is due to gcc bugs */
-#define ia64_barrier()	asm volatile ("":::"memory")
-
-#define ia64_stop()	asm volatile (";;"::)
-
-#define ia64_invala_gr(regnum)	asm volatile ("invala.e r%0" :: "i"(regnum))
-
-#define ia64_invala_fr(regnum)	asm volatile ("invala.e f%0" :: "i"(regnum))
-
-#define ia64_flushrs() asm volatile ("flushrs;;":::"memory")
-
-#define ia64_loadrs() asm volatile ("loadrs;;":::"memory")
-
-extern void ia64_bad_param_for_setreg (void);
-extern void ia64_bad_param_for_getreg (void);
-
-
-#define ia64_setreg(regnum, val)						\
-({										\
-	switch (regnum) {							\
-	    case _IA64_REG_PSR_L:						\
-		    asm volatile ("mov psr.l=%0" :: "r"(val) : "memory");	\
-		    break;							\
-	    case _IA64_REG_AR_KR0 ... _IA64_REG_AR_EC:				\
-		    asm volatile ("mov ar%0=%1" ::				\
-		    			  "i" (regnum - _IA64_REG_AR_KR0),	\
-					  "r"(val): "memory");			\
-		    break;							\
-	    case _IA64_REG_CR_DCR ... _IA64_REG_CR_LRR1:			\
-		    asm volatile ("mov cr%0=%1" ::				\
-				          "i" (regnum - _IA64_REG_CR_DCR),	\
-					  "r"(val): "memory" );			\
-		    break;							\
-	    case _IA64_REG_SP:							\
-		    asm volatile ("mov r12=%0" ::				\
-			    		  "r"(val): "memory");			\
-		    break;							\
-	    case _IA64_REG_GP:							\
-		    asm volatile ("mov gp=%0" :: "r"(val) : "memory");		\
-		break;								\
-	    default:								\
-		    ia64_bad_param_for_setreg();				\
-		    break;							\
-	}									\
-})
-
-#define ia64_getreg(regnum)							\
-({										\
-	__u64 ia64_intri_res;							\
-										\
-	switch (regnum) {							\
-	case _IA64_REG_GP:							\
-		asm volatile ("mov %0=gp" : "=r"(ia64_intri_res));		\
-		break;								\
-	case _IA64_REG_IP:							\
-		asm volatile ("mov %0=ip" : "=r"(ia64_intri_res));		\
-		break;								\
-	case _IA64_REG_PSR:							\
-		asm volatile ("mov %0=psr" : "=r"(ia64_intri_res));		\
-		break;								\
-	case _IA64_REG_TP:	/* for current() */				\
-		ia64_intri_res = ia64_r13;					\
-		break;								\
-	case _IA64_REG_AR_KR0 ... _IA64_REG_AR_EC:				\
-		asm volatile ("mov %0=ar%1" : "=r" (ia64_intri_res)		\
-				      : "i"(regnum - _IA64_REG_AR_KR0));	\
-		break;								\
-	case _IA64_REG_CR_DCR ... _IA64_REG_CR_LRR1:				\
-		asm volatile ("mov %0=cr%1" : "=r" (ia64_intri_res)		\
-				      : "i" (regnum - _IA64_REG_CR_DCR));	\
-		break;								\
-	case _IA64_REG_SP:							\
-		asm volatile ("mov %0=sp" : "=r" (ia64_intri_res));		\
-		break;								\
-	default:								\
-		ia64_bad_param_for_getreg();					\
-		break;								\
-	}									\
-	ia64_intri_res;								\
-})
-
-#define ia64_hint_pause 0
-
-#define ia64_hint(mode)						\
-({								\
-	switch (mode) {						\
-	case ia64_hint_pause:					\
-		asm volatile ("hint @pause" ::: "memory");	\
-		break;						\
-	}							\
-})
-
-
-/* Integer values for mux1 instruction */
-#define ia64_mux1_brcst 0
-#define ia64_mux1_mix   8
-#define ia64_mux1_shuf  9
-#define ia64_mux1_alt  10
-#define ia64_mux1_rev  11
-
-#define ia64_mux1(x, mode)							\
-({										\
-	__u64 ia64_intri_res;							\
-										\
-	switch (mode) {								\
-	case ia64_mux1_brcst:							\
-		asm ("mux1 %0=%1,@brcst" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_mix:							\
-		asm ("mux1 %0=%1,@mix" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_shuf:							\
-		asm ("mux1 %0=%1,@shuf" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_alt:							\
-		asm ("mux1 %0=%1,@alt" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	case ia64_mux1_rev:							\
-		asm ("mux1 %0=%1,@rev" : "=r" (ia64_intri_res) : "r" (x));	\
-		break;								\
-	}									\
-	ia64_intri_res;								\
-})
-
-#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-# define ia64_popcnt(x)		__builtin_popcountl(x)
-#else
-# define ia64_popcnt(x)						\
-  ({								\
-	__u64 ia64_intri_res;					\
-	asm ("popcnt %0=%1" : "=r" (ia64_intri_res) : "r" (x));	\
-								\
-	ia64_intri_res;						\
-  })
-#endif
-
-#define ia64_getf_exp(x)					\
-({								\
-	long ia64_intri_res;					\
-								\
-	asm ("getf.exp %0=%1" : "=r"(ia64_intri_res) : "f"(x));	\
-								\
-	ia64_intri_res;						\
-})
-
-#define ia64_shrp(a, b, count)								\
-({											\
-	__u64 ia64_intri_res;								\
-	asm ("shrp %0=%1,%2,%3" : "=r"(ia64_intri_res) : "r"(a), "r"(b), "i"(count));	\
-	ia64_intri_res;									\
-})
-
-#define ia64_ldfs(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldfs %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldfd(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldfd %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldfe(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldfe %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldf8(regnum, x)					\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldf8 %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_ldf_fill(regnum, x)				\
-({								\
-	register double __f__ asm ("f"#regnum);			\
-	asm volatile ("ldf.fill %0=[%1]" :"=f"(__f__): "r"(x));	\
-})
-
-#define ia64_st4_rel_nta(m, val)					\
-({									\
-	asm volatile ("st4.rel.nta [%0] = %1\n\t" :: "r"(m), "r"(val));	\
-})
-
-#define ia64_stfs(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stfs [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stfd(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stfd [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stfe(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stfe [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stf8(x, regnum)						\
-({									\
-	register double __f__ asm ("f"#regnum);				\
-	asm volatile ("stf8 [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_stf_spill(x, regnum)						\
-({										\
-	register double __f__ asm ("f"#regnum);					\
-	asm volatile ("stf.spill [%0]=%1" :: "r"(x), "f"(__f__) : "memory");	\
-})
-
-#define ia64_fetchadd4_acq(p, inc)						\
-({										\
-										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd4.acq %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_fetchadd4_rel(p, inc)						\
-({										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd4.rel %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_fetchadd8_acq(p, inc)						\
-({										\
-										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd8.acq %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_fetchadd8_rel(p, inc)						\
-({										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("fetchadd8.rel %0=[%1],%2"				\
-				: "=r"(ia64_intri_res) : "r"(p), "i" (inc)	\
-				: "memory");					\
-										\
-	ia64_intri_res;								\
-})
-
-#define ia64_xchg1(ptr,x)							\
-({										\
-	__u64 ia64_intri_res;							\
-	asm volatile ("xchg1 %0=[%1],%2"					\
-		      : "=r" (ia64_intri_res) : "r" (ptr), "r" (x) : "memory");	\
-	ia64_intri_res;								\
-})
-
-#define ia64_xchg2(ptr,x)						\
-({									\
-	__u64 ia64_intri_res;						\
-	asm volatile ("xchg2 %0=[%1],%2" : "=r" (ia64_intri_res)	\
-		      : "r" (ptr), "r" (x) : "memory");			\
-	ia64_intri_res;							\
-})
-
-#define ia64_xchg4(ptr,x)						\
-({									\
-	__u64 ia64_intri_res;						\
-	asm volatile ("xchg4 %0=[%1],%2" : "=r" (ia64_intri_res)	\
-		      : "r" (ptr), "r" (x) : "memory");			\
-	ia64_intri_res;							\
-})
-
-#define ia64_xchg8(ptr,x)						\
-({									\
-	__u64 ia64_intri_res;						\
-	asm volatile ("xchg8 %0=[%1],%2" : "=r" (ia64_intri_res)	\
-		      : "r" (ptr), "r" (x) : "memory");			\
-	ia64_intri_res;							\
-})
-
-#define ia64_cmpxchg1_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg1.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg1_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg1.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg2_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg2.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg2_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-											\
-	asm volatile ("cmpxchg2.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg4_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg4_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg4.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg8_acq(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-	asm volatile ("cmpxchg8.acq %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_cmpxchg8_rel(ptr, new, old)						\
-({											\
-	__u64 ia64_intri_res;								\
-	asm volatile ("mov ar.ccv=%0;;" :: "rO"(old));					\
-											\
-	asm volatile ("cmpxchg8.rel %0=[%1],%2,ar.ccv":					\
-			      "=r"(ia64_intri_res) : "r"(ptr), "r"(new) : "memory");	\
-	ia64_intri_res;									\
-})
-
-#define ia64_mf()	asm volatile ("mf" ::: "memory")
-#define ia64_mfa()	asm volatile ("mf.a" ::: "memory")
-
-#define ia64_invala() asm volatile ("invala" ::: "memory")
-
-#define ia64_thash(addr)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("thash %0=%1" : "=r"(ia64_intri_res) : "r" (addr));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_srlz_i()	asm volatile (";; srlz.i ;;" ::: "memory")
-#define ia64_srlz_d()	asm volatile (";; srlz.d" ::: "memory");
-
-#ifdef HAVE_SERIALIZE_DIRECTIVE
-# define ia64_dv_serialize_data()		asm volatile (".serialize.data");
-# define ia64_dv_serialize_instruction()	asm volatile (".serialize.instruction");
-#else
-# define ia64_dv_serialize_data()
-# define ia64_dv_serialize_instruction()
-#endif
-
-#define ia64_nop(x)	asm volatile ("nop %0"::"i"(x));
-
-#define ia64_itci(addr)	asm volatile ("itc.i %0;;" :: "r"(addr) : "memory")
-
-#define ia64_itcd(addr)	asm volatile ("itc.d %0;;" :: "r"(addr) : "memory")
-
-
-#define ia64_itri(trnum, addr) asm volatile ("itr.i itr[%0]=%1"				\
-					     :: "r"(trnum), "r"(addr) : "memory")
-
-#define ia64_itrd(trnum, addr) asm volatile ("itr.d dtr[%0]=%1"				\
-					     :: "r"(trnum), "r"(addr) : "memory")
-
-#define ia64_tpa(addr)								\
-({										\
-	unsigned long ia64_pa;							\
-	asm volatile ("tpa %0 = %1" : "=r"(ia64_pa) : "r"(addr) : "memory");	\
-	ia64_pa;								\
-})
-
-#define __ia64_set_dbr(index, val)						\
-	asm volatile ("mov dbr[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_ibr(index, val)						\
-	asm volatile ("mov ibr[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_pkr(index, val)						\
-	asm volatile ("mov pkr[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_pmc(index, val)						\
-	asm volatile ("mov pmc[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_pmd(index, val)						\
-	asm volatile ("mov pmd[%0]=%1" :: "r"(index), "r"(val) : "memory")
-
-#define ia64_set_rr(index, val)							\
-	asm volatile ("mov rr[%0]=%1" :: "r"(index), "r"(val) : "memory");
-
-#define ia64_get_cpuid(index)								\
-({											\
-	unsigned long ia64_intri_res;							\
-	asm volatile ("mov %0=cpuid[%r1]" : "=r"(ia64_intri_res) : "rO"(index));	\
-	ia64_intri_res;									\
-})
-
-#define __ia64_get_dbr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=dbr[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_ibr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=ibr[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_pkr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=pkr[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_pmc(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=pmc[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-
-#define ia64_get_pmd(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=pmd[%1]" : "=r"(ia64_intri_res) : "r"(index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_get_rr(index)							\
-({										\
-	unsigned long ia64_intri_res;						\
-	asm volatile ("mov %0=rr[%1]" : "=r"(ia64_intri_res) : "r" (index));	\
-	ia64_intri_res;								\
-})
-
-#define ia64_fc(addr)	asm volatile ("fc %0" :: "r"(addr) : "memory")
-
-
-#define ia64_sync_i()	asm volatile (";; sync.i" ::: "memory")
-
-#define ia64_ssm(mask)	asm volatile ("ssm %0":: "i"((mask)) : "memory")
-#define ia64_rsm(mask)	asm volatile ("rsm %0":: "i"((mask)) : "memory")
-#define ia64_sum(mask)	asm volatile ("sum %0":: "i"((mask)) : "memory")
-#define ia64_rum(mask)	asm volatile ("rum %0":: "i"((mask)) : "memory")
-
-#define ia64_ptce(addr)	asm volatile ("ptc.e %0" :: "r"(addr))
-
-#define ia64_ptcga(addr, size)							\
-do {										\
-	asm volatile ("ptc.ga %0,%1" :: "r"(addr), "r"(size) : "memory");	\
-	ia64_dv_serialize_data();						\
-} while (0)
-
-#define ia64_ptcl(addr, size)							\
-do {										\
-	asm volatile ("ptc.l %0,%1" :: "r"(addr), "r"(size) : "memory");	\
-	ia64_dv_serialize_data();						\
-} while (0)
-
-#define ia64_ptri(addr, size)						\
-	asm volatile ("ptr.i %0,%1" :: "r"(addr), "r"(size) : "memory")
-
-#define ia64_ptrd(addr, size)						\
-	asm volatile ("ptr.d %0,%1" :: "r"(addr), "r"(size) : "memory")
-
-#define ia64_ttag(addr)							\
-({									  \
-	__u64 ia64_intri_res;						   \
-	asm volatile ("ttag %0=%1" : "=r"(ia64_intri_res) : "r" (addr));   \
-	ia64_intri_res;							 \
-})
-
-
-/* Values for lfhint in ia64_lfetch and ia64_lfetch_fault */
-
-#define ia64_lfhint_none   0
-#define ia64_lfhint_nt1    1
-#define ia64_lfhint_nt2    2
-#define ia64_lfhint_nta    3
-
-#define ia64_lfetch(lfhint, y)					\
-({								\
-        switch (lfhint) {					\
-        case ia64_lfhint_none:					\
-                asm volatile ("lfetch [%0]" : : "r"(y));	\
-                break;						\
-        case ia64_lfhint_nt1:					\
-                asm volatile ("lfetch.nt1 [%0]" : : "r"(y));	\
-                break;						\
-        case ia64_lfhint_nt2:					\
-                asm volatile ("lfetch.nt2 [%0]" : : "r"(y));	\
-                break;						\
-        case ia64_lfhint_nta:					\
-                asm volatile ("lfetch.nta [%0]" : : "r"(y));	\
-                break;						\
-        }							\
-})
-
-#define ia64_lfetch_excl(lfhint, y)					\
-({									\
-        switch (lfhint) {						\
-        case ia64_lfhint_none:						\
-                asm volatile ("lfetch.excl [%0]" :: "r"(y));		\
-                break;							\
-        case ia64_lfhint_nt1:						\
-                asm volatile ("lfetch.excl.nt1 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt2:						\
-                asm volatile ("lfetch.excl.nt2 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nta:						\
-                asm volatile ("lfetch.excl.nta [%0]" :: "r"(y));	\
-                break;							\
-        }								\
-})
-
-#define ia64_lfetch_fault(lfhint, y)					\
-({									\
-        switch (lfhint) {						\
-        case ia64_lfhint_none:						\
-                asm volatile ("lfetch.fault [%0]" : : "r"(y));		\
-                break;							\
-        case ia64_lfhint_nt1:						\
-                asm volatile ("lfetch.fault.nt1 [%0]" : : "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt2:						\
-                asm volatile ("lfetch.fault.nt2 [%0]" : : "r"(y));	\
-                break;							\
-        case ia64_lfhint_nta:						\
-                asm volatile ("lfetch.fault.nta [%0]" : : "r"(y));	\
-                break;							\
-        }								\
-})
-
-#define ia64_lfetch_fault_excl(lfhint, y)				\
-({									\
-        switch (lfhint) {						\
-        case ia64_lfhint_none:						\
-                asm volatile ("lfetch.fault.excl [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt1:						\
-                asm volatile ("lfetch.fault.excl.nt1 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nt2:						\
-                asm volatile ("lfetch.fault.excl.nt2 [%0]" :: "r"(y));	\
-                break;							\
-        case ia64_lfhint_nta:						\
-                asm volatile ("lfetch.fault.excl.nta [%0]" :: "r"(y));	\
-                break;							\
-        }								\
-})
-
-#define ia64_intrin_local_irq_restore(x)			\
-do {								\
-	asm volatile (";;   cmp.ne p6,p7=%0,r0;;"		\
-		      "(p6) ssm psr.i;"				\
-		      "(p7) rsm psr.i;;"			\
-		      "(p6) srlz.d"				\
-		      :: "r"((x)) : "p6", "p7", "memory");	\
-} while (0)
-
-#endif /* _UAPI_ASM_IA64_GCC_INTRIN_H */
diff --git a/arch/ia64/include/uapi/asm/ia64regs.h b/arch/ia64/include/uapi/asm/ia64regs.h
deleted file mode 100644
index d7d10cec8b9f..000000000000
--- a/arch/ia64/include/uapi/asm/ia64regs.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2002,2003 Intel Corp.
- *      Jun Nakajima <jun.nakajima@intel.com>
- *      Suresh Siddha <suresh.b.siddha@intel.com>
- */
-
-#ifndef _ASM_IA64_IA64REGS_H
-#define _ASM_IA64_IA64REGS_H
-
-/*
- * Register Names for getreg() and setreg().
- *
- * The "magic" numbers happen to match the values used by the Intel compiler's
- * getreg()/setreg() intrinsics.
- */
-
-/* Special Registers */
-
-#define _IA64_REG_IP		1016	/* getreg only */
-#define _IA64_REG_PSR		1019
-#define _IA64_REG_PSR_L		1019
-
-/* General Integer Registers */
-
-#define _IA64_REG_GP		1025	/* R1 */
-#define _IA64_REG_R8		1032	/* R8 */
-#define _IA64_REG_R9		1033	/* R9 */
-#define _IA64_REG_SP		1036	/* R12 */
-#define _IA64_REG_TP		1037	/* R13 */
-
-/* Application Registers */
-
-#define _IA64_REG_AR_KR0	3072
-#define _IA64_REG_AR_KR1	3073
-#define _IA64_REG_AR_KR2	3074
-#define _IA64_REG_AR_KR3	3075
-#define _IA64_REG_AR_KR4	3076
-#define _IA64_REG_AR_KR5	3077
-#define _IA64_REG_AR_KR6	3078
-#define _IA64_REG_AR_KR7	3079
-#define _IA64_REG_AR_RSC	3088
-#define _IA64_REG_AR_BSP	3089
-#define _IA64_REG_AR_BSPSTORE	3090
-#define _IA64_REG_AR_RNAT	3091
-#define _IA64_REG_AR_FCR	3093
-#define _IA64_REG_AR_EFLAG	3096
-#define _IA64_REG_AR_CSD	3097
-#define _IA64_REG_AR_SSD	3098
-#define _IA64_REG_AR_CFLAG	3099
-#define _IA64_REG_AR_FSR	3100
-#define _IA64_REG_AR_FIR	3101
-#define _IA64_REG_AR_FDR	3102
-#define _IA64_REG_AR_CCV	3104
-#define _IA64_REG_AR_UNAT	3108
-#define _IA64_REG_AR_FPSR	3112
-#define _IA64_REG_AR_ITC	3116
-#define _IA64_REG_AR_PFS	3136
-#define _IA64_REG_AR_LC		3137
-#define _IA64_REG_AR_EC		3138
-
-/* Control Registers */
-
-#define _IA64_REG_CR_DCR	4096
-#define _IA64_REG_CR_ITM	4097
-#define _IA64_REG_CR_IVA	4098
-#define _IA64_REG_CR_PTA	4104
-#define _IA64_REG_CR_IPSR	4112
-#define _IA64_REG_CR_ISR	4113
-#define _IA64_REG_CR_IIP	4115
-#define _IA64_REG_CR_IFA	4116
-#define _IA64_REG_CR_ITIR	4117
-#define _IA64_REG_CR_IIPA	4118
-#define _IA64_REG_CR_IFS	4119
-#define _IA64_REG_CR_IIM	4120
-#define _IA64_REG_CR_IHA	4121
-#define _IA64_REG_CR_LID	4160
-#define _IA64_REG_CR_IVR	4161	/* getreg only */
-#define _IA64_REG_CR_TPR	4162
-#define _IA64_REG_CR_EOI	4163
-#define _IA64_REG_CR_IRR0	4164	/* getreg only */
-#define _IA64_REG_CR_IRR1	4165	/* getreg only */
-#define _IA64_REG_CR_IRR2	4166	/* getreg only */
-#define _IA64_REG_CR_IRR3	4167	/* getreg only */
-#define _IA64_REG_CR_ITV	4168
-#define _IA64_REG_CR_PMV	4169
-#define _IA64_REG_CR_CMCV	4170
-#define _IA64_REG_CR_LRR0	4176
-#define _IA64_REG_CR_LRR1	4177
-
-/* Indirect Registers for getindreg() and setindreg() */
-
-#define _IA64_REG_INDR_CPUID	9000	/* getindreg only */
-#define _IA64_REG_INDR_DBR	9001
-#define _IA64_REG_INDR_IBR	9002
-#define _IA64_REG_INDR_PKR	9003
-#define _IA64_REG_INDR_PMC	9004
-#define _IA64_REG_INDR_PMD	9005
-#define _IA64_REG_INDR_RR	9006
-
-#endif /* _ASM_IA64_IA64REGS_H */
diff --git a/arch/ia64/include/uapi/asm/intrinsics.h b/arch/ia64/include/uapi/asm/intrinsics.h
deleted file mode 100644
index 63f27c4ec739..000000000000
--- a/arch/ia64/include/uapi/asm/intrinsics.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Compiler-dependent intrinsics.
- *
- * Copyright (C) 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _UAPI_ASM_IA64_INTRINSICS_H
-#define _UAPI_ASM_IA64_INTRINSICS_H
-
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-/* include compiler specific intrinsics */
-#include <asm/ia64regs.h>
-#include <asm/gcc_intrin.h>
-#include <asm/cmpxchg.h>
-
-#define ia64_set_rr0_to_rr4(val0, val1, val2, val3, val4)		\
-do {									\
-	ia64_set_rr(0x0000000000000000UL, (val0));			\
-	ia64_set_rr(0x2000000000000000UL, (val1));			\
-	ia64_set_rr(0x4000000000000000UL, (val2));			\
-	ia64_set_rr(0x6000000000000000UL, (val3));			\
-	ia64_set_rr(0x8000000000000000UL, (val4));			\
-} while (0)
-
-/*
- * Force an unresolved reference if someone tries to use
- * ia64_fetch_and_add() with a bad value.
- */
-extern unsigned long __bad_size_for_ia64_fetch_and_add (void);
-extern unsigned long __bad_increment_for_ia64_fetch_and_add (void);
-
-#define IA64_FETCHADD(tmp,v,n,sz,sem)						\
-({										\
-	switch (sz) {								\
-	      case 4:								\
-	        tmp = ia64_fetchadd4_##sem((unsigned int *) v, n);		\
-		break;								\
-										\
-	      case 8:								\
-	        tmp = ia64_fetchadd8_##sem((unsigned long *) v, n);		\
-		break;								\
-										\
-	      default:								\
-		__bad_size_for_ia64_fetch_and_add();				\
-	}									\
-})
-
-#define ia64_fetchadd(i,v,sem)								\
-({											\
-	__u64 _tmp;									\
-	volatile __typeof__(*(v)) *_v = (v);						\
-	/* Can't use a switch () here: gcc isn't always smart enough for that... */	\
-	if ((i) == -16)									\
-		IA64_FETCHADD(_tmp, _v, -16, sizeof(*(v)), sem);			\
-	else if ((i) == -8)								\
-		IA64_FETCHADD(_tmp, _v, -8, sizeof(*(v)), sem);				\
-	else if ((i) == -4)								\
-		IA64_FETCHADD(_tmp, _v, -4, sizeof(*(v)), sem);				\
-	else if ((i) == -1)								\
-		IA64_FETCHADD(_tmp, _v, -1, sizeof(*(v)), sem);				\
-	else if ((i) == 1)								\
-		IA64_FETCHADD(_tmp, _v, 1, sizeof(*(v)), sem);				\
-	else if ((i) == 4)								\
-		IA64_FETCHADD(_tmp, _v, 4, sizeof(*(v)), sem);				\
-	else if ((i) == 8)								\
-		IA64_FETCHADD(_tmp, _v, 8, sizeof(*(v)), sem);				\
-	else if ((i) == 16)								\
-		IA64_FETCHADD(_tmp, _v, 16, sizeof(*(v)), sem);				\
-	else										\
-		_tmp = __bad_increment_for_ia64_fetch_and_add();			\
-	(__typeof__(*(v))) (_tmp);	/* return old value */				\
-})
-
-#define ia64_fetch_and_add(i,v)	(ia64_fetchadd(i, v, rel) + (i)) /* return new value */
-
-#endif
-
-#endif /* _UAPI_ASM_IA64_INTRINSICS_H */
diff --git a/arch/ia64/include/uapi/asm/mman.h b/arch/ia64/include/uapi/asm/mman.h
deleted file mode 100644
index ce0cc3d7509e..000000000000
--- a/arch/ia64/include/uapi/asm/mman.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Based on <asm-i386/mman.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_MMAN_H
-#define _UAPI_ASM_IA64_MMAN_H
-
-
-#include <asm-generic/mman.h>
-
-#define MAP_GROWSUP	0x0200		/* register stack-like segment */
-
-
-#endif /* _UAPI_ASM_IA64_MMAN_H */
diff --git a/arch/ia64/include/uapi/asm/param.h b/arch/ia64/include/uapi/asm/param.h
deleted file mode 100644
index 123ab45940b4..000000000000
--- a/arch/ia64/include/uapi/asm/param.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Fundamental kernel parameters.
- *
- * Based on <asm-i386/param.h>.
- *
- * Modified 1998, 1999, 2002-2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_PARAM_H
-#define _UAPI_ASM_IA64_PARAM_H
-
-
-#define EXEC_PAGESIZE	65536
-
-#ifndef NOGROUP
-# define NOGROUP	(-1)
-#endif
-
-#define MAXHOSTNAMELEN	64	/* max length of hostname */
-
-#ifndef __KERNEL__
-   /*
-    * Technically, this is wrong, but some old apps still refer to it.  The proper way to
-    * get the HZ value is via sysconf(_SC_CLK_TCK).
-    */
-# define HZ 1024
-#endif
-
-#endif /* _UAPI_ASM_IA64_PARAM_H */
diff --git a/arch/ia64/include/uapi/asm/posix_types.h b/arch/ia64/include/uapi/asm/posix_types.h
deleted file mode 100644
index bded40f7defe..000000000000
--- a/arch/ia64/include/uapi/asm/posix_types.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_POSIX_TYPES_H
-#define _ASM_IA64_POSIX_TYPES_H
-
-typedef unsigned long	__kernel_sigset_t;	/* at least 32 bits */
-
-#include <asm-generic/posix_types.h>
-
-#endif /* _ASM_IA64_POSIX_TYPES_H */
diff --git a/arch/ia64/include/uapi/asm/ptrace.h b/arch/ia64/include/uapi/asm/ptrace.h
deleted file mode 100644
index f52655b44414..000000000000
--- a/arch/ia64/include/uapi/asm/ptrace.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 1998-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2003 Intel Co
- *	Suresh Siddha <suresh.b.siddha@intel.com>
- *	Fenghua Yu <fenghua.yu@intel.com>
- *	Arun Sharma <arun.sharma@intel.com>
- *
- * 12/07/98	S. Eranian	added pt_regs & switch_stack
- * 12/21/98	D. Mosberger	updated to match latest code
- *  6/17/99	D. Mosberger	added second unat member to "struct switch_stack"
- *
- */
-#ifndef _UAPI_ASM_IA64_PTRACE_H
-#define _UAPI_ASM_IA64_PTRACE_H
-
-/*
- * When a user process is blocked, its state looks as follows:
- *
- *            +----------------------+	-------	IA64_STK_OFFSET
- *     	      |			     |	 ^
- *            | struct pt_regs       |	 |
- *	      |			     |	 |
- *            +----------------------+	 |
- *	      |			     |	 |
- *     	      |	   memory stack	     |	 |
- *	      |	(growing downwards)  |	 |
- *	      //.....................//	 |
- *					 |
- *	      //.....................//	 |
- *	      |			     |	 |
- *            +----------------------+	 |
- *            | struct switch_stack  |	 |
- *	      |			     |	 |
- *	      +----------------------+	 |
- *	      |			     |	 |
- *	      //.....................//	 |
- *					 |
- *	      //.....................//	 |
- *	      |			     |	 |
- *	      |	 register stack	     |	 |
- *	      |	(growing upwards)    |	 |
- *            |			     |	 |
- *	      +----------------------+	 |  ---	IA64_RBS_OFFSET
- *            |  struct thread_info  |	 |  ^
- *	      +----------------------+	 |  |
- *	      |			     |	 |  |
- *            |  struct task_struct  |	 |  |
- * current -> |			     |   |  |
- *	      +----------------------+ -------
- *
- * Note that ar.ec is not saved explicitly in pt_reg or switch_stack.
- * This is because ar.ec is saved as part of ar.pfs.
- */
-
-
-#include <asm/fpu.h>
-
-
-#ifndef __ASSEMBLY__
-
-/*
- * This struct defines the way the registers are saved on system
- * calls.
- *
- * We don't save all floating point register because the kernel
- * is compiled to use only a very small subset, so the other are
- * untouched.
- *
- * THIS STRUCTURE MUST BE A MULTIPLE 16-BYTE IN SIZE
- * (because the memory stack pointer MUST ALWAYS be aligned this way)
- *
- */
-struct pt_regs {
-	/* The following registers are saved by SAVE_MIN: */
-	unsigned long b6;		/* scratch */
-	unsigned long b7;		/* scratch */
-
-	unsigned long ar_csd;           /* used by cmp8xchg16 (scratch) */
-	unsigned long ar_ssd;           /* reserved for future use (scratch) */
-
-	unsigned long r8;		/* scratch (return value register 0) */
-	unsigned long r9;		/* scratch (return value register 1) */
-	unsigned long r10;		/* scratch (return value register 2) */
-	unsigned long r11;		/* scratch (return value register 3) */
-
-	unsigned long cr_ipsr;		/* interrupted task's psr */
-	unsigned long cr_iip;		/* interrupted task's instruction pointer */
-	/*
-	 * interrupted task's function state; if bit 63 is cleared, it
-	 * contains syscall's ar.pfs.pfm:
-	 */
-	unsigned long cr_ifs;
-
-	unsigned long ar_unat;		/* interrupted task's NaT register (preserved) */
-	unsigned long ar_pfs;		/* prev function state  */
-	unsigned long ar_rsc;		/* RSE configuration */
-	/* The following two are valid only if cr_ipsr.cpl > 0 || ti->flags & _TIF_MCA_INIT */
-	unsigned long ar_rnat;		/* RSE NaT */
-	unsigned long ar_bspstore;	/* RSE bspstore */
-
-	unsigned long pr;		/* 64 predicate registers (1 bit each) */
-	unsigned long b0;		/* return pointer (bp) */
-	unsigned long loadrs;		/* size of dirty partition << 16 */
-
-	unsigned long r1;		/* the gp pointer */
-	unsigned long r12;		/* interrupted task's memory stack pointer */
-	unsigned long r13;		/* thread pointer */
-
-	unsigned long ar_fpsr;		/* floating point status (preserved) */
-	unsigned long r15;		/* scratch */
-
-	/* The remaining registers are NOT saved for system calls.  */
-
-	unsigned long r14;		/* scratch */
-	unsigned long r2;		/* scratch */
-	unsigned long r3;		/* scratch */
-
-	/* The following registers are saved by SAVE_REST: */
-	unsigned long r16;		/* scratch */
-	unsigned long r17;		/* scratch */
-	unsigned long r18;		/* scratch */
-	unsigned long r19;		/* scratch */
-	unsigned long r20;		/* scratch */
-	unsigned long r21;		/* scratch */
-	unsigned long r22;		/* scratch */
-	unsigned long r23;		/* scratch */
-	unsigned long r24;		/* scratch */
-	unsigned long r25;		/* scratch */
-	unsigned long r26;		/* scratch */
-	unsigned long r27;		/* scratch */
-	unsigned long r28;		/* scratch */
-	unsigned long r29;		/* scratch */
-	unsigned long r30;		/* scratch */
-	unsigned long r31;		/* scratch */
-
-	unsigned long ar_ccv;		/* compare/exchange value (scratch) */
-
-	/*
-	 * Floating point registers that the kernel considers scratch:
-	 */
-	struct ia64_fpreg f6;		/* scratch */
-	struct ia64_fpreg f7;		/* scratch */
-	struct ia64_fpreg f8;		/* scratch */
-	struct ia64_fpreg f9;		/* scratch */
-	struct ia64_fpreg f10;		/* scratch */
-	struct ia64_fpreg f11;		/* scratch */
-};
-
-/*
- * This structure contains the addition registers that need to
- * preserved across a context switch.  This generally consists of
- * "preserved" registers.
- */
-struct switch_stack {
-	unsigned long caller_unat;	/* user NaT collection register (preserved) */
-	unsigned long ar_fpsr;		/* floating-point status register */
-
-	struct ia64_fpreg f2;		/* preserved */
-	struct ia64_fpreg f3;		/* preserved */
-	struct ia64_fpreg f4;		/* preserved */
-	struct ia64_fpreg f5;		/* preserved */
-
-	struct ia64_fpreg f12;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f13;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f14;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f15;		/* scratch, but untouched by kernel */
-	struct ia64_fpreg f16;		/* preserved */
-	struct ia64_fpreg f17;		/* preserved */
-	struct ia64_fpreg f18;		/* preserved */
-	struct ia64_fpreg f19;		/* preserved */
-	struct ia64_fpreg f20;		/* preserved */
-	struct ia64_fpreg f21;		/* preserved */
-	struct ia64_fpreg f22;		/* preserved */
-	struct ia64_fpreg f23;		/* preserved */
-	struct ia64_fpreg f24;		/* preserved */
-	struct ia64_fpreg f25;		/* preserved */
-	struct ia64_fpreg f26;		/* preserved */
-	struct ia64_fpreg f27;		/* preserved */
-	struct ia64_fpreg f28;		/* preserved */
-	struct ia64_fpreg f29;		/* preserved */
-	struct ia64_fpreg f30;		/* preserved */
-	struct ia64_fpreg f31;		/* preserved */
-
-	unsigned long r4;		/* preserved */
-	unsigned long r5;		/* preserved */
-	unsigned long r6;		/* preserved */
-	unsigned long r7;		/* preserved */
-
-	unsigned long b0;		/* so we can force a direct return in copy_thread */
-	unsigned long b1;
-	unsigned long b2;
-	unsigned long b3;
-	unsigned long b4;
-	unsigned long b5;
-
-	unsigned long ar_pfs;		/* previous function state */
-	unsigned long ar_lc;		/* loop counter (preserved) */
-	unsigned long ar_unat;		/* NaT bits for r4-r7 */
-	unsigned long ar_rnat;		/* RSE NaT collection register */
-	unsigned long ar_bspstore;	/* RSE dirty base (preserved) */
-	unsigned long pr;		/* 64 predicate registers (1 bit each) */
-};
-
-
-/* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */
-struct pt_all_user_regs {
-	unsigned long nat;
-	unsigned long cr_iip;
-	unsigned long cfm;
-	unsigned long cr_ipsr;
-	unsigned long pr;
-
-	unsigned long gr[32];
-	unsigned long br[8];
-	unsigned long ar[128];
-	struct ia64_fpreg fr[128];
-};
-
-#endif /* !__ASSEMBLY__ */
-
-/* indices to application-registers array in pt_all_user_regs */
-#define PT_AUR_RSC	16
-#define PT_AUR_BSP	17
-#define PT_AUR_BSPSTORE	18
-#define PT_AUR_RNAT	19
-#define PT_AUR_CCV	32
-#define PT_AUR_UNAT	36
-#define PT_AUR_FPSR	40
-#define PT_AUR_PFS	64
-#define PT_AUR_LC	65
-#define PT_AUR_EC	66
-
-/*
- * The numbers chosen here are somewhat arbitrary but absolutely MUST
- * not overlap with any of the number assigned in <linux/ptrace.h>.
- */
-#define PTRACE_SINGLEBLOCK	12	/* resume execution until next branch */
-#define PTRACE_OLD_GETSIGINFO	13	/* (replaced by PTRACE_GETSIGINFO in <linux/ptrace.h>)  */
-#define PTRACE_OLD_SETSIGINFO	14	/* (replaced by PTRACE_SETSIGINFO in <linux/ptrace.h>)  */
-#define PTRACE_GETREGS		18	/* get all registers (pt_all_user_regs) in one shot */
-#define PTRACE_SETREGS		19	/* set all registers (pt_all_user_regs) in one shot */
-
-#define PTRACE_OLDSETOPTIONS	21
-
-#endif /* _UAPI_ASM_IA64_PTRACE_H */
diff --git a/arch/ia64/include/uapi/asm/ptrace_offsets.h b/arch/ia64/include/uapi/asm/ptrace_offsets.h
deleted file mode 100644
index 2847c18139ef..000000000000
--- a/arch/ia64/include/uapi/asm/ptrace_offsets.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_PTRACE_OFFSETS_H
-#define _ASM_IA64_PTRACE_OFFSETS_H
-
-/*
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-/*
- * The "uarea" that can be accessed via PEEKUSER and POKEUSER is a
- * virtual structure that would have the following definition:
- *
- *	struct uarea {
- *		struct ia64_fpreg fph[96];		// f32-f127
- *		unsigned long nat_bits;
- *		unsigned long empty1;
- *		struct ia64_fpreg f2;			// f2-f5
- *			:
- *		struct ia64_fpreg f5;
- *		struct ia64_fpreg f10;			// f10-f31
- *			:
- *		struct ia64_fpreg f31;
- *		unsigned long r4;			// r4-r7
- *			:
- *		unsigned long r7;
- *		unsigned long b1;			// b1-b5
- *			:
- *		unsigned long b5;
- *		unsigned long ar_ec;
- *		unsigned long ar_lc;
- *		unsigned long empty2[5];
- *		unsigned long cr_ipsr;
- *		unsigned long cr_iip;
- *		unsigned long cfm;
- *		unsigned long ar_unat;
- *		unsigned long ar_pfs;
- *		unsigned long ar_rsc;
- *		unsigned long ar_rnat;
- *		unsigned long ar_bspstore;
- *		unsigned long pr;
- *		unsigned long b6;
- *		unsigned long ar_bsp;
- *		unsigned long r1;
- *		unsigned long r2;
- *		unsigned long r3;
- *		unsigned long r12;
- *		unsigned long r13;
- *		unsigned long r14;
- *		unsigned long r15;
- *		unsigned long r8;
- *		unsigned long r9;
- *		unsigned long r10;
- *		unsigned long r11;
- *		unsigned long r16;
- *			:
- *		unsigned long r31;
- *		unsigned long ar_ccv;
- *		unsigned long ar_fpsr;
- *		unsigned long b0;
- *		unsigned long b7;
- *		unsigned long f6;
- *		unsigned long f7;
- *		unsigned long f8;
- *		unsigned long f9;
- *		unsigned long ar_csd;
- *		unsigned long ar_ssd;
- *		unsigned long rsvd1[710];
- *		unsigned long dbr[8];
- *		unsigned long rsvd2[504];
- *		unsigned long ibr[8];
- *		unsigned long rsvd3[504];
- *		unsigned long pmd[4];
- *	}
- */
-
-/* fph: */
-#define PT_F32			0x0000
-#define PT_F33			0x0010
-#define PT_F34			0x0020
-#define PT_F35			0x0030
-#define PT_F36			0x0040
-#define PT_F37			0x0050
-#define PT_F38			0x0060
-#define PT_F39			0x0070
-#define PT_F40			0x0080
-#define PT_F41			0x0090
-#define PT_F42			0x00a0
-#define PT_F43			0x00b0
-#define PT_F44			0x00c0
-#define PT_F45			0x00d0
-#define PT_F46			0x00e0
-#define PT_F47			0x00f0
-#define PT_F48			0x0100
-#define PT_F49			0x0110
-#define PT_F50			0x0120
-#define PT_F51			0x0130
-#define PT_F52			0x0140
-#define PT_F53			0x0150
-#define PT_F54			0x0160
-#define PT_F55			0x0170
-#define PT_F56			0x0180
-#define PT_F57			0x0190
-#define PT_F58			0x01a0
-#define PT_F59			0x01b0
-#define PT_F60			0x01c0
-#define PT_F61			0x01d0
-#define PT_F62			0x01e0
-#define PT_F63			0x01f0
-#define PT_F64			0x0200
-#define PT_F65			0x0210
-#define PT_F66			0x0220
-#define PT_F67			0x0230
-#define PT_F68			0x0240
-#define PT_F69			0x0250
-#define PT_F70			0x0260
-#define PT_F71			0x0270
-#define PT_F72			0x0280
-#define PT_F73			0x0290
-#define PT_F74			0x02a0
-#define PT_F75			0x02b0
-#define PT_F76			0x02c0
-#define PT_F77			0x02d0
-#define PT_F78			0x02e0
-#define PT_F79			0x02f0
-#define PT_F80			0x0300
-#define PT_F81			0x0310
-#define PT_F82			0x0320
-#define PT_F83			0x0330
-#define PT_F84			0x0340
-#define PT_F85			0x0350
-#define PT_F86			0x0360
-#define PT_F87			0x0370
-#define PT_F88			0x0380
-#define PT_F89			0x0390
-#define PT_F90			0x03a0
-#define PT_F91			0x03b0
-#define PT_F92			0x03c0
-#define PT_F93			0x03d0
-#define PT_F94			0x03e0
-#define PT_F95			0x03f0
-#define PT_F96			0x0400
-#define PT_F97			0x0410
-#define PT_F98			0x0420
-#define PT_F99			0x0430
-#define PT_F100			0x0440
-#define PT_F101			0x0450
-#define PT_F102			0x0460
-#define PT_F103			0x0470
-#define PT_F104			0x0480
-#define PT_F105			0x0490
-#define PT_F106			0x04a0
-#define PT_F107			0x04b0
-#define PT_F108			0x04c0
-#define PT_F109			0x04d0
-#define PT_F110			0x04e0
-#define PT_F111			0x04f0
-#define PT_F112			0x0500
-#define PT_F113			0x0510
-#define PT_F114			0x0520
-#define PT_F115			0x0530
-#define PT_F116			0x0540
-#define PT_F117			0x0550
-#define PT_F118			0x0560
-#define PT_F119			0x0570
-#define PT_F120			0x0580
-#define PT_F121			0x0590
-#define PT_F122			0x05a0
-#define PT_F123			0x05b0
-#define PT_F124			0x05c0
-#define PT_F125			0x05d0
-#define PT_F126			0x05e0
-#define PT_F127			0x05f0
-
-#define PT_NAT_BITS		0x0600
-
-#define PT_F2			0x0610
-#define PT_F3			0x0620
-#define PT_F4			0x0630
-#define PT_F5			0x0640
-#define PT_F10			0x0650
-#define PT_F11			0x0660
-#define PT_F12			0x0670
-#define PT_F13			0x0680
-#define PT_F14			0x0690
-#define PT_F15			0x06a0
-#define PT_F16			0x06b0
-#define PT_F17			0x06c0
-#define PT_F18			0x06d0
-#define PT_F19			0x06e0
-#define PT_F20			0x06f0
-#define PT_F21			0x0700
-#define PT_F22			0x0710
-#define PT_F23			0x0720
-#define PT_F24			0x0730
-#define PT_F25			0x0740
-#define PT_F26			0x0750
-#define PT_F27			0x0760
-#define PT_F28			0x0770
-#define PT_F29			0x0780
-#define PT_F30			0x0790
-#define PT_F31			0x07a0
-#define PT_R4			0x07b0
-#define PT_R5			0x07b8
-#define PT_R6			0x07c0
-#define PT_R7			0x07c8
-
-#define PT_B1			0x07d8
-#define PT_B2			0x07e0
-#define PT_B3			0x07e8
-#define PT_B4			0x07f0
-#define PT_B5			0x07f8
-
-#define PT_AR_EC		0x0800
-#define PT_AR_LC		0x0808
-
-#define PT_CR_IPSR		0x0830
-#define PT_CR_IIP		0x0838
-#define PT_CFM			0x0840
-#define PT_AR_UNAT		0x0848
-#define PT_AR_PFS		0x0850
-#define PT_AR_RSC		0x0858
-#define PT_AR_RNAT		0x0860
-#define PT_AR_BSPSTORE		0x0868
-#define PT_PR			0x0870
-#define PT_B6			0x0878
-#define PT_AR_BSP		0x0880	/* note: this points to the *end* of the backing store! */
-#define PT_R1			0x0888
-#define PT_R2			0x0890
-#define PT_R3			0x0898
-#define PT_R12			0x08a0
-#define PT_R13			0x08a8
-#define PT_R14			0x08b0
-#define PT_R15			0x08b8
-#define PT_R8 			0x08c0
-#define PT_R9			0x08c8
-#define PT_R10			0x08d0
-#define PT_R11			0x08d8
-#define PT_R16			0x08e0
-#define PT_R17			0x08e8
-#define PT_R18			0x08f0
-#define PT_R19			0x08f8
-#define PT_R20			0x0900
-#define PT_R21			0x0908
-#define PT_R22			0x0910
-#define PT_R23			0x0918
-#define PT_R24			0x0920
-#define PT_R25			0x0928
-#define PT_R26			0x0930
-#define PT_R27			0x0938
-#define PT_R28			0x0940
-#define PT_R29			0x0948
-#define PT_R30			0x0950
-#define PT_R31			0x0958
-#define PT_AR_CCV		0x0960
-#define PT_AR_FPSR		0x0968
-#define PT_B0			0x0970
-#define PT_B7			0x0978
-#define PT_F6			0x0980
-#define PT_F7			0x0990
-#define PT_F8			0x09a0
-#define PT_F9			0x09b0
-#define PT_AR_CSD		0x09c0
-#define PT_AR_SSD		0x09c8
-
-#define PT_DBR			0x2000	/* data breakpoint registers */
-#define PT_IBR			0x3000	/* instruction breakpoint registers */
-#define PT_PMD			0x4000	/* performance monitoring counters */
-
-#endif /* _ASM_IA64_PTRACE_OFFSETS_H */
diff --git a/arch/ia64/include/uapi/asm/resource.h b/arch/ia64/include/uapi/asm/resource.h
deleted file mode 100644
index d488d2b22ac4..000000000000
--- a/arch/ia64/include/uapi/asm/resource.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_RESOURCE_H
-#define _ASM_IA64_RESOURCE_H
-
-#include <asm/ustack.h>
-#include <asm-generic/resource.h>
-
-#endif /* _ASM_IA64_RESOURCE_H */
diff --git a/arch/ia64/include/uapi/asm/rse.h b/arch/ia64/include/uapi/asm/rse.h
deleted file mode 100644
index 6d260af571c5..000000000000
--- a/arch/ia64/include/uapi/asm/rse.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_RSE_H
-#define _ASM_IA64_RSE_H
-
-/*
- * Copyright (C) 1998, 1999 Hewlett-Packard Co
- * Copyright (C) 1998, 1999 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Register stack engine related helper functions.  This file may be
- * used in applications, so be careful about the name-space and give
- * some consideration to non-GNU C compilers (though __inline__ is
- * fine).
- */
-
-static __inline__ unsigned long
-ia64_rse_slot_num (unsigned long *addr)
-{
-	return (((unsigned long) addr) >> 3) & 0x3f;
-}
-
-/*
- * Return TRUE if ADDR is the address of an RNAT slot.
- */
-static __inline__ unsigned long
-ia64_rse_is_rnat_slot (unsigned long *addr)
-{
-	return ia64_rse_slot_num(addr) == 0x3f;
-}
-
-/*
- * Returns the address of the RNAT slot that covers the slot at
- * address SLOT_ADDR.
- */
-static __inline__ unsigned long *
-ia64_rse_rnat_addr (unsigned long *slot_addr)
-{
-	return (unsigned long *) ((unsigned long) slot_addr | (0x3f << 3));
-}
-
-/*
- * Calculate the number of registers in the dirty partition starting at BSPSTORE and
- * ending at BSP.  This isn't simply (BSP-BSPSTORE)/8 because every 64th slot stores
- * ar.rnat.
- */
-static __inline__ unsigned long
-ia64_rse_num_regs (unsigned long *bspstore, unsigned long *bsp)
-{
-	unsigned long slots = (bsp - bspstore);
-
-	return slots - (ia64_rse_slot_num(bspstore) + slots)/0x40;
-}
-
-/*
- * The inverse of the above: given bspstore and the number of
- * registers, calculate ar.bsp.
- */
-static __inline__ unsigned long *
-ia64_rse_skip_regs (unsigned long *addr, long num_regs)
-{
-	long delta = ia64_rse_slot_num(addr) + num_regs;
-
-	if (num_regs < 0)
-		delta -= 0x3e;
-	return addr + num_regs + delta/0x3f;
-}
-
-#endif /* _ASM_IA64_RSE_H */
diff --git a/arch/ia64/include/uapi/asm/setup.h b/arch/ia64/include/uapi/asm/setup.h
deleted file mode 100644
index 8d13ce8fb03a..000000000000
--- a/arch/ia64/include/uapi/asm/setup.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __IA64_SETUP_H
-#define __IA64_SETUP_H
-
-#define COMMAND_LINE_SIZE	2048
-
-extern struct ia64_boot_param {
-	__u64 command_line;		/* physical address of command line arguments */
-	__u64 efi_systab;		/* physical address of EFI system table */
-	__u64 efi_memmap;		/* physical address of EFI memory map */
-	__u64 efi_memmap_size;		/* size of EFI memory map */
-	__u64 efi_memdesc_size;		/* size of an EFI memory map descriptor */
-	__u32 efi_memdesc_version;	/* memory descriptor version */
-	struct {
-		__u16 num_cols;	/* number of columns on console output device */
-		__u16 num_rows;	/* number of rows on console output device */
-		__u16 orig_x;	/* cursor's x position */
-		__u16 orig_y;	/* cursor's y position */
-	} console_info;
-	__u64 fpswa;		/* physical address of the fpswa interface */
-	__u64 initrd_start;
-	__u64 initrd_size;
-} *ia64_boot_param;
-
-#endif
diff --git a/arch/ia64/include/uapi/asm/sigcontext.h b/arch/ia64/include/uapi/asm/sigcontext.h
deleted file mode 100644
index 1bb6f0f2bd73..000000000000
--- a/arch/ia64/include/uapi/asm/sigcontext.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_SIGCONTEXT_H
-#define _ASM_IA64_SIGCONTEXT_H
-
-/*
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- * Copyright (C) 1998, 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/fpu.h>
-
-#define IA64_SC_FLAG_ONSTACK_BIT		0	/* is handler running on signal stack? */
-#define IA64_SC_FLAG_IN_SYSCALL_BIT		1	/* did signal interrupt a syscall? */
-#define IA64_SC_FLAG_FPH_VALID_BIT		2	/* is state in f[32]-f[127] valid? */
-
-#define IA64_SC_FLAG_ONSTACK		(1 << IA64_SC_FLAG_ONSTACK_BIT)
-#define IA64_SC_FLAG_IN_SYSCALL		(1 << IA64_SC_FLAG_IN_SYSCALL_BIT)
-#define IA64_SC_FLAG_FPH_VALID		(1 << IA64_SC_FLAG_FPH_VALID_BIT)
-
-# ifndef __ASSEMBLY__
-
-/*
- * Note on handling of register backing store: sc_ar_bsp contains the address that would
- * be found in ar.bsp after executing a "cover" instruction the context in which the
- * signal was raised.  If signal delivery required switching to an alternate signal stack
- * (sc_rbs_base is not NULL), the "dirty" partition (as it would exist after executing the
- * imaginary "cover" instruction) is backed by the *alternate* signal stack, not the
- * original one.  In this case, sc_rbs_base contains the base address of the new register
- * backing store.  The number of registers in the dirty partition can be calculated as:
- *
- *   ndirty = ia64_rse_num_regs(sc_rbs_base, sc_rbs_base + (sc_loadrs >> 16))
- *
- */
-
-struct sigcontext {
-	unsigned long		sc_flags;	/* see manifest constants above */
-	unsigned long		sc_nat;		/* bit i == 1 iff scratch reg gr[i] is a NaT */
-	stack_t			sc_stack;	/* previously active stack */
-
-	unsigned long		sc_ip;		/* instruction pointer */
-	unsigned long		sc_cfm;		/* current frame marker */
-	unsigned long		sc_um;		/* user mask bits */
-	unsigned long		sc_ar_rsc;	/* register stack configuration register */
-	unsigned long		sc_ar_bsp;	/* backing store pointer */
-	unsigned long		sc_ar_rnat;	/* RSE NaT collection register */
-	unsigned long		sc_ar_ccv;	/* compare and exchange compare value register */
-	unsigned long		sc_ar_unat;	/* ar.unat of interrupted context */
-	unsigned long		sc_ar_fpsr;	/* floating-point status register */
-	unsigned long		sc_ar_pfs;	/* previous function state */
-	unsigned long		sc_ar_lc;	/* loop count register */
-	unsigned long		sc_pr;		/* predicate registers */
-	unsigned long		sc_br[8];	/* branch registers */
-	/* Note: sc_gr[0] is used as the "uc_link" member of ucontext_t */
-	unsigned long		sc_gr[32];	/* general registers (static partition) */
-	struct ia64_fpreg	sc_fr[128];	/* floating-point registers */
-
-	unsigned long		sc_rbs_base;	/* NULL or new base of sighandler's rbs */
-	unsigned long		sc_loadrs;	/* see description above */
-
-	unsigned long		sc_ar25;	/* cmp8xchg16 uses this */
-	unsigned long		sc_ar26;	/* rsvd for scratch use */
-	unsigned long		sc_rsvd[12];	/* reserved for future use */
-	/*
-	 * The mask must come last so we can increase _NSIG_WORDS
-	 * without breaking binary compatibility.
-	 */
-	sigset_t		sc_mask;	/* signal mask to restore after handler returns */
-};
-
-# endif /* __ASSEMBLY__ */
-#endif /* _ASM_IA64_SIGCONTEXT_H */
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h
deleted file mode 100644
index 796af1ccaa7e..000000000000
--- a/arch/ia64/include/uapi/asm/siginfo.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Based on <asm-i386/siginfo.h>.
- *
- * Modified 1998-2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_SIGINFO_H
-#define _UAPI_ASM_IA64_SIGINFO_H
-
-
-#include <asm-generic/siginfo.h>
-
-#define si_imm		_sifields._sigfault._imm	/* as per UNIX SysV ABI spec */
-#define si_flags	_sifields._sigfault._flags
-/*
- * si_isr is valid for SIGILL, SIGFPE, SIGSEGV, SIGBUS, and SIGTRAP provided that
- * si_code is non-zero and __ISR_VALID is set in si_flags.
- */
-#define si_isr		_sifields._sigfault._isr
-
-/*
- * Flag values for si_flags:
- */
-#define __ISR_VALID_BIT	0
-#define __ISR_VALID	(1 << __ISR_VALID_BIT)
-
-#endif /* _UAPI_ASM_IA64_SIGINFO_H */
diff --git a/arch/ia64/include/uapi/asm/signal.h b/arch/ia64/include/uapi/asm/signal.h
deleted file mode 100644
index 63d574e802a2..000000000000
--- a/arch/ia64/include/uapi/asm/signal.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Modified 1998-2001, 2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- *
- * Unfortunately, this file is being included by bits/signal.h in
- * glibc-2.x.  Hence the #ifdef __KERNEL__ ugliness.
- */
-#ifndef _UAPI_ASM_IA64_SIGNAL_H
-#define _UAPI_ASM_IA64_SIGNAL_H
-
-
-#define SIGHUP		 1
-#define SIGINT		 2
-#define SIGQUIT		 3
-#define SIGILL		 4
-#define SIGTRAP		 5
-#define SIGABRT		 6
-#define SIGIOT		 6
-#define SIGBUS		 7
-#define SIGFPE		 8
-#define SIGKILL		 9
-#define SIGUSR1		10
-#define SIGSEGV		11
-#define SIGUSR2		12
-#define SIGPIPE		13
-#define SIGALRM		14
-#define SIGTERM		15
-#define SIGSTKFLT	16
-#define SIGCHLD		17
-#define SIGCONT		18
-#define SIGSTOP		19
-#define SIGTSTP		20
-#define SIGTTIN		21
-#define SIGTTOU		22
-#define SIGURG		23
-#define SIGXCPU		24
-#define SIGXFSZ		25
-#define SIGVTALRM	26
-#define SIGPROF		27
-#define SIGWINCH	28
-#define SIGIO		29
-#define SIGPOLL		SIGIO
-/*
-#define SIGLOST		29
-*/
-#define SIGPWR		30
-#define SIGSYS		31
-/* signal 31 is no longer "unused", but the SIGUNUSED macro remains for backwards compatibility */
-#define	SIGUNUSED	31
-
-/* These should not be considered constants from userland.  */
-#define SIGRTMIN	32
-#define SIGRTMAX	_NSIG
-
-#define SA_RESTORER	0x04000000
-
-/*
- * The minimum stack size needs to be fairly large because we want to
- * be sure that an app compiled for today's CPUs will continue to run
- * on all future CPU models.  The CPU model matters because the signal
- * frame needs to have space for the complete machine state, including
- * all physical stacked registers.  The number of physical stacked
- * registers is CPU model dependent, but given that the width of
- * ar.rsc.loadrs is 14 bits, we can assume that they'll never take up
- * more than 16KB of space.
- */
-#if 1
-  /*
-   * This is a stupid typo: the value was _meant_ to be 131072 (0x20000), but I typed it
-   * in wrong. ;-(  To preserve backwards compatibility, we leave the kernel at the
-   * incorrect value and fix libc only.
-   */
-# define MINSIGSTKSZ	131027	/* min. stack size for sigaltstack() */
-#else
-# define MINSIGSTKSZ	131072	/* min. stack size for sigaltstack() */
-#endif
-#define SIGSTKSZ	262144	/* default stack size for sigaltstack() */
-
-
-#include <asm-generic/signal-defs.h>
-
-# ifndef __ASSEMBLY__
-
-#  include <linux/types.h>
-
-/* Avoid too many header ordering problems.  */
-struct siginfo;
-
-typedef struct sigaltstack {
-	void __user *ss_sp;
-	int ss_flags;
-	__kernel_size_t ss_size;
-} stack_t;
-
-
-# endif /* !__ASSEMBLY__ */
-#endif /* _UAPI_ASM_IA64_SIGNAL_H */
diff --git a/arch/ia64/include/uapi/asm/stat.h b/arch/ia64/include/uapi/asm/stat.h
deleted file mode 100644
index 3265ed5aac0f..000000000000
--- a/arch/ia64/include/uapi/asm/stat.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_STAT_H
-#define _ASM_IA64_STAT_H
-
-/*
- * Modified 1998, 1999
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-struct stat {
-	unsigned long	st_dev;
-	unsigned long	st_ino;
-	unsigned long	st_nlink;
-	unsigned int	st_mode;
-	unsigned int	st_uid;
-	unsigned int	st_gid;
-	unsigned int	__pad0;
-	unsigned long	st_rdev;
-	unsigned long	st_size;
-	unsigned long	st_atime;
-	unsigned long	st_atime_nsec;
-	unsigned long	st_mtime;
-	unsigned long	st_mtime_nsec;
-	unsigned long	st_ctime;
-	unsigned long	st_ctime_nsec;
-	unsigned long	st_blksize;
-	long		st_blocks;
-	unsigned long	__unused[3];
-};
-
-#define STAT_HAVE_NSEC 1
-
-struct ia64_oldstat {
-	unsigned int	st_dev;
-	unsigned int	st_ino;
-	unsigned int	st_mode;
-	unsigned int	st_nlink;
-	unsigned int	st_uid;
-	unsigned int	st_gid;
-	unsigned int	st_rdev;
-	unsigned int	__pad1;
-	unsigned long	st_size;
-	unsigned long	st_atime;
-	unsigned long	st_mtime;
-	unsigned long	st_ctime;
-	unsigned int	st_blksize;
-	int		st_blocks;
-	unsigned int	__unused1;
-	unsigned int	__unused2;
-};
-
-#endif /* _ASM_IA64_STAT_H */
diff --git a/arch/ia64/include/uapi/asm/statfs.h b/arch/ia64/include/uapi/asm/statfs.h
deleted file mode 100644
index de3bae4f137d..000000000000
--- a/arch/ia64/include/uapi/asm/statfs.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_STATFS_H
-#define _ASM_IA64_STATFS_H
-
-/*
- * Based on <asm-i386/statfs.h>.
- *
- * Modified 1998, 1999, 2003
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-
-/*
- * We need compat_statfs64 to be packed, because the i386 ABI won't
- * add padding at the end to bring it to a multiple of 8 bytes, but
- * the IA64 ABI will.
- */
-#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
-
-#include <asm-generic/statfs.h>
-
-#endif /* _ASM_IA64_STATFS_H */
diff --git a/arch/ia64/include/uapi/asm/swab.h b/arch/ia64/include/uapi/asm/swab.h
deleted file mode 100644
index 79f3fef1a05e..000000000000
--- a/arch/ia64/include/uapi/asm/swab.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_SWAB_H
-#define _ASM_IA64_SWAB_H
-
-/*
- * Modified 1998, 1999
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co.
- */
-
-#include <linux/types.h>
-#include <asm/intrinsics.h>
-#include <linux/compiler.h>
-
-static __inline__ __attribute_const__ __u64 __arch_swab64(__u64 x)
-{
-	__u64 result;
-
-	result = ia64_mux1(x, ia64_mux1_rev);
-	return result;
-}
-#define __arch_swab64 __arch_swab64
-
-static __inline__ __attribute_const__ __u32 __arch_swab32(__u32 x)
-{
-	return __arch_swab64(x) >> 32;
-}
-#define __arch_swab32 __arch_swab32
-
-static __inline__ __attribute_const__ __u16 __arch_swab16(__u16 x)
-{
-	return __arch_swab64(x) >> 48;
-}
-#define __arch_swab16 __arch_swab16
-
-#endif /* _ASM_IA64_SWAB_H */
diff --git a/arch/ia64/include/uapi/asm/types.h b/arch/ia64/include/uapi/asm/types.h
deleted file mode 100644
index 2000de474be6..000000000000
--- a/arch/ia64/include/uapi/asm/types.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * This file is never included by application software unless explicitly
- * requested (e.g., via linux/types.h) in which case the application is
- * Linux specific so (user-) name space pollution is not a major issue.
- * However, for interoperability, libraries still need to be careful to
- * avoid naming clashes.
- *
- * Based on <asm-alpha/types.h>.
- *
- * Modified 1998-2000, 2002
- *	David Mosberger-Tang <davidm@hpl.hp.com>, Hewlett-Packard Co
- */
-#ifndef _UAPI_ASM_IA64_TYPES_H
-#define _UAPI_ASM_IA64_TYPES_H
-
-
-#ifndef __KERNEL__
-#include <asm-generic/int-l64.h>
-#endif
-
-#ifdef __ASSEMBLY__
-# define __IA64_UL(x)		(x)
-# define __IA64_UL_CONST(x)	x
-
-#else
-# define __IA64_UL(x)		((unsigned long)(x))
-# define __IA64_UL_CONST(x)	x##UL
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* _UAPI_ASM_IA64_TYPES_H */
diff --git a/arch/ia64/include/uapi/asm/ucontext.h b/arch/ia64/include/uapi/asm/ucontext.h
deleted file mode 100644
index 46f51e535e04..000000000000
--- a/arch/ia64/include/uapi/asm/ucontext.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_IA64_UCONTEXT_H
-#define _ASM_IA64_UCONTEXT_H
-
-struct ucontext {
-	struct sigcontext uc_mcontext;
-};
-
-#define uc_link		uc_mcontext.sc_gr[0]	/* wrong type; nobody cares */
-#define uc_sigmask	uc_mcontext.sc_sigmask
-#define uc_stack	uc_mcontext.sc_stack
-
-#endif /* _ASM_IA64_UCONTEXT_H */
diff --git a/arch/ia64/include/uapi/asm/unistd.h b/arch/ia64/include/uapi/asm/unistd.h
deleted file mode 100644
index 013e0bcacc39..000000000000
--- a/arch/ia64/include/uapi/asm/unistd.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * IA-64 Linux syscall numbers and inline-functions.
- *
- * Copyright (C) 1998-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#ifndef _UAPI_ASM_IA64_UNISTD_H
-#define _UAPI_ASM_IA64_UNISTD_H
-
-
-#include <asm/break.h>
-
-#define __BREAK_SYSCALL	__IA64_BREAK_SYSCALL
-
-#define __NR_Linux      1024
-
-#define __NR_umount __NR_umount2
-
-#include <asm/unistd_64.h>
-
-#endif /* _UAPI_ASM_IA64_UNISTD_H */
diff --git a/arch/ia64/include/uapi/asm/ustack.h b/arch/ia64/include/uapi/asm/ustack.h
deleted file mode 100644
index 703cc5f546ff..000000000000
--- a/arch/ia64/include/uapi/asm/ustack.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_ASM_IA64_USTACK_H
-#define _UAPI_ASM_IA64_USTACK_H
-
-/*
- * Constants for the user stack size
- */
-
-
-/* Make a default stack size of 2GiB */
-#define DEFAULT_USER_STACK_SIZE	(1UL << 31)
-
-#endif /* _UAPI_ASM_IA64_USTACK_H */
diff --git a/arch/ia64/install.sh b/arch/ia64/install.sh
deleted file mode 100755
index 2d4b66a9f362..000000000000
--- a/arch/ia64/install.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1995 by Linus Torvalds
-#
-# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
-#
-# "make install" script for ia64 architecture
-#
-# Arguments:
-#   $1 - kernel version
-#   $2 - kernel image file
-#   $3 - kernel map file
-#   $4 - default install path (blank if root directory)
-
-if [ -f $4/vmlinuz ]; then
-	mv $4/vmlinuz $4/vmlinuz.old
-fi
-
-if [ -f $4/System.map ]; then
-	mv $4/System.map $4/System.old
-fi
-
-cat $2 > $4/vmlinuz
-cp $3 $4/System.map
-
-test -x /usr/sbin/elilo && /usr/sbin/elilo
diff --git a/arch/ia64/kernel/.gitignore b/arch/ia64/kernel/.gitignore
deleted file mode 100644
index 0374827206e7..000000000000
--- a/arch/ia64/kernel/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-gate.lds
-vmlinux.lds
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
deleted file mode 100644
index d7e1cabee2ec..000000000000
--- a/arch/ia64/kernel/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux kernel.
-#
-
-ifdef CONFIG_DYNAMIC_FTRACE
-CFLAGS_REMOVE_ftrace.o = -pg
-endif
-
-extra-y	:= vmlinux.lds
-
-obj-y := head.o entry.o efi.o efi_stub.o gate-data.o fsys.o irq.o irq_ia64.o	\
-	 irq_lsapic.o ivt.o pal.o patch.o process.o ptrace.o sal.o		\
-	 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
-	 unwind.o mca.o mca_asm.o topology.o dma-mapping.o iosapic.o acpi.o \
-	 acpi-ext.o
-
-obj-$(CONFIG_IA64_BRL_EMU)	+= brl_emu.o
-
-obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
-obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o
-obj-$(CONFIG_NUMA)		+= numa.o
-obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
-obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
-obj-$(CONFIG_KPROBES)		+= kprobes.o
-obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o crash.o
-obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
-obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR)	+= uncached.o
-obj-$(CONFIG_AUDIT)		+= audit.o
-obj-y				+= msi_ia64.o
-mca_recovery-y			+= mca_drv.o mca_drv_asm.o
-obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
-
-obj-$(CONFIG_IA64_ESI)		+= esi.o esi_stub.o # must be in kernel proper
-obj-$(CONFIG_INTEL_IOMMU)	+= pci-dma.o
-
-obj-$(CONFIG_ELF_CORE)		+= elfcore.o
-
-# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
-CFLAGS_traps.o  += -mfixed-range=f2-f5,f16-f31
-
-# The gate DSO image is built using a special linker script.
-include $(srctree)/$(src)/Makefile.gate
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
deleted file mode 100644
index 846867bff6d6..000000000000
--- a/arch/ia64/kernel/Makefile.gate
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# The gate DSO image is built using a special linker script.
-
-targets += gate.so gate.lds gate.o gate-dummy.o
-
-obj-y += gate-syms.o
-
-CPPFLAGS_gate.lds := -P -C -U$(ARCH)
-
-quiet_cmd_gate = GATE    $@
-      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
-		     -Wl,--hash-style=sysv
-$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-GATECFLAGS_gate-dummy.o = -r
-$(obj)/gate-dummy.o: $(obj)/gate.lds $(obj)/gate.o FORCE
-	$(call if_changed,gate)
-
-LDFLAGS_gate-syms.o := -r -R
-$(obj)/gate-syms.o: $(obj)/gate-dummy.o FORCE
-	$(call if_changed,ld)
-
-# gate-data.o contains the gate DSO image as data in section .data..gate.
-# We must build gate.so before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c
deleted file mode 100644
index 42cd21480833..000000000000
--- a/arch/ia64/kernel/acpi-ext.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P.
- *	Alex Williamson <alex.williamson@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/acpi.h>
-
-#include <asm/acpi-ext.h>
-
-/*
- * Device CSRs that do not appear in PCI config space should be described
- * via ACPI.  This would normally be done with Address Space Descriptors
- * marked as "consumer-only," but old versions of Windows and Linux ignore
- * the producer/consumer flag, so HP invented a vendor-defined resource to
- * describe the location and size of CSR space.
- */
-
-struct acpi_vendor_uuid hp_ccsr_uuid = {
-	.subtype = 2,
-	.data = { 0xf9, 0xad, 0xe9, 0x69, 0x4f, 0x92, 0x5f, 0xab, 0xf6, 0x4a,
-	    0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad },
-};
-
-static acpi_status hp_ccsr_locate(acpi_handle obj, u64 *base, u64 *length)
-{
-	acpi_status status;
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	struct acpi_resource *resource;
-	struct acpi_resource_vendor_typed *vendor;
-
-	status = acpi_get_vendor_resource(obj, METHOD_NAME__CRS, &hp_ccsr_uuid,
-		&buffer);
-
-	resource = buffer.pointer;
-	vendor = &resource->data.vendor_typed;
-
-	if (ACPI_FAILURE(status) || vendor->byte_length < 16) {
-		status = AE_NOT_FOUND;
-		goto exit;
-	}
-
-	memcpy(base, vendor->byte_data, sizeof(*base));
-	memcpy(length, vendor->byte_data + 8, sizeof(*length));
-
-  exit:
-	kfree(buffer.pointer);
-	return status;
-}
-
-struct csr_space {
-	u64	base;
-	u64	length;
-};
-
-static acpi_status find_csr_space(struct acpi_resource *resource, void *data)
-{
-	struct csr_space *space = data;
-	struct acpi_resource_address64 addr;
-	acpi_status status;
-
-	status = acpi_resource_to_address64(resource, &addr);
-	if (ACPI_SUCCESS(status) &&
-	    addr.resource_type == ACPI_MEMORY_RANGE &&
-	    addr.address.address_length &&
-	    addr.producer_consumer == ACPI_CONSUMER) {
-		space->base = addr.address.minimum;
-		space->length = addr.address.address_length;
-		return AE_CTRL_TERMINATE;
-	}
-	return AE_OK;		/* keep looking */
-}
-
-static acpi_status hp_crs_locate(acpi_handle obj, u64 *base, u64 *length)
-{
-	struct csr_space space = { 0, 0 };
-
-	acpi_walk_resources(obj, METHOD_NAME__CRS, find_csr_space, &space);
-	if (!space.length)
-		return AE_NOT_FOUND;
-
-	*base = space.base;
-	*length = space.length;
-	return AE_OK;
-}
-
-acpi_status hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
-{
-	acpi_status status;
-
-	status = hp_ccsr_locate(obj, csr_base, csr_length);
-	if (ACPI_SUCCESS(status))
-		return status;
-
-	return hp_crs_locate(obj, csr_base, csr_length);
-}
-EXPORT_SYMBOL(hp_acpi_csr_space);
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
deleted file mode 100644
index 41e8fe55cd98..000000000000
--- a/arch/ia64/kernel/acpi.c
+++ /dev/null
@@ -1,913 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  acpi.c - Architecture-Specific Low-Level ACPI Support
- *
- *  Copyright (C) 1999 VA Linux Systems
- *  Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
- *  Copyright (C) 2000, 2002-2003 Hewlett-Packard Co.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *  Copyright (C) 2000 Intel Corp.
- *  Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
- *  Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
- *  Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
- *  Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
- *  Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com>
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/irq.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/mmzone.h>
-#include <linux/nodemask.h>
-#include <linux/slab.h>
-#include <acpi/processor.h>
-#include <asm/io.h>
-#include <asm/iosapic.h>
-#include <asm/page.h>
-#include <asm/numa.h>
-#include <asm/sal.h>
-#include <asm/cyclone.h>
-
-#define PREFIX			"ACPI: "
-
-int acpi_lapic;
-unsigned int acpi_cpei_override;
-unsigned int acpi_cpei_phys_cpuid;
-
-#define ACPI_MAX_PLATFORM_INTERRUPTS	256
-
-/* Array to record platform interrupt vectors for generic interrupt routing. */
-int platform_intr_list[ACPI_MAX_PLATFORM_INTERRUPTS] = {
-	[0 ... ACPI_MAX_PLATFORM_INTERRUPTS - 1] = -1
-};
-
-enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_IOSAPIC;
-
-/*
- * Interrupt routing API for device drivers.  Provides interrupt vector for
- * a generic platform event.  Currently only CPEI is implemented.
- */
-int acpi_request_vector(u32 int_type)
-{
-	int vector = -1;
-
-	if (int_type < ACPI_MAX_PLATFORM_INTERRUPTS) {
-		/* corrected platform error interrupt */
-		vector = platform_intr_list[int_type];
-	} else
-		printk(KERN_ERR
-		       "acpi_request_vector(): invalid interrupt type\n");
-	return vector;
-}
-
-void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
-{
-	return __va(phys);
-}
-
-void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
-{
-}
-
-/* --------------------------------------------------------------------------
-                            Boot-time Table Parsing
-   -------------------------------------------------------------------------- */
-
-static int available_cpus __initdata;
-struct acpi_table_madt *acpi_madt __initdata;
-static u8 has_8259;
-
-static int __init
-acpi_parse_lapic_addr_ovr(union acpi_subtable_headers * header,
-			  const unsigned long end)
-{
-	struct acpi_madt_local_apic_override *lapic;
-
-	lapic = (struct acpi_madt_local_apic_override *)header;
-
-	if (BAD_MADT_ENTRY(lapic, end))
-		return -EINVAL;
-
-	if (lapic->address) {
-		iounmap(ipi_base_addr);
-		ipi_base_addr = ioremap(lapic->address, 0);
-	}
-	return 0;
-}
-
-static int __init
-acpi_parse_lsapic(union acpi_subtable_headers *header, const unsigned long end)
-{
-	struct acpi_madt_local_sapic *lsapic;
-
-	lsapic = (struct acpi_madt_local_sapic *)header;
-
-	/*Skip BAD_MADT_ENTRY check, as lsapic size could vary */
-
-	if (lsapic->lapic_flags & ACPI_MADT_ENABLED) {
-#ifdef CONFIG_SMP
-		smp_boot_data.cpu_phys_id[available_cpus] =
-		    (lsapic->id << 8) | lsapic->eid;
-#endif
-		++available_cpus;
-	}
-
-	total_cpus++;
-	return 0;
-}
-
-static int __init
-acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long end)
-{
-	struct acpi_madt_local_apic_nmi *lacpi_nmi;
-
-	lacpi_nmi = (struct acpi_madt_local_apic_nmi *)header;
-
-	if (BAD_MADT_ENTRY(lacpi_nmi, end))
-		return -EINVAL;
-
-	/* TBD: Support lapic_nmi entries */
-	return 0;
-}
-
-static int __init
-acpi_parse_iosapic(union acpi_subtable_headers * header, const unsigned long end)
-{
-	struct acpi_madt_io_sapic *iosapic;
-
-	iosapic = (struct acpi_madt_io_sapic *)header;
-
-	if (BAD_MADT_ENTRY(iosapic, end))
-		return -EINVAL;
-
-	return iosapic_init(iosapic->address, iosapic->global_irq_base);
-}
-
-static unsigned int __initdata acpi_madt_rev;
-
-static int __init
-acpi_parse_plat_int_src(union acpi_subtable_headers * header,
-			const unsigned long end)
-{
-	struct acpi_madt_interrupt_source *plintsrc;
-	int vector;
-
-	plintsrc = (struct acpi_madt_interrupt_source *)header;
-
-	if (BAD_MADT_ENTRY(plintsrc, end))
-		return -EINVAL;
-
-	/*
-	 * Get vector assignment for this interrupt, set attributes,
-	 * and program the IOSAPIC routing table.
-	 */
-	vector = iosapic_register_platform_intr(plintsrc->type,
-						plintsrc->global_irq,
-						plintsrc->io_sapic_vector,
-						plintsrc->eid,
-						plintsrc->id,
-						((plintsrc->inti_flags & ACPI_MADT_POLARITY_MASK) ==
-						 ACPI_MADT_POLARITY_ACTIVE_HIGH) ?
-						IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW,
-						((plintsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) ==
-						 ACPI_MADT_TRIGGER_EDGE) ?
-						IOSAPIC_EDGE : IOSAPIC_LEVEL);
-
-	platform_intr_list[plintsrc->type] = vector;
-	if (acpi_madt_rev > 1) {
-		acpi_cpei_override = plintsrc->flags & ACPI_MADT_CPEI_OVERRIDE;
-	}
-
-	/*
-	 * Save the physical id, so we can check when its being removed
-	 */
-	acpi_cpei_phys_cpuid = ((plintsrc->id << 8) | (plintsrc->eid)) & 0xffff;
-
-	return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-unsigned int can_cpei_retarget(void)
-{
-	extern int cpe_vector;
-	extern unsigned int force_cpei_retarget;
-
-	/*
-	 * Only if CPEI is supported and the override flag
-	 * is present, otherwise return that its re-targettable
-	 * if we are in polling mode.
-	 */
-	if (cpe_vector > 0) {
-		if (acpi_cpei_override || force_cpei_retarget)
-			return 1;
-		else
-			return 0;
-	}
-	return 1;
-}
-
-unsigned int is_cpu_cpei_target(unsigned int cpu)
-{
-	unsigned int logical_id;
-
-	logical_id = cpu_logical_id(acpi_cpei_phys_cpuid);
-
-	if (logical_id == cpu)
-		return 1;
-	else
-		return 0;
-}
-
-void set_cpei_target_cpu(unsigned int cpu)
-{
-	acpi_cpei_phys_cpuid = cpu_physical_id(cpu);
-}
-#endif
-
-unsigned int get_cpei_target_cpu(void)
-{
-	return acpi_cpei_phys_cpuid;
-}
-
-static int __init
-acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
-		       const unsigned long end)
-{
-	struct acpi_madt_interrupt_override *p;
-
-	p = (struct acpi_madt_interrupt_override *)header;
-
-	if (BAD_MADT_ENTRY(p, end))
-		return -EINVAL;
-
-	iosapic_override_isa_irq(p->source_irq, p->global_irq,
-				 ((p->inti_flags & ACPI_MADT_POLARITY_MASK) ==
-				  ACPI_MADT_POLARITY_ACTIVE_LOW) ?
-				 IOSAPIC_POL_LOW : IOSAPIC_POL_HIGH,
-				 ((p->inti_flags & ACPI_MADT_TRIGGER_MASK) ==
-				 ACPI_MADT_TRIGGER_LEVEL) ?
-				 IOSAPIC_LEVEL : IOSAPIC_EDGE);
-	return 0;
-}
-
-static int __init
-acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end)
-{
-	struct acpi_madt_nmi_source *nmi_src;
-
-	nmi_src = (struct acpi_madt_nmi_source *)header;
-
-	if (BAD_MADT_ENTRY(nmi_src, end))
-		return -EINVAL;
-
-	/* TBD: Support nimsrc entries */
-	return 0;
-}
-
-static void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
-	if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERMOW", 6))) {
-
-		/*
-		 * Unfortunately ITC_DRIFT is not yet part of the
-		 * official SAL spec, so the ITC_DRIFT bit is not
-		 * set by the BIOS on this hardware.
-		 */
-		sal_platform_features |= IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT;
-
-		cyclone_setup();
-	}
-}
-
-static int __init acpi_parse_madt(struct acpi_table_header *table)
-{
-	acpi_madt = (struct acpi_table_madt *)table;
-
-	acpi_madt_rev = acpi_madt->header.revision;
-
-	/* remember the value for reference after free_initmem() */
-#ifdef CONFIG_ITANIUM
-	has_8259 = 1;		/* Firmware on old Itanium systems is broken */
-#else
-	has_8259 = acpi_madt->flags & ACPI_MADT_PCAT_COMPAT;
-#endif
-	iosapic_system_init(has_8259);
-
-	/* Get base address of IPI Message Block */
-
-	if (acpi_madt->address)
-		ipi_base_addr = ioremap(acpi_madt->address, 0);
-
-	printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr);
-
-	acpi_madt_oem_check(acpi_madt->header.oem_id,
-			    acpi_madt->header.oem_table_id);
-
-	return 0;
-}
-
-#ifdef CONFIG_ACPI_NUMA
-
-#undef SLIT_DEBUG
-
-#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
-
-static int __initdata srat_num_cpus;	/* number of cpus */
-static u32 pxm_flag[PXM_FLAG_LEN];
-#define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
-#define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
-static struct acpi_table_slit __initdata *slit_table;
-cpumask_t early_cpu_possible_map = CPU_MASK_NONE;
-
-static int __init
-get_processor_proximity_domain(struct acpi_srat_cpu_affinity *pa)
-{
-	int pxm;
-
-	pxm = pa->proximity_domain_lo;
-	if (acpi_srat_revision >= 2)
-		pxm += pa->proximity_domain_hi[0] << 8;
-	return pxm;
-}
-
-static int __init
-get_memory_proximity_domain(struct acpi_srat_mem_affinity *ma)
-{
-	int pxm;
-
-	pxm = ma->proximity_domain;
-	if (acpi_srat_revision <= 1)
-		pxm &= 0xff;
-
-	return pxm;
-}
-
-/*
- * ACPI 2.0 SLIT (System Locality Information Table)
- * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
- */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
-	u32 len;
-
-	len = sizeof(struct acpi_table_header) + 8
-	    + slit->locality_count * slit->locality_count;
-	if (slit->header.length != len) {
-		printk(KERN_ERR
-		       "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
-		       len, slit->header.length);
-		return;
-	}
-	slit_table = slit;
-}
-
-void __init
-acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
-{
-	int pxm;
-
-	if (!(pa->flags & ACPI_SRAT_CPU_ENABLED))
-		return;
-
-	if (srat_num_cpus >= ARRAY_SIZE(node_cpuid)) {
-		printk_once(KERN_WARNING
-			    "node_cpuid[%ld] is too small, may not be able to use all cpus\n",
-			    ARRAY_SIZE(node_cpuid));
-		return;
-	}
-	pxm = get_processor_proximity_domain(pa);
-
-	/* record this node in proximity bitmap */
-	pxm_bit_set(pxm);
-
-	node_cpuid[srat_num_cpus].phys_id =
-	    (pa->apic_id << 8) | (pa->local_sapic_eid);
-	/* nid should be overridden as logical node id later */
-	node_cpuid[srat_num_cpus].nid = pxm;
-	cpumask_set_cpu(srat_num_cpus, &early_cpu_possible_map);
-	srat_num_cpus++;
-}
-
-int __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
-{
-	unsigned long paddr, size;
-	int pxm;
-	struct node_memblk_s *p, *q, *pend;
-
-	pxm = get_memory_proximity_domain(ma);
-
-	/* fill node memory chunk structure */
-	paddr = ma->base_address;
-	size = ma->length;
-
-	/* Ignore disabled entries */
-	if (!(ma->flags & ACPI_SRAT_MEM_ENABLED))
-		return -1;
-
-	if (num_node_memblks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: too many memblk ranges\n");
-		return -EINVAL;
-	}
-
-	/* record this node in proximity bitmap */
-	pxm_bit_set(pxm);
-
-	/* Insertion sort based on base address */
-	pend = &node_memblk[num_node_memblks];
-	for (p = &node_memblk[0]; p < pend; p++) {
-		if (paddr < p->start_paddr)
-			break;
-	}
-	if (p < pend) {
-		for (q = pend - 1; q >= p; q--)
-			*(q + 1) = *q;
-	}
-	p->start_paddr = paddr;
-	p->size = size;
-	p->nid = pxm;
-	num_node_memblks++;
-	return 0;
-}
-
-void __init acpi_numa_fixup(void)
-{
-	int i, j, node_from, node_to;
-
-	/* If there's no SRAT, fix the phys_id and mark node 0 online */
-	if (srat_num_cpus == 0) {
-		node_set_online(0);
-		node_cpuid[0].phys_id = hard_smp_processor_id();
-		slit_distance(0, 0) = LOCAL_DISTANCE;
-		goto out;
-	}
-
-	/*
-	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
-	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
-	 */
-	nodes_clear(node_online_map);
-	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
-		if (pxm_bit_test(i)) {
-			int nid = acpi_map_pxm_to_node(i);
-			node_set_online(nid);
-		}
-	}
-
-	/* set logical node id in memory chunk structure */
-	for (i = 0; i < num_node_memblks; i++)
-		node_memblk[i].nid = pxm_to_node(node_memblk[i].nid);
-
-	/* assign memory bank numbers for each chunk on each node */
-	for_each_online_node(i) {
-		int bank;
-
-		bank = 0;
-		for (j = 0; j < num_node_memblks; j++)
-			if (node_memblk[j].nid == i)
-				node_memblk[j].bank = bank++;
-	}
-
-	/* set logical node id in cpu structure */
-	for_each_possible_early_cpu(i)
-		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
-
-	printk(KERN_INFO "Number of logical nodes in system = %d\n",
-	       num_online_nodes());
-	printk(KERN_INFO "Number of memory chunks in system = %d\n",
-	       num_node_memblks);
-
-	if (!slit_table) {
-		for (i = 0; i < MAX_NUMNODES; i++)
-			for (j = 0; j < MAX_NUMNODES; j++)
-				slit_distance(i, j) = i == j ?
-					LOCAL_DISTANCE : REMOTE_DISTANCE;
-		goto out;
-	}
-
-	memset(numa_slit, -1, sizeof(numa_slit));
-	for (i = 0; i < slit_table->locality_count; i++) {
-		if (!pxm_bit_test(i))
-			continue;
-		node_from = pxm_to_node(i);
-		for (j = 0; j < slit_table->locality_count; j++) {
-			if (!pxm_bit_test(j))
-				continue;
-			node_to = pxm_to_node(j);
-			slit_distance(node_from, node_to) =
-			    slit_table->entry[i * slit_table->locality_count + j];
-		}
-	}
-
-#ifdef SLIT_DEBUG
-	printk("ACPI 2.0 SLIT locality table:\n");
-	for_each_online_node(i) {
-		for_each_online_node(j)
-		    printk("%03d ", node_distance(i, j));
-		printk("\n");
-	}
-#endif
-out:
-	node_possible_map = node_online_map;
-}
-#endif				/* CONFIG_ACPI_NUMA */
-
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
-{
-	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
-		return gsi;
-
-	if (has_8259 && gsi < 16)
-		return isa_irq_to_vector(gsi);
-
-	return iosapic_register_intr(gsi,
-				     (polarity ==
-				      ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH :
-				     IOSAPIC_POL_LOW,
-				     (triggering ==
-				      ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE :
-				     IOSAPIC_LEVEL);
-}
-EXPORT_SYMBOL_GPL(acpi_register_gsi);
-
-void acpi_unregister_gsi(u32 gsi)
-{
-	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
-		return;
-
-	if (has_8259 && gsi < 16)
-		return;
-
-	iosapic_unregister_intr(gsi);
-}
-EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
-
-static int __init acpi_parse_fadt(struct acpi_table_header *table)
-{
-	struct acpi_table_header *fadt_header;
-	struct acpi_table_fadt *fadt;
-
-	fadt_header = (struct acpi_table_header *)table;
-	if (fadt_header->revision != 3)
-		return -ENODEV;	/* Only deal with ACPI 2.0 FADT */
-
-	fadt = (struct acpi_table_fadt *)fadt_header;
-
-	acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
-				 ACPI_ACTIVE_LOW);
-	return 0;
-}
-
-int __init early_acpi_boot_init(void)
-{
-	int ret;
-
-	/*
-	 * do a partial walk of MADT to determine how many CPUs
-	 * we have including offline CPUs
-	 */
-	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
-		printk(KERN_ERR PREFIX "Can't find MADT\n");
-		return 0;
-	}
-
-	ret = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-		acpi_parse_lsapic, NR_CPUS);
-	if (ret < 1)
-		printk(KERN_ERR PREFIX
-		       "Error parsing MADT - no LAPIC entries\n");
-	else
-		acpi_lapic = 1;
-
-#ifdef CONFIG_SMP
-	if (available_cpus == 0) {
-		printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n");
-		printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id());
-		smp_boot_data.cpu_phys_id[available_cpus] =
-		    hard_smp_processor_id();
-		available_cpus = 1;	/* We've got at least one of these, no? */
-	}
-	smp_boot_data.cpu_count = available_cpus;
-#endif
-	/* Make boot-up look pretty */
-	printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus,
-	       total_cpus);
-
-	return 0;
-}
-
-int __init acpi_boot_init(void)
-{
-
-	/*
-	 * MADT
-	 * ----
-	 * Parse the Multiple APIC Description Table (MADT), if exists.
-	 * Note that this table provides platform SMP configuration
-	 * information -- the successor to MPS tables.
-	 */
-
-	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
-		printk(KERN_ERR PREFIX "Can't find MADT\n");
-		goto skip_madt;
-	}
-
-	/* Local APIC */
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0) < 0)
-		printk(KERN_ERR PREFIX
-		       "Error parsing LAPIC address override entry\n");
-
-	if (acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0)
-	    < 0)
-		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
-
-	/* I/O APIC */
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_IO_SAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) {
-		printk(KERN_ERR PREFIX
-		       "Error parsing MADT - no IOSAPIC entries\n");
-	}
-
-	/* System-Level Interrupt Routing */
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_INTERRUPT_SOURCE, acpi_parse_plat_int_src,
-	     ACPI_MAX_PLATFORM_INTERRUPTS) < 0)
-		printk(KERN_ERR PREFIX
-		       "Error parsing platform interrupt source entry\n");
-
-	if (acpi_table_parse_madt
-	    (ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 0) < 0)
-		printk(KERN_ERR PREFIX
-		       "Error parsing interrupt source overrides entry\n");
-
-	if (acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 0) < 0)
-		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
-      skip_madt:
-
-	/*
-	 * FADT says whether a legacy keyboard controller is present.
-	 * The FADT also contains an SCI_INT line, by which the system
-	 * gets interrupts such as power and sleep buttons.  If it's not
-	 * on a Legacy interrupt, it needs to be setup.
-	 */
-	if (acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt))
-		printk(KERN_ERR PREFIX "Can't find FADT\n");
-
-#ifdef CONFIG_ACPI_NUMA
-#ifdef CONFIG_SMP
-	if (srat_num_cpus == 0) {
-		int cpu, i = 1;
-		for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++)
-			if (smp_boot_data.cpu_phys_id[cpu] !=
-			    hard_smp_processor_id())
-				node_cpuid[i++].phys_id =
-				    smp_boot_data.cpu_phys_id[cpu];
-	}
-#endif
-	build_cpu_to_node_map();
-#endif
-	return 0;
-}
-
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
-{
-	int tmp;
-
-	if (has_8259 && gsi < 16)
-		*irq = isa_irq_to_vector(gsi);
-	else {
-		tmp = gsi_to_irq(gsi);
-		if (tmp == -1)
-			return -1;
-		*irq = tmp;
-	}
-	return 0;
-}
-
-int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
-{
-	if (isa_irq >= 16)
-		return -1;
-	*gsi = isa_irq;
-	return 0;
-}
-
-/*
- *  ACPI based hotplug CPU support
- */
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
-{
-#ifdef CONFIG_ACPI_NUMA
-	/*
-	 * We don't have cpu-only-node hotadd. But if the system equips
-	 * SRAT table, pxm is already found and node is ready.
-  	 * So, just pxm_to_nid(pxm) is OK.
-	 * This code here is for the system which doesn't have full SRAT
-  	 * table for possible cpus.
-	 */
-	node_cpuid[cpu].phys_id = physid;
-	node_cpuid[cpu].nid = acpi_get_node(handle);
-#endif
-	return 0;
-}
-
-int additional_cpus __initdata = -1;
-
-static __init int setup_additional_cpus(char *s)
-{
-	if (s)
-		additional_cpus = simple_strtol(s, NULL, 0);
-
-	return 0;
-}
-
-early_param("additional_cpus", setup_additional_cpus);
-
-/*
- * cpu_possible_mask should be static, it cannot change as CPUs
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_mask on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- */
-__init void prefill_possible_map(void)
-{
-	int i;
-	int possible, disabled_cpus;
-
-	disabled_cpus = total_cpus - available_cpus;
-
- 	if (additional_cpus == -1) {
- 		if (disabled_cpus > 0)
-			additional_cpus = disabled_cpus;
- 		else
-			additional_cpus = 0;
- 	}
-
-	possible = available_cpus + additional_cpus;
-
-	if (possible > nr_cpu_ids)
-		possible = nr_cpu_ids;
-
-	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
-		possible, max((possible - available_cpus), 0));
-
-	for (i = 0; i < possible; i++)
-		set_cpu_possible(i, true);
-}
-
-static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
-{
-	int cpu;
-
-	cpu = cpumask_first_zero(cpu_present_mask);
-	if (cpu >= nr_cpu_ids)
-		return -EINVAL;
-
-	acpi_map_cpu2node(handle, cpu, physid);
-
-	set_cpu_present(cpu, true);
-	ia64_cpu_to_sapicid[cpu] = physid;
-
-	acpi_processor_set_pdc(handle);
-
-	*pcpu = cpu;
-	return (0);
-}
-
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
-		       int *pcpu)
-{
-	return _acpi_map_lsapic(handle, physid, pcpu);
-}
-EXPORT_SYMBOL(acpi_map_cpu);
-
-int acpi_unmap_cpu(int cpu)
-{
-	ia64_cpu_to_sapicid[cpu] = -1;
-	set_cpu_present(cpu, false);
-
-#ifdef CONFIG_ACPI_NUMA
-	/* NUMA specific cleanup's */
-#endif
-
-	return (0);
-}
-EXPORT_SYMBOL(acpi_unmap_cpu);
-#endif				/* CONFIG_ACPI_HOTPLUG_CPU */
-
-#ifdef CONFIG_ACPI_NUMA
-static acpi_status acpi_map_iosapic(acpi_handle handle, u32 depth,
-				    void *context, void **ret)
-{
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *obj;
-	struct acpi_madt_io_sapic *iosapic;
-	unsigned int gsi_base;
-	int node;
-
-	/* Only care about objects w/ a method that returns the MADT */
-	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
-		return AE_OK;
-
-	if (!buffer.length || !buffer.pointer)
-		return AE_OK;
-
-	obj = buffer.pointer;
-	if (obj->type != ACPI_TYPE_BUFFER ||
-	    obj->buffer.length < sizeof(*iosapic)) {
-		kfree(buffer.pointer);
-		return AE_OK;
-	}
-
-	iosapic = (struct acpi_madt_io_sapic *)obj->buffer.pointer;
-
-	if (iosapic->header.type != ACPI_MADT_TYPE_IO_SAPIC) {
-		kfree(buffer.pointer);
-		return AE_OK;
-	}
-
-	gsi_base = iosapic->global_irq_base;
-
-	kfree(buffer.pointer);
-
-	/* OK, it's an IOSAPIC MADT entry; associate it with a node */
-	node = acpi_get_node(handle);
-	if (node == NUMA_NO_NODE || !node_online(node) ||
-	    cpumask_empty(cpumask_of_node(node)))
-		return AE_OK;
-
-	/* We know a gsi to node mapping! */
-	map_iosapic_to_node(gsi_base, node);
-	return AE_OK;
-}
-
-static int __init
-acpi_map_iosapics (void)
-{
-	acpi_get_devices(NULL, acpi_map_iosapic, NULL, NULL);
-	return 0;
-}
-
-fs_initcall(acpi_map_iosapics);
-#endif				/* CONFIG_ACPI_NUMA */
-
-int __ref acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
-{
-	int err;
-
-	if ((err = iosapic_init(phys_addr, gsi_base)))
-		return err;
-
-#ifdef CONFIG_ACPI_NUMA
-	acpi_map_iosapic(handle, 0, NULL, NULL);
-#endif				/* CONFIG_ACPI_NUMA */
-
-	return 0;
-}
-
-EXPORT_SYMBOL(acpi_register_ioapic);
-
-int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
-{
-	return iosapic_remove(gsi_base);
-}
-
-EXPORT_SYMBOL(acpi_unregister_ioapic);
-
-/*
- * acpi_suspend_lowlevel() - save kernel state and suspend.
- *
- * TBD when IA64 starts to support suspend...
- */
-int acpi_suspend_lowlevel(void) { return 0; }
-
-void acpi_proc_quirk_mwait_check(void)
-{
-}
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
deleted file mode 100644
index be3b90fef2e9..000000000000
--- a/arch/ia64/kernel/asm-offsets.c
+++ /dev/null
@@ -1,289 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed
- * to extract and format the required data.
- */
-
-#define ASM_OFFSETS_C 1
-
-#include <linux/sched/signal.h>
-#include <linux/pid.h>
-#include <linux/clocksource.h>
-#include <linux/kbuild.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/siginfo.h>
-#include <asm/sigcontext.h>
-#include <asm/mca.h>
-
-#include "../kernel/sigframe.h"
-#include "../kernel/fsyscall_gtod_data.h"
-
-void foo(void)
-{
-	DEFINE(IA64_TASK_SIZE, sizeof (struct task_struct));
-	DEFINE(IA64_THREAD_INFO_SIZE, sizeof (struct thread_info));
-	DEFINE(IA64_PT_REGS_SIZE, sizeof (struct pt_regs));
-	DEFINE(IA64_SWITCH_STACK_SIZE, sizeof (struct switch_stack));
-	DEFINE(IA64_SIGINFO_SIZE, sizeof (struct siginfo));
-	DEFINE(IA64_CPU_SIZE, sizeof (struct cpuinfo_ia64));
-	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
-	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
-
-	BUILD_BUG_ON(sizeof(struct upid) != 16);
-	DEFINE(IA64_UPID_SHIFT, 4);
-
-	BLANK();
-
-	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
-	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
-	DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
-	DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
-	DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
-	DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime));
-#endif
-
-	BLANK();
-
-	DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked));
-	DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid));
-	DEFINE(IA64_TASK_THREAD_PID_OFFSET,offsetof (struct task_struct, thread_pid));
-	DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level));
-	DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0]));
-	DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending));
-	DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid));
-	DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent));
-	DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal));
-	DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
-	DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp));
-	DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack));
-
-	BLANK();
-
-
-	DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct,
-							     group_stop_count));
-	DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending));
-	DEFINE(IA64_SIGNAL_PIDS_TGID_OFFSET, offsetof (struct signal_struct, pids[PIDTYPE_TGID]));
-
-	BLANK();
-
-	DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6));
-	DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7));
-	DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd));
-	DEFINE(IA64_PT_REGS_AR_SSD_OFFSET, offsetof (struct pt_regs, ar_ssd));
-	DEFINE(IA64_PT_REGS_R8_OFFSET, offsetof (struct pt_regs, r8));
-	DEFINE(IA64_PT_REGS_R9_OFFSET, offsetof (struct pt_regs, r9));
-	DEFINE(IA64_PT_REGS_R10_OFFSET, offsetof (struct pt_regs, r10));
-	DEFINE(IA64_PT_REGS_R11_OFFSET, offsetof (struct pt_regs, r11));
-	DEFINE(IA64_PT_REGS_CR_IPSR_OFFSET, offsetof (struct pt_regs, cr_ipsr));
-	DEFINE(IA64_PT_REGS_CR_IIP_OFFSET, offsetof (struct pt_regs, cr_iip));
-	DEFINE(IA64_PT_REGS_CR_IFS_OFFSET, offsetof (struct pt_regs, cr_ifs));
-	DEFINE(IA64_PT_REGS_AR_UNAT_OFFSET, offsetof (struct pt_regs, ar_unat));
-	DEFINE(IA64_PT_REGS_AR_PFS_OFFSET, offsetof (struct pt_regs, ar_pfs));
-	DEFINE(IA64_PT_REGS_AR_RSC_OFFSET, offsetof (struct pt_regs, ar_rsc));
-	DEFINE(IA64_PT_REGS_AR_RNAT_OFFSET, offsetof (struct pt_regs, ar_rnat));
-
-	DEFINE(IA64_PT_REGS_AR_BSPSTORE_OFFSET, offsetof (struct pt_regs, ar_bspstore));
-	DEFINE(IA64_PT_REGS_PR_OFFSET, offsetof (struct pt_regs, pr));
-	DEFINE(IA64_PT_REGS_B0_OFFSET, offsetof (struct pt_regs, b0));
-	DEFINE(IA64_PT_REGS_LOADRS_OFFSET, offsetof (struct pt_regs, loadrs));
-	DEFINE(IA64_PT_REGS_R1_OFFSET, offsetof (struct pt_regs, r1));
-	DEFINE(IA64_PT_REGS_R12_OFFSET, offsetof (struct pt_regs, r12));
-	DEFINE(IA64_PT_REGS_R13_OFFSET, offsetof (struct pt_regs, r13));
-	DEFINE(IA64_PT_REGS_AR_FPSR_OFFSET, offsetof (struct pt_regs, ar_fpsr));
-	DEFINE(IA64_PT_REGS_R15_OFFSET, offsetof (struct pt_regs, r15));
-	DEFINE(IA64_PT_REGS_R14_OFFSET, offsetof (struct pt_regs, r14));
-	DEFINE(IA64_PT_REGS_R2_OFFSET, offsetof (struct pt_regs, r2));
-	DEFINE(IA64_PT_REGS_R3_OFFSET, offsetof (struct pt_regs, r3));
-	DEFINE(IA64_PT_REGS_R16_OFFSET, offsetof (struct pt_regs, r16));
-	DEFINE(IA64_PT_REGS_R17_OFFSET, offsetof (struct pt_regs, r17));
-	DEFINE(IA64_PT_REGS_R18_OFFSET, offsetof (struct pt_regs, r18));
-	DEFINE(IA64_PT_REGS_R19_OFFSET, offsetof (struct pt_regs, r19));
-	DEFINE(IA64_PT_REGS_R20_OFFSET, offsetof (struct pt_regs, r20));
-	DEFINE(IA64_PT_REGS_R21_OFFSET, offsetof (struct pt_regs, r21));
-	DEFINE(IA64_PT_REGS_R22_OFFSET, offsetof (struct pt_regs, r22));
-	DEFINE(IA64_PT_REGS_R23_OFFSET, offsetof (struct pt_regs, r23));
-	DEFINE(IA64_PT_REGS_R24_OFFSET, offsetof (struct pt_regs, r24));
-	DEFINE(IA64_PT_REGS_R25_OFFSET, offsetof (struct pt_regs, r25));
-	DEFINE(IA64_PT_REGS_R26_OFFSET, offsetof (struct pt_regs, r26));
-	DEFINE(IA64_PT_REGS_R27_OFFSET, offsetof (struct pt_regs, r27));
-	DEFINE(IA64_PT_REGS_R28_OFFSET, offsetof (struct pt_regs, r28));
-	DEFINE(IA64_PT_REGS_R29_OFFSET, offsetof (struct pt_regs, r29));
-	DEFINE(IA64_PT_REGS_R30_OFFSET, offsetof (struct pt_regs, r30));
-	DEFINE(IA64_PT_REGS_R31_OFFSET, offsetof (struct pt_regs, r31));
-	DEFINE(IA64_PT_REGS_AR_CCV_OFFSET, offsetof (struct pt_regs, ar_ccv));
-	DEFINE(IA64_PT_REGS_F6_OFFSET, offsetof (struct pt_regs, f6));
-	DEFINE(IA64_PT_REGS_F7_OFFSET, offsetof (struct pt_regs, f7));
-	DEFINE(IA64_PT_REGS_F8_OFFSET, offsetof (struct pt_regs, f8));
-	DEFINE(IA64_PT_REGS_F9_OFFSET, offsetof (struct pt_regs, f9));
-	DEFINE(IA64_PT_REGS_F10_OFFSET, offsetof (struct pt_regs, f10));
-	DEFINE(IA64_PT_REGS_F11_OFFSET, offsetof (struct pt_regs, f11));
-
-	BLANK();
-
-	DEFINE(IA64_SWITCH_STACK_CALLER_UNAT_OFFSET, offsetof (struct switch_stack, caller_unat));
-	DEFINE(IA64_SWITCH_STACK_AR_FPSR_OFFSET, offsetof (struct switch_stack, ar_fpsr));
-	DEFINE(IA64_SWITCH_STACK_F2_OFFSET, offsetof (struct switch_stack, f2));
-	DEFINE(IA64_SWITCH_STACK_F3_OFFSET, offsetof (struct switch_stack, f3));
-	DEFINE(IA64_SWITCH_STACK_F4_OFFSET, offsetof (struct switch_stack, f4));
-	DEFINE(IA64_SWITCH_STACK_F5_OFFSET, offsetof (struct switch_stack, f5));
-	DEFINE(IA64_SWITCH_STACK_F12_OFFSET, offsetof (struct switch_stack, f12));
-	DEFINE(IA64_SWITCH_STACK_F13_OFFSET, offsetof (struct switch_stack, f13));
-	DEFINE(IA64_SWITCH_STACK_F14_OFFSET, offsetof (struct switch_stack, f14));
-	DEFINE(IA64_SWITCH_STACK_F15_OFFSET, offsetof (struct switch_stack, f15));
-	DEFINE(IA64_SWITCH_STACK_F16_OFFSET, offsetof (struct switch_stack, f16));
-	DEFINE(IA64_SWITCH_STACK_F17_OFFSET, offsetof (struct switch_stack, f17));
-	DEFINE(IA64_SWITCH_STACK_F18_OFFSET, offsetof (struct switch_stack, f18));
-	DEFINE(IA64_SWITCH_STACK_F19_OFFSET, offsetof (struct switch_stack, f19));
-	DEFINE(IA64_SWITCH_STACK_F20_OFFSET, offsetof (struct switch_stack, f20));
-	DEFINE(IA64_SWITCH_STACK_F21_OFFSET, offsetof (struct switch_stack, f21));
-	DEFINE(IA64_SWITCH_STACK_F22_OFFSET, offsetof (struct switch_stack, f22));
-	DEFINE(IA64_SWITCH_STACK_F23_OFFSET, offsetof (struct switch_stack, f23));
-	DEFINE(IA64_SWITCH_STACK_F24_OFFSET, offsetof (struct switch_stack, f24));
-	DEFINE(IA64_SWITCH_STACK_F25_OFFSET, offsetof (struct switch_stack, f25));
-	DEFINE(IA64_SWITCH_STACK_F26_OFFSET, offsetof (struct switch_stack, f26));
-	DEFINE(IA64_SWITCH_STACK_F27_OFFSET, offsetof (struct switch_stack, f27));
-	DEFINE(IA64_SWITCH_STACK_F28_OFFSET, offsetof (struct switch_stack, f28));
-	DEFINE(IA64_SWITCH_STACK_F29_OFFSET, offsetof (struct switch_stack, f29));
-	DEFINE(IA64_SWITCH_STACK_F30_OFFSET, offsetof (struct switch_stack, f30));
-	DEFINE(IA64_SWITCH_STACK_F31_OFFSET, offsetof (struct switch_stack, f31));
-	DEFINE(IA64_SWITCH_STACK_R4_OFFSET, offsetof (struct switch_stack, r4));
-	DEFINE(IA64_SWITCH_STACK_R5_OFFSET, offsetof (struct switch_stack, r5));
-	DEFINE(IA64_SWITCH_STACK_R6_OFFSET, offsetof (struct switch_stack, r6));
-	DEFINE(IA64_SWITCH_STACK_R7_OFFSET, offsetof (struct switch_stack, r7));
-	DEFINE(IA64_SWITCH_STACK_B0_OFFSET, offsetof (struct switch_stack, b0));
-	DEFINE(IA64_SWITCH_STACK_B1_OFFSET, offsetof (struct switch_stack, b1));
-	DEFINE(IA64_SWITCH_STACK_B2_OFFSET, offsetof (struct switch_stack, b2));
-	DEFINE(IA64_SWITCH_STACK_B3_OFFSET, offsetof (struct switch_stack, b3));
-	DEFINE(IA64_SWITCH_STACK_B4_OFFSET, offsetof (struct switch_stack, b4));
-	DEFINE(IA64_SWITCH_STACK_B5_OFFSET, offsetof (struct switch_stack, b5));
-	DEFINE(IA64_SWITCH_STACK_AR_PFS_OFFSET, offsetof (struct switch_stack, ar_pfs));
-	DEFINE(IA64_SWITCH_STACK_AR_LC_OFFSET, offsetof (struct switch_stack, ar_lc));
-	DEFINE(IA64_SWITCH_STACK_AR_UNAT_OFFSET, offsetof (struct switch_stack, ar_unat));
-	DEFINE(IA64_SWITCH_STACK_AR_RNAT_OFFSET, offsetof (struct switch_stack, ar_rnat));
-	DEFINE(IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET, offsetof (struct switch_stack, ar_bspstore));
-	DEFINE(IA64_SWITCH_STACK_PR_OFFSET, offsetof (struct switch_stack, pr));
-
-	BLANK();
-
-	DEFINE(IA64_SIGCONTEXT_IP_OFFSET, offsetof (struct sigcontext, sc_ip));
-	DEFINE(IA64_SIGCONTEXT_AR_BSP_OFFSET, offsetof (struct sigcontext, sc_ar_bsp));
-	DEFINE(IA64_SIGCONTEXT_AR_FPSR_OFFSET, offsetof (struct sigcontext, sc_ar_fpsr));
-	DEFINE(IA64_SIGCONTEXT_AR_RNAT_OFFSET, offsetof (struct sigcontext, sc_ar_rnat));
-	DEFINE(IA64_SIGCONTEXT_AR_UNAT_OFFSET, offsetof (struct sigcontext, sc_ar_unat));
-	DEFINE(IA64_SIGCONTEXT_B0_OFFSET, offsetof (struct sigcontext, sc_br[0]));
-	DEFINE(IA64_SIGCONTEXT_CFM_OFFSET, offsetof (struct sigcontext, sc_cfm));
-	DEFINE(IA64_SIGCONTEXT_FLAGS_OFFSET, offsetof (struct sigcontext, sc_flags));
-	DEFINE(IA64_SIGCONTEXT_FR6_OFFSET, offsetof (struct sigcontext, sc_fr[6]));
-	DEFINE(IA64_SIGCONTEXT_PR_OFFSET, offsetof (struct sigcontext, sc_pr));
-	DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12]));
-	DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base));
-	DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs));
-
-	BLANK();
-
-	DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal));
-
-	BLANK();
-
-	DEFINE(IA64_SIGFRAME_ARG0_OFFSET, offsetof (struct sigframe, arg0));
-	DEFINE(IA64_SIGFRAME_ARG1_OFFSET, offsetof (struct sigframe, arg1));
-	DEFINE(IA64_SIGFRAME_ARG2_OFFSET, offsetof (struct sigframe, arg2));
-	DEFINE(IA64_SIGFRAME_HANDLER_OFFSET, offsetof (struct sigframe, handler));
-	DEFINE(IA64_SIGFRAME_SIGCONTEXT_OFFSET, offsetof (struct sigframe, sc));
-	BLANK();
-    /* for assembly files which can't include sched.h: */
-	DEFINE(IA64_CLONE_VFORK, CLONE_VFORK);
-	DEFINE(IA64_CLONE_VM, CLONE_VM);
-
-	BLANK();
-	DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET,
-	       offsetof (struct cpuinfo_ia64, nsec_per_cyc));
-	DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET,
-	       offsetof (struct cpuinfo_ia64, ptce_base));
-	DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET,
-	       offsetof (struct cpuinfo_ia64, ptce_count));
-	DEFINE(IA64_CPUINFO_PTCE_STRIDE_OFFSET,
-	       offsetof (struct cpuinfo_ia64, ptce_stride));
-	BLANK();
-	DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET,
-	       offsetof (struct __kernel_old_timespec, tv_nsec));
-	DEFINE(IA64_TIME_SN_SPEC_SNSEC_OFFSET,
-	       offsetof (struct time_sn_spec, snsec));
-
-	DEFINE(CLONE_SETTLS_BIT, 19);
-#if CLONE_SETTLS != (1<<19)
-# error "CLONE_SETTLS_BIT incorrect, please fix"
-#endif
-
-	BLANK();
-	DEFINE(IA64_MCA_CPU_MCA_STACK_OFFSET,
-	       offsetof (struct ia64_mca_cpu, mca_stack));
-	DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET,
-	       offsetof (struct ia64_mca_cpu, init_stack));
-	BLANK();
-	DEFINE(IA64_SAL_OS_STATE_OS_GP_OFFSET,
-	       offsetof (struct ia64_sal_os_state, os_gp));
-	DEFINE(IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET,
-	       offsetof (struct ia64_sal_os_state, proc_state_param));
-	DEFINE(IA64_SAL_OS_STATE_SAL_RA_OFFSET,
-	       offsetof (struct ia64_sal_os_state, sal_ra));
-	DEFINE(IA64_SAL_OS_STATE_SAL_GP_OFFSET,
-	       offsetof (struct ia64_sal_os_state, sal_gp));
-	DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET,
-	       offsetof (struct ia64_sal_os_state, pal_min_state));
-	DEFINE(IA64_SAL_OS_STATE_OS_STATUS_OFFSET,
-	       offsetof (struct ia64_sal_os_state, os_status));
-	DEFINE(IA64_SAL_OS_STATE_CONTEXT_OFFSET,
-	       offsetof (struct ia64_sal_os_state, context));
-	DEFINE(IA64_SAL_OS_STATE_SIZE,
-	       sizeof (struct ia64_sal_os_state));
-	BLANK();
-
-	DEFINE(IA64_PMSA_GR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_gr));
-	DEFINE(IA64_PMSA_BANK1_GR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_bank1_gr));
-	DEFINE(IA64_PMSA_PR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_pr));
-	DEFINE(IA64_PMSA_BR0_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_br0));
-	DEFINE(IA64_PMSA_RSC_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_rsc));
-	DEFINE(IA64_PMSA_IIP_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_iip));
-	DEFINE(IA64_PMSA_IPSR_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_ipsr));
-	DEFINE(IA64_PMSA_IFS_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_ifs));
-	DEFINE(IA64_PMSA_XIP_OFFSET,
-	       offsetof(struct pal_min_state_area, pmsa_xip));
-	BLANK();
-
-	/* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */
-	DEFINE(IA64_GTOD_SEQ_OFFSET,
-	       offsetof (struct fsyscall_gtod_data_t, seq));
-	DEFINE(IA64_GTOD_WALL_TIME_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, wall_time));
-	DEFINE(IA64_GTOD_MONO_TIME_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, monotonic_time));
-	DEFINE(IA64_CLKSRC_MASK_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_mask));
-	DEFINE(IA64_CLKSRC_MULT_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_mult));
-	DEFINE(IA64_CLKSRC_SHIFT_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_shift));
-	DEFINE(IA64_CLKSRC_MMIO_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_fsys_mmio));
-	DEFINE(IA64_CLKSRC_CYCLE_LAST_OFFSET,
-		offsetof (struct fsyscall_gtod_data_t, clk_cycle_last));
-	DEFINE(IA64_ITC_JITTER_OFFSET,
-		offsetof (struct itc_jitter_data_t, itc_jitter));
-	DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
-		offsetof (struct itc_jitter_data_t, itc_lastcycle));
-
-}
diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c
deleted file mode 100644
index ec61f20ca61f..000000000000
--- a/arch/ia64/kernel/audit.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/audit.h>
-#include <asm/unistd.h>
-
-static unsigned dir_class[] = {
-#include <asm-generic/audit_dir_write.h>
-~0U
-};
-
-static unsigned read_class[] = {
-#include <asm-generic/audit_read.h>
-~0U
-};
-
-static unsigned write_class[] = {
-#include <asm-generic/audit_write.h>
-~0U
-};
-
-static unsigned chattr_class[] = {
-#include <asm-generic/audit_change_attr.h>
-~0U
-};
-
-static unsigned signal_class[] = {
-#include <asm-generic/audit_signal.h>
-~0U
-};
-
-int audit_classify_arch(int arch)
-{
-	return 0;
-}
-
-int audit_classify_syscall(int abi, unsigned syscall)
-{
-	switch(syscall) {
-	case __NR_open:
-		return AUDITSC_OPEN;
-	case __NR_openat:
-		return AUDITSC_OPENAT;
-	case __NR_execve:
-		return AUDITSC_EXECVE;
-	case __NR_openat2:
-		return AUDITSC_OPENAT2;
-	default:
-		return AUDITSC_NATIVE;
-	}
-}
-
-static int __init audit_classes_init(void)
-{
-	audit_register_class(AUDIT_CLASS_WRITE, write_class);
-	audit_register_class(AUDIT_CLASS_READ, read_class);
-	audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
-	audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
-	audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
-	return 0;
-}
-
-__initcall(audit_classes_init);
diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c
deleted file mode 100644
index 782c481d7052..000000000000
--- a/arch/ia64/kernel/brl_emu.c
+++ /dev/null
@@ -1,217 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  Emulation of the "brl" instruction for IA64 processors that
- *  don't support it in hardware.
- *  Author: Stephan Zeisset, Intel Corp. <Stephan.Zeisset@intel.com>
- *
- *    02/22/02	D. Mosberger	Clear si_flgs, si_isr, and si_imm to avoid
- *				leaking kernel bits.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched/signal.h>
-#include <linux/uaccess.h>
-#include <asm/processor.h>
-
-extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5;
-
-struct illegal_op_return {
-	unsigned long fkt, arg1, arg2, arg3;
-};
-
-/*
- *  The unimplemented bits of a virtual address must be set
- *  to the value of the most significant implemented bit.
- *  unimpl_va_mask includes all unimplemented bits and
- *  the most significant implemented bit, so the result
- *  of an and operation with the mask must be all 0's
- *  or all 1's for the address to be valid.
- */
-#define unimplemented_virtual_address(va) (						\
-	((va) & local_cpu_data->unimpl_va_mask) != 0 &&					\
-	((va) & local_cpu_data->unimpl_va_mask) != local_cpu_data->unimpl_va_mask	\
-)
-
-/*
- *  The unimplemented bits of a physical address must be 0.
- *  unimpl_pa_mask includes all unimplemented bits, so the result
- *  of an and operation with the mask must be all 0's for the
- *  address to be valid.
- */
-#define unimplemented_physical_address(pa) (		\
-	((pa) & local_cpu_data->unimpl_pa_mask) != 0	\
-)
-
-/*
- *  Handle an illegal operation fault that was caused by an
- *  unimplemented "brl" instruction.
- *  If we are not successful (e.g because the illegal operation
- *  wasn't caused by a "brl" after all), we return -1.
- *  If we are successful, we return either 0 or the address
- *  of a "fixup" function for manipulating preserved register
- *  state.
- */
-
-struct illegal_op_return
-ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec)
-{
-	unsigned long bundle[2];
-	unsigned long opcode, btype, qp, offset, cpl;
-	unsigned long next_ip;
-	struct illegal_op_return rv;
-	long tmp_taken, unimplemented_address;
-
-	rv.fkt = (unsigned long) -1;
-
-	/*
-	 *  Decode the instruction bundle.
-	 */
-
-	if (copy_from_user(bundle, (void *) (regs->cr_iip), sizeof(bundle)))
-		return rv;
-
-	next_ip = (unsigned long) regs->cr_iip + 16;
-
-	/* "brl" must be in slot 2. */
-	if (ia64_psr(regs)->ri != 1) return rv;
-
-	/* Must be "mlx" template */
-	if ((bundle[0] & 0x1e) != 0x4) return rv;
-
-	opcode = (bundle[1] >> 60);
-	btype = ((bundle[1] >> 29) & 0x7);
-	qp = ((bundle[1] >> 23) & 0x3f);
-	offset = ((bundle[1] & 0x0800000000000000L) << 4)
-		| ((bundle[1] & 0x00fffff000000000L) >> 32)
-		| ((bundle[1] & 0x00000000007fffffL) << 40)
-		| ((bundle[0] & 0xffff000000000000L) >> 24);
-
-	tmp_taken = regs->pr & (1L << qp);
-
-	switch(opcode) {
-
-		case 0xC:
-			/*
-			 *  Long Branch.
-			 */
-			if (btype != 0) return rv;
-			rv.fkt = 0;
-			if (!(tmp_taken)) {
-				/*
-				 *  Qualifying predicate is 0.
-				 *  Skip instruction.
-				 */
-				regs->cr_iip = next_ip;
-				ia64_psr(regs)->ri = 0;
-				return rv;
-			}
-			break;
-
-		case 0xD:
-			/*
-			 *  Long Call.
-			 */
-			rv.fkt = 0;
-			if (!(tmp_taken)) {
-				/*
-				 *  Qualifying predicate is 0.
-				 *  Skip instruction.
-				 */
-				regs->cr_iip = next_ip;
-				ia64_psr(regs)->ri = 0;
-				return rv;
-			}
-
-			/*
-			 *  BR[btype] = IP+16
-			 */
-			switch(btype) {
-				case 0:
-					regs->b0 = next_ip;
-					break;
-				case 1:
-					rv.fkt = (unsigned long) &ia64_set_b1;
-					break;
-				case 2:
-					rv.fkt = (unsigned long) &ia64_set_b2;
-					break;
-				case 3:
-					rv.fkt = (unsigned long) &ia64_set_b3;
-					break;
-				case 4:
-					rv.fkt = (unsigned long) &ia64_set_b4;
-					break;
-				case 5:
-					rv.fkt = (unsigned long) &ia64_set_b5;
-					break;
-				case 6:
-					regs->b6 = next_ip;
-					break;
-				case 7:
-					regs->b7 = next_ip;
-					break;
-			}
-			rv.arg1 = next_ip;
-
-			/*
-			 *  AR[PFS].pfm = CFM
-			 *  AR[PFS].pec = AR[EC]
-			 *  AR[PFS].ppl = PSR.cpl
-			 */
-			cpl = ia64_psr(regs)->cpl;
-			regs->ar_pfs = ((regs->cr_ifs & 0x3fffffffff)
-					| (ar_ec << 52) | (cpl << 62));
-
-			/*
-			 *  CFM.sof -= CFM.sol
-			 *  CFM.sol = 0
-			 *  CFM.sor = 0
-			 *  CFM.rrb.gr = 0
-			 *  CFM.rrb.fr = 0
-			 *  CFM.rrb.pr = 0
-			 */
-			regs->cr_ifs = ((regs->cr_ifs & 0xffffffc00000007f)
-					- ((regs->cr_ifs >> 7) & 0x7f));
-
-			break;
-
-		default:
-			/*
-			 *  Unknown opcode.
-			 */
-			return rv;
-
-	}
-
-	regs->cr_iip += offset;
-	ia64_psr(regs)->ri = 0;
-
-	if (ia64_psr(regs)->it == 0)
-		unimplemented_address = unimplemented_physical_address(regs->cr_iip);
-	else
-		unimplemented_address = unimplemented_virtual_address(regs->cr_iip);
-
-	if (unimplemented_address) {
-		/*
-		 *  The target address contains unimplemented bits.
-		 */
-		printk(KERN_DEBUG "Woah! Unimplemented Instruction Address Trap!\n");
-		force_sig_fault(SIGILL, ILL_BADIADDR, (void __user *)NULL,
-				0, 0, 0);
-	} else if (ia64_psr(regs)->tb) {
-		/*
-		 *  Branch Tracing is enabled.
-		 *  Force a taken branch signal.
-		 */
-		force_sig_fault(SIGTRAP, TRAP_BRANCH, (void __user *)NULL,
-				0, 0, 0);
-	} else if (ia64_psr(regs)->ss) {
-		/*
-		 *  Single Step is enabled.
-		 *  Force a trace signal.
-		 */
-		force_sig_fault(SIGTRAP, TRAP_TRACE, (void __user *)NULL,
-				0, 0, 0);
-	}
-	return rv;
-}
diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
deleted file mode 100644
index 88b3ce3e66cd..000000000000
--- a/arch/ia64/kernel/crash.c
+++ /dev/null
@@ -1,257 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * arch/ia64/kernel/crash.c
- *
- * Architecture specific (ia64) functions for kexec based crash dumps.
- *
- * Created by: Khalid Aziz <khalid.aziz@hp.com>
- * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
- * Copyright (C) 2005 Intel Corp	Zou Nan hai <nanhai.zou@intel.com>
- *
- */
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/crash_dump.h>
-#include <linux/memblock.h>
-#include <linux/kexec.h>
-#include <linux/elfcore.h>
-#include <linux/reboot.h>
-#include <linux/sysctl.h>
-#include <linux/init.h>
-#include <linux/kdebug.h>
-
-#include <asm/mca.h>
-
-int kdump_status[NR_CPUS];
-static atomic_t kdump_cpu_frozen;
-atomic_t kdump_in_progress;
-static int kdump_freeze_monarch;
-static int kdump_on_init = 1;
-static int kdump_on_fatal_mca = 1;
-
-extern void ia64_dump_cpu_regs(void *);
-
-static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus);
-
-void
-crash_save_this_cpu(void)
-{
-	void *buf;
-	unsigned long cfm, sof, sol;
-
-	int cpu = smp_processor_id();
-	struct elf_prstatus *prstatus = &per_cpu(elf_prstatus, cpu);
-
-	elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg);
-	memset(prstatus, 0, sizeof(*prstatus));
-	prstatus->common.pr_pid = current->pid;
-
-	ia64_dump_cpu_regs(dst);
-	cfm = dst[43];
-	sol = (cfm >> 7) & 0x7f;
-	sof = cfm & 0x7f;
-	dst[46] = (unsigned long)ia64_rse_skip_regs((unsigned long *)dst[46],
-			sof - sol);
-
-	buf = (u64 *) per_cpu_ptr(crash_notes, cpu);
-	if (!buf)
-		return;
-	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, prstatus,
-			sizeof(*prstatus));
-	final_note(buf);
-}
-
-#ifdef CONFIG_SMP
-static int
-kdump_wait_cpu_freeze(void)
-{
-	int cpu_num = num_online_cpus() - 1;
-	int timeout = 1000;
-	while(timeout-- > 0) {
-		if (atomic_read(&kdump_cpu_frozen) == cpu_num)
-			return 0;
-		udelay(1000);
-	}
-	return 1;
-}
-#endif
-
-void
-machine_crash_shutdown(struct pt_regs *pt)
-{
-	/* This function is only called after the system
-	 * has paniced or is otherwise in a critical state.
-	 * The minimum amount of code to allow a kexec'd kernel
-	 * to run successfully needs to happen here.
-	 *
-	 * In practice this means shooting down the other cpus in
-	 * an SMP system.
-	 */
-	kexec_disable_iosapic();
-#ifdef CONFIG_SMP
-	/*
-	 * If kdump_on_init is set and an INIT is asserted here, kdump will
-	 * be started again via INIT monarch.
-	 */
-	local_irq_disable();
-	ia64_set_psr_mc();	/* mask MCA/INIT */
-	if (atomic_inc_return(&kdump_in_progress) != 1)
-		unw_init_running(kdump_cpu_freeze, NULL);
-
-	/*
-	 * Now this cpu is ready for kdump.
-	 * Stop all others by IPI or INIT.  They could receive INIT from
-	 * outside and might be INIT monarch, but only thing they have to
-	 * do is falling into kdump_cpu_freeze().
-	 *
-	 * If an INIT is asserted here:
-	 * - All receivers might be slaves, since some of cpus could already
-	 *   be frozen and INIT might be masked on monarch.  In this case,
-	 *   all slaves will be frozen soon since kdump_in_progress will let
-	 *   them into DIE_INIT_SLAVE_LEAVE.
-	 * - One might be a monarch, but INIT rendezvous will fail since
-	 *   at least this cpu already have INIT masked so it never join
-	 *   to the rendezvous.  In this case, all slaves and monarch will
-	 *   be frozen soon with no wait since the INIT rendezvous is skipped
-	 *   by kdump_in_progress.
-	 */
-	kdump_smp_send_stop();
-	/* not all cpu response to IPI, send INIT to freeze them */
-	if (kdump_wait_cpu_freeze()) {
-		kdump_smp_send_init();
-		/* wait again, don't go ahead if possible */
-		kdump_wait_cpu_freeze();
-	}
-#endif
-}
-
-static void
-machine_kdump_on_init(void)
-{
-	crash_save_vmcoreinfo();
-	local_irq_disable();
-	kexec_disable_iosapic();
-	machine_kexec(ia64_kimage);
-}
-
-void
-kdump_cpu_freeze(struct unw_frame_info *info, void *arg)
-{
-	int cpuid;
-
-	local_irq_disable();
-	cpuid = smp_processor_id();
-	crash_save_this_cpu();
-	current->thread.ksp = (__u64)info->sw - 16;
-
-	ia64_set_psr_mc();	/* mask MCA/INIT and stop reentrance */
-
-	atomic_inc(&kdump_cpu_frozen);
-	kdump_status[cpuid] = 1;
-	mb();
-	for (;;)
-		cpu_relax();
-}
-
-static int
-kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data)
-{
-	struct ia64_mca_notify_die *nd;
-	struct die_args *args = data;
-
-	if (atomic_read(&kdump_in_progress)) {
-		switch (val) {
-		case DIE_INIT_MONARCH_LEAVE:
-			if (!kdump_freeze_monarch)
-				break;
-			fallthrough;
-		case DIE_INIT_SLAVE_LEAVE:
-		case DIE_INIT_MONARCH_ENTER:
-		case DIE_MCA_RENDZVOUS_LEAVE:
-			unw_init_running(kdump_cpu_freeze, NULL);
-			break;
-		}
-	}
-
-	if (!kdump_on_init && !kdump_on_fatal_mca)
-		return NOTIFY_DONE;
-
-	if (!ia64_kimage) {
-		if (val == DIE_INIT_MONARCH_LEAVE)
-			ia64_mca_printk(KERN_NOTICE
-					"%s: kdump not configured\n",
-					__func__);
-		return NOTIFY_DONE;
-	}
-
-	if (val != DIE_INIT_MONARCH_LEAVE &&
-	    val != DIE_INIT_MONARCH_PROCESS &&
-	    val != DIE_MCA_MONARCH_LEAVE)
-		return NOTIFY_DONE;
-
-	nd = (struct ia64_mca_notify_die *)args->err;
-
-	switch (val) {
-	case DIE_INIT_MONARCH_PROCESS:
-		/* Reason code 1 means machine check rendezvous*/
-		if (kdump_on_init && (nd->sos->rv_rc != 1)) {
-			if (atomic_inc_return(&kdump_in_progress) != 1)
-				kdump_freeze_monarch = 1;
-		}
-		break;
-	case DIE_INIT_MONARCH_LEAVE:
-		/* Reason code 1 means machine check rendezvous*/
-		if (kdump_on_init && (nd->sos->rv_rc != 1))
-			machine_kdump_on_init();
-		break;
-	case DIE_MCA_MONARCH_LEAVE:
-		/* *(nd->data) indicate if MCA is recoverable */
-		if (kdump_on_fatal_mca && !(*(nd->data))) {
-			if (atomic_inc_return(&kdump_in_progress) == 1)
-				machine_kdump_on_init();
-			/* We got fatal MCA while kdump!? No way!! */
-		}
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table kdump_ctl_table[] = {
-	{
-		.procname = "kdump_on_init",
-		.data = &kdump_on_init,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{
-		.procname = "kdump_on_fatal_mca",
-		.data = &kdump_on_fatal_mca,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{ }
-};
-#endif
-
-static int
-machine_crash_setup(void)
-{
-	/* be notified before default_monarch_init_process */
-	static struct notifier_block kdump_init_notifier_nb = {
-		.notifier_call = kdump_init_notifier,
-		.priority = 1,
-	};
-	int ret;
-	if((ret = register_die_notifier(&kdump_init_notifier_nb)) != 0)
-		return ret;
-#ifdef CONFIG_SYSCTL
-	register_sysctl("kernel", kdump_ctl_table);
-#endif
-	return 0;
-}
-
-__initcall(machine_crash_setup);
-
diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c
deleted file mode 100644
index 4ef68e2aa757..000000000000
--- a/arch/ia64/kernel/crash_dump.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *	kernel/crash_dump.c - Memory preserving reboot related code.
- *
- *	Created by: Simon Horman <horms@verge.net.au>
- *	Original code moved from kernel/crash.c
- *	Original code comment copied from the i386 version of this file
- */
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/crash_dump.h>
-#include <linux/uio.h>
-#include <asm/page.h>
-
-ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn,
-		size_t csize, unsigned long offset)
-{
-	void  *vaddr;
-
-	if (!csize)
-		return 0;
-	vaddr = __va(pfn<<PAGE_SHIFT);
-	csize = copy_to_iter(vaddr + offset, csize, iter);
-	return csize;
-}
-
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
deleted file mode 100644
index 258d7b70c0f3..000000000000
--- a/arch/ia64/kernel/cyclone.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/timex.h>
-#include <linux/clocksource.h>
-#include <linux/io.h>
-
-/* IBM Summit (EXA) Cyclone counter code*/
-#define CYCLONE_CBAR_ADDR 0xFEB00CD0
-#define CYCLONE_PMCC_OFFSET 0x51A0
-#define CYCLONE_MPMC_OFFSET 0x51D0
-#define CYCLONE_MPCS_OFFSET 0x51A8
-#define CYCLONE_TIMER_FREQ 100000000
-
-int use_cyclone;
-void __init cyclone_setup(void)
-{
-	use_cyclone = 1;
-}
-
-static void __iomem *cyclone_mc;
-
-static u64 read_cyclone(struct clocksource *cs)
-{
-	return (u64)readq((void __iomem *)cyclone_mc);
-}
-
-static struct clocksource clocksource_cyclone = {
-        .name           = "cyclone",
-        .rating         = 300,
-        .read           = read_cyclone,
-        .mask           = (1LL << 40) - 1,
-        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-int __init init_cyclone_clock(void)
-{
-	u64 __iomem *reg;
-	u64 base;	/* saved cyclone base address */
-	u64 offset;	/* offset from pageaddr to cyclone_timer register */
-	int i;
-	u32 __iomem *cyclone_timer;	/* Cyclone MPMC0 register */
-
-	if (!use_cyclone)
-		return 0;
-
-	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
-
-	/* find base address */
-	offset = (CYCLONE_CBAR_ADDR);
-	reg = ioremap(offset, sizeof(u64));
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-	base = readq(reg);
-	iounmap(reg);
-	if(!base){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR"
-				" value.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-
-	/* setup PMCC */
-	offset = (base + CYCLONE_PMCC_OFFSET);
-	reg = ioremap(offset, sizeof(u64));
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid PMCC"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-	writel(0x00000001,reg);
-	iounmap(reg);
-
-	/* setup MPCS */
-	offset = (base + CYCLONE_MPCS_OFFSET);
-	reg = ioremap(offset, sizeof(u64));
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPCS"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-	writel(0x00000001,reg);
-	iounmap(reg);
-
-	/* map in cyclone_timer */
-	offset = (base + CYCLONE_MPMC_OFFSET);
-	cyclone_timer = ioremap(offset, sizeof(u32));
-	if(!cyclone_timer){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPMC"
-				" register.\n");
-		use_cyclone = 0;
-		return -ENODEV;
-	}
-
-	/*quick test to make sure its ticking*/
-	for(i=0; i<3; i++){
-		u32 old = readl(cyclone_timer);
-		int stall = 100;
-		while(stall--) barrier();
-		if(readl(cyclone_timer) == old){
-			printk(KERN_ERR "Summit chipset: Counter not counting!"
-					" DISABLED\n");
-			iounmap(cyclone_timer);
-			cyclone_timer = NULL;
-			use_cyclone = 0;
-			return -ENODEV;
-		}
-	}
-	/* initialize last tick */
-	cyclone_mc = cyclone_timer;
-	clocksource_cyclone.archdata.fsys_mmio = cyclone_timer;
-	clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ);
-
-	return 0;
-}
-
-__initcall(init_cyclone_clock);
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
deleted file mode 100644
index cd0c166bfbc2..000000000000
--- a/arch/ia64/kernel/dma-mapping.c
+++ /dev/null
@@ -1,9 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/dma-map-ops.h>
-#include <linux/export.h>
-
-/* Set this to 1 if there is a HW IOMMU in the system */
-int iommu_detected __read_mostly;
-
-const struct dma_map_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
deleted file mode 100644
index 033f5aead88a..000000000000
--- a/arch/ia64/kernel/efi.c
+++ /dev/null
@@ -1,1360 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Extensible Firmware Interface
- *
- * Based on Extensible Firmware Interface Specification version 0.9
- * April 30, 1999
- *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2003 Hewlett-Packard Co.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * (c) Copyright 2006 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * All EFI Runtime Services are not implemented yet as EFI only
- * supports physical mode addressing on SoftSDV. This is to be fixed
- * in a future version.  --drummond 1999-07-20
- *
- * Implemented EFI runtime services and virtual mode calls.  --davidm
- *
- * Goutham Rao: <goutham.rao@intel.com>
- *	Skip non-WB memory and ignore empty memory ranges.
- */
-#include <linux/module.h>
-#include <linux/memblock.h>
-#include <linux/crash_dump.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/efi.h>
-#include <linux/kexec.h>
-#include <linux/mm.h>
-
-#include <asm/efi.h>
-#include <asm/io.h>
-#include <asm/kregs.h>
-#include <asm/meminit.h>
-#include <asm/processor.h>
-#include <asm/mca.h>
-#include <asm/sal.h>
-#include <asm/setup.h>
-#include <asm/tlbflush.h>
-
-#define EFI_DEBUG	0
-
-#define ESI_TABLE_GUID					\
-    EFI_GUID(0x43EA58DC, 0xCF28, 0x4b06, 0xB3,		\
-	     0x91, 0xB7, 0x50, 0x59, 0x34, 0x2B, 0xD4)
-
-static unsigned long mps_phys = EFI_INVALID_TABLE_ADDR;
-static __initdata unsigned long palo_phys;
-
-unsigned long __initdata esi_phys = EFI_INVALID_TABLE_ADDR;
-unsigned long hcdp_phys = EFI_INVALID_TABLE_ADDR;
-unsigned long sal_systab_phys = EFI_INVALID_TABLE_ADDR;
-
-static const efi_config_table_type_t arch_tables[] __initconst = {
-	{ESI_TABLE_GUID,				&esi_phys,		"ESI"		},
-	{HCDP_TABLE_GUID,				&hcdp_phys,		"HCDP"		},
-	{MPS_TABLE_GUID,				&mps_phys,		"MPS"		},
-	{PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID,	&palo_phys,		"PALO"		},
-	{SAL_SYSTEM_TABLE_GUID,				&sal_systab_phys,	"SALsystab"	},
-	{},
-};
-
-extern efi_status_t efi_call_phys (void *, ...);
-
-static efi_runtime_services_t *runtime;
-static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL;
-
-#define efi_call_virt(f, args...)	(*(f))(args)
-
-#define STUB_GET_TIME(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc)			       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_time_cap_t *atc = NULL;					       \
-	efi_status_t ret;						       \
-									       \
-	if (tc)								       \
-		atc = adjust_arg(tc);					       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time),    \
-				adjust_arg(tm), atc);			       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_SET_TIME(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_set_time (efi_time_t *tm)					       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time),    \
-				adjust_arg(tm));			       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg)			       \
-static efi_status_t							       \
-prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending,	       \
-			  efi_time_t *tm)				       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time),      \
-		adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm));     \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg)			       \
-static efi_status_t							       \
-prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm)		       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_time_t *atm = NULL;						       \
-	efi_status_t ret;						       \
-									       \
-	if (tm)								       \
-		atm = adjust_arg(tm);					       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time),      \
-		enabled, atm);						       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_VARIABLE(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr,      \
-		       unsigned long *data_size, void *data)		       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	u32 *aattr = NULL;						       \
-	efi_status_t ret;						       \
-									       \
-	if (attr)							       \
-		aattr = adjust_arg(attr);				       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_get_variable_t *) __va(runtime->get_variable),	       \
-		adjust_arg(name), adjust_arg(vendor), aattr,		       \
-		adjust_arg(data_size), adjust_arg(data));		       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg)			       \
-static efi_status_t							       \
-prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name,      \
-			    efi_guid_t *vendor)				       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_get_next_variable_t *) __va(runtime->get_next_variable),  \
-		adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor));  \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_SET_VARIABLE(prefix, adjust_arg)				       \
-static efi_status_t							       \
-prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor,		       \
-		       u32 attr, unsigned long data_size,		       \
-		       void *data)					       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix(					       \
-		(efi_set_variable_t *) __va(runtime->set_variable),	       \
-		adjust_arg(name), adjust_arg(vendor), attr, data_size,	       \
-		adjust_arg(data));					       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg)		       \
-static efi_status_t							       \
-prefix##_get_next_high_mono_count (u32 *count)				       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_status_t ret;						       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	ret = efi_call_##prefix((efi_get_next_high_mono_count_t *)	       \
-				__va(runtime->get_next_high_mono_count),       \
-				adjust_arg(count));			       \
-	ia64_load_scratch_fpregs(fr);					       \
-	return ret;							       \
-}
-
-#define STUB_RESET_SYSTEM(prefix, adjust_arg)				       \
-static void								       \
-prefix##_reset_system (int reset_type, efi_status_t status,		       \
-		       unsigned long data_size, efi_char16_t *data)	       \
-{									       \
-	struct ia64_fpreg fr[6];					       \
-	efi_char16_t *adata = NULL;					       \
-									       \
-	if (data)							       \
-		adata = adjust_arg(data);				       \
-									       \
-	ia64_save_scratch_fpregs(fr);					       \
-	efi_call_##prefix(						       \
-		(efi_reset_system_t *) __va(runtime->reset_system),	       \
-		reset_type, status, data_size, adata);			       \
-	/* should not return, but just in case... */			       \
-	ia64_load_scratch_fpregs(fr);					       \
-}
-
-#define phys_ptr(arg)	((__typeof__(arg)) ia64_tpa(arg))
-
-STUB_GET_TIME(phys, phys_ptr)
-STUB_SET_TIME(phys, phys_ptr)
-STUB_GET_WAKEUP_TIME(phys, phys_ptr)
-STUB_SET_WAKEUP_TIME(phys, phys_ptr)
-STUB_GET_VARIABLE(phys, phys_ptr)
-STUB_GET_NEXT_VARIABLE(phys, phys_ptr)
-STUB_SET_VARIABLE(phys, phys_ptr)
-STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr)
-STUB_RESET_SYSTEM(phys, phys_ptr)
-
-#define id(arg)	arg
-
-STUB_GET_TIME(virt, id)
-STUB_SET_TIME(virt, id)
-STUB_GET_WAKEUP_TIME(virt, id)
-STUB_SET_WAKEUP_TIME(virt, id)
-STUB_GET_VARIABLE(virt, id)
-STUB_GET_NEXT_VARIABLE(virt, id)
-STUB_SET_VARIABLE(virt, id)
-STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id)
-STUB_RESET_SYSTEM(virt, id)
-
-void
-efi_gettimeofday (struct timespec64 *ts)
-{
-	efi_time_t tm;
-
-	if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) {
-		memset(ts, 0, sizeof(*ts));
-		return;
-	}
-
-	ts->tv_sec = mktime64(tm.year, tm.month, tm.day,
-			    tm.hour, tm.minute, tm.second);
-	ts->tv_nsec = tm.nanosecond;
-}
-
-static int
-is_memory_available (efi_memory_desc_t *md)
-{
-	if (!(md->attribute & EFI_MEMORY_WB))
-		return 0;
-
-	switch (md->type) {
-	      case EFI_LOADER_CODE:
-	      case EFI_LOADER_DATA:
-	      case EFI_BOOT_SERVICES_CODE:
-	      case EFI_BOOT_SERVICES_DATA:
-	      case EFI_CONVENTIONAL_MEMORY:
-		return 1;
-	}
-	return 0;
-}
-
-typedef struct kern_memdesc {
-	u64 attribute;
-	u64 start;
-	u64 num_pages;
-} kern_memdesc_t;
-
-static kern_memdesc_t *kern_memmap;
-
-#define efi_md_size(md)	(md->num_pages << EFI_PAGE_SHIFT)
-
-static inline u64
-kmd_end(kern_memdesc_t *kmd)
-{
-	return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT));
-}
-
-static inline u64
-efi_md_end(efi_memory_desc_t *md)
-{
-	return (md->phys_addr + efi_md_size(md));
-}
-
-static inline int
-efi_wb(efi_memory_desc_t *md)
-{
-	return (md->attribute & EFI_MEMORY_WB);
-}
-
-static inline int
-efi_uc(efi_memory_desc_t *md)
-{
-	return (md->attribute & EFI_MEMORY_UC);
-}
-
-static void
-walk (efi_freemem_callback_t callback, void *arg, u64 attr)
-{
-	kern_memdesc_t *k;
-	u64 start, end, voff;
-
-	voff = (attr == EFI_MEMORY_WB) ? PAGE_OFFSET : __IA64_UNCACHED_OFFSET;
-	for (k = kern_memmap; k->start != ~0UL; k++) {
-		if (k->attribute != attr)
-			continue;
-		start = PAGE_ALIGN(k->start);
-		end = (k->start + (k->num_pages << EFI_PAGE_SHIFT)) & PAGE_MASK;
-		if (start < end)
-			if ((*callback)(start + voff, end + voff, arg) < 0)
-				return;
-	}
-}
-
-/*
- * Walk the EFI memory map and call CALLBACK once for each EFI memory
- * descriptor that has memory that is available for OS use.
- */
-void
-efi_memmap_walk (efi_freemem_callback_t callback, void *arg)
-{
-	walk(callback, arg, EFI_MEMORY_WB);
-}
-
-/*
- * Walk the EFI memory map and call CALLBACK once for each EFI memory
- * descriptor that has memory that is available for uncached allocator.
- */
-void
-efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg)
-{
-	walk(callback, arg, EFI_MEMORY_UC);
-}
-
-/*
- * Look for the PAL_CODE region reported by EFI and map it using an
- * ITR to enable safe PAL calls in virtual mode.  See IA-64 Processor
- * Abstraction Layer chapter 11 in ADAG
- */
-void *
-efi_get_pal_addr (void)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	int pal_code_count = 0;
-	u64 vaddr, mask;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->type != EFI_PAL_CODE)
-			continue;
-
-		if (++pal_code_count > 1) {
-			printk(KERN_ERR "Too many EFI Pal Code memory ranges, "
-			       "dropped @ %llx\n", md->phys_addr);
-			continue;
-		}
-		/*
-		 * The only ITLB entry in region 7 that is used is the one
-		 * installed by __start().  That entry covers a 64MB range.
-		 */
-		mask  = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1);
-		vaddr = PAGE_OFFSET + md->phys_addr;
-
-		/*
-		 * We must check that the PAL mapping won't overlap with the
-		 * kernel mapping.
-		 *
-		 * PAL code is guaranteed to be aligned on a power of 2 between
-		 * 4k and 256KB and that only one ITR is needed to map it. This
-		 * implies that the PAL code is always aligned on its size,
-		 * i.e., the closest matching page size supported by the TLB.
-		 * Therefore PAL code is guaranteed never to cross a 64MB unless
-		 * it is bigger than 64MB (very unlikely!).  So for now the
-		 * following test is enough to determine whether or not we need
-		 * a dedicated ITR for the PAL code.
-		 */
-		if ((vaddr & mask) == (KERNEL_START & mask)) {
-			printk(KERN_INFO "%s: no need to install ITR for PAL code\n",
-			       __func__);
-			continue;
-		}
-
-		if (efi_md_size(md) > IA64_GRANULE_SIZE)
-			panic("Whoa!  PAL code size bigger than a granule!");
-
-#if EFI_DEBUG
-		mask  = ~((1 << IA64_GRANULE_SHIFT) - 1);
-
-		printk(KERN_INFO "CPU %d: mapping PAL code "
-			"[0x%llx-0x%llx) into [0x%llx-0x%llx)\n",
-			smp_processor_id(), md->phys_addr,
-			md->phys_addr + efi_md_size(md),
-			vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE);
-#endif
-		return __va(md->phys_addr);
-	}
-	printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n",
-	       __func__);
-	return NULL;
-}
-
-
-static u8 __init palo_checksum(u8 *buffer, u32 length)
-{
-	u8 sum = 0;
-	u8 *end = buffer + length;
-
-	while (buffer < end)
-		sum = (u8) (sum + *(buffer++));
-
-	return sum;
-}
-
-/*
- * Parse and handle PALO table which is published at:
- * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf
- */
-static void __init handle_palo(unsigned long phys_addr)
-{
-	struct palo_table *palo = __va(phys_addr);
-	u8  checksum;
-
-	if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) {
-		printk(KERN_INFO "PALO signature incorrect.\n");
-		return;
-	}
-
-	checksum = palo_checksum((u8 *)palo, palo->length);
-	if (checksum) {
-		printk(KERN_INFO "PALO checksum incorrect.\n");
-		return;
-	}
-
-	setup_ptcg_sem(palo->max_tlb_purges, NPTCG_FROM_PALO);
-}
-
-void
-efi_map_pal_code (void)
-{
-	void *pal_vaddr = efi_get_pal_addr ();
-	u64 psr;
-
-	if (!pal_vaddr)
-		return;
-
-	/*
-	 * Cannot write to CRx with PSR.ic=1
-	 */
-	psr = ia64_clear_ic();
-	ia64_itr(0x1, IA64_TR_PALCODE,
-		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
-		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
-		 IA64_GRANULE_SHIFT);
-	ia64_set_psr(psr);		/* restore psr */
-}
-
-void __init
-efi_init (void)
-{
-	const efi_system_table_t *efi_systab;
-	void *efi_map_start, *efi_map_end;
-	u64 efi_desc_size;
-	char *cp;
-
-	set_bit(EFI_BOOT, &efi.flags);
-	set_bit(EFI_64BIT, &efi.flags);
-
-	/*
-	 * It's too early to be able to use the standard kernel command line
-	 * support...
-	 */
-	for (cp = boot_command_line; *cp; ) {
-		if (memcmp(cp, "mem=", 4) == 0) {
-			mem_limit = memparse(cp + 4, &cp);
-		} else if (memcmp(cp, "max_addr=", 9) == 0) {
-			max_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
-		} else if (memcmp(cp, "min_addr=", 9) == 0) {
-			min_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp));
-		} else {
-			while (*cp != ' ' && *cp)
-				++cp;
-			while (*cp == ' ')
-				++cp;
-		}
-	}
-	if (min_addr != 0UL)
-		printk(KERN_INFO "Ignoring memory below %lluMB\n",
-		       min_addr >> 20);
-	if (max_addr != ~0UL)
-		printk(KERN_INFO "Ignoring memory above %lluMB\n",
-		       max_addr >> 20);
-
-	efi_systab = __va(ia64_boot_param->efi_systab);
-
-	/*
-	 * Verify the EFI Table
-	 */
-	if (efi_systab == NULL)
-		panic("Whoa! Can't find EFI system table.\n");
-	if (efi_systab_check_header(&efi_systab->hdr))
-		panic("Whoa! EFI system table signature incorrect\n");
-
-	efi_systab_report_header(&efi_systab->hdr, efi_systab->fw_vendor);
-
-	palo_phys      = EFI_INVALID_TABLE_ADDR;
-
-	if (efi_config_parse_tables(__va(efi_systab->tables),
-				    efi_systab->nr_tables,
-				    arch_tables) != 0)
-		return;
-
-	if (palo_phys != EFI_INVALID_TABLE_ADDR)
-		handle_palo(palo_phys);
-
-	runtime = __va(efi_systab->runtime);
-	efi.get_time = phys_get_time;
-	efi.set_time = phys_set_time;
-	efi.get_wakeup_time = phys_get_wakeup_time;
-	efi.set_wakeup_time = phys_set_wakeup_time;
-	efi.get_variable = phys_get_variable;
-	efi.get_next_variable = phys_get_next_variable;
-	efi.set_variable = phys_set_variable;
-	efi.get_next_high_mono_count = phys_get_next_high_mono_count;
-	efi.reset_system = phys_reset_system;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-#if EFI_DEBUG
-	/* print EFI memory map: */
-	{
-		efi_memory_desc_t *md;
-		void *p;
-		unsigned int i;
-
-		for (i = 0, p = efi_map_start; p < efi_map_end;
-		     ++i, p += efi_desc_size)
-		{
-			const char *unit;
-			unsigned long size;
-			char buf[64];
-
-			md = p;
-			size = md->num_pages << EFI_PAGE_SHIFT;
-
-			if ((size >> 40) > 0) {
-				size >>= 40;
-				unit = "TB";
-			} else if ((size >> 30) > 0) {
-				size >>= 30;
-				unit = "GB";
-			} else if ((size >> 20) > 0) {
-				size >>= 20;
-				unit = "MB";
-			} else {
-				size >>= 10;
-				unit = "KB";
-			}
-
-			printk("mem%02d: %s "
-			       "range=[0x%016llx-0x%016llx) (%4lu%s)\n",
-			       i, efi_md_typeattr_format(buf, sizeof(buf), md),
-			       md->phys_addr,
-			       md->phys_addr + efi_md_size(md), size, unit);
-		}
-	}
-#endif
-
-	efi_map_pal_code();
-	efi_enter_virtual_mode();
-}
-
-void
-efi_enter_virtual_mode (void)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	efi_status_t status;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->attribute & EFI_MEMORY_RUNTIME) {
-			/*
-			 * Some descriptors have multiple bits set, so the
-			 * order of the tests is relevant.
-			 */
-			if (md->attribute & EFI_MEMORY_WB) {
-				md->virt_addr = (u64) __va(md->phys_addr);
-			} else if (md->attribute & EFI_MEMORY_UC) {
-				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
-			} else if (md->attribute & EFI_MEMORY_WC) {
-#if 0
-				md->virt_addr = ia64_remap(md->phys_addr,
-							   (_PAGE_A |
-							    _PAGE_P |
-							    _PAGE_D |
-							    _PAGE_MA_WC |
-							    _PAGE_PL_0 |
-							    _PAGE_AR_RW));
-#else
-				printk(KERN_INFO "EFI_MEMORY_WC mapping\n");
-				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
-#endif
-			} else if (md->attribute & EFI_MEMORY_WT) {
-#if 0
-				md->virt_addr = ia64_remap(md->phys_addr,
-							   (_PAGE_A |
-							    _PAGE_P |
-							    _PAGE_D |
-							    _PAGE_MA_WT |
-							    _PAGE_PL_0 |
-							    _PAGE_AR_RW));
-#else
-				printk(KERN_INFO "EFI_MEMORY_WT mapping\n");
-				md->virt_addr = (u64) ioremap(md->phys_addr, 0);
-#endif
-			}
-		}
-	}
-
-	status = efi_call_phys(__va(runtime->set_virtual_address_map),
-			       ia64_boot_param->efi_memmap_size,
-			       efi_desc_size,
-			       ia64_boot_param->efi_memdesc_version,
-			       ia64_boot_param->efi_memmap);
-	if (status != EFI_SUCCESS) {
-		printk(KERN_WARNING "warning: unable to switch EFI into "
-		       "virtual mode (status=%lu)\n", status);
-		return;
-	}
-
-	set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-
-	/*
-	 * Now that EFI is in virtual mode, we call the EFI functions more
-	 * efficiently:
-	 */
-	efi.get_time = virt_get_time;
-	efi.set_time = virt_set_time;
-	efi.get_wakeup_time = virt_get_wakeup_time;
-	efi.set_wakeup_time = virt_set_wakeup_time;
-	efi.get_variable = virt_get_variable;
-	efi.get_next_variable = virt_get_next_variable;
-	efi.set_variable = virt_set_variable;
-	efi.get_next_high_mono_count = virt_get_next_high_mono_count;
-	efi.reset_system = virt_reset_system;
-}
-
-/*
- * Walk the EFI memory map looking for the I/O port range.  There can only be
- * one entry of this type, other I/O port ranges should be described via ACPI.
- */
-u64
-efi_get_iobase (void)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) {
-			if (md->attribute & EFI_MEMORY_UC)
-				return md->phys_addr;
-		}
-	}
-	return 0;
-}
-
-static struct kern_memdesc *
-kern_memory_descriptor (unsigned long phys_addr)
-{
-	struct kern_memdesc *md;
-
-	for (md = kern_memmap; md->start != ~0UL; md++) {
-		if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT))
-			 return md;
-	}
-	return NULL;
-}
-
-static efi_memory_desc_t *
-efi_memory_descriptor (unsigned long phys_addr)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-
-		if (phys_addr - md->phys_addr < efi_md_size(md))
-			 return md;
-	}
-	return NULL;
-}
-
-static int
-efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	unsigned long end;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	end = phys_addr + size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (md->phys_addr < end && efi_md_end(md) > phys_addr)
-			return 1;
-	}
-	return 0;
-}
-
-int
-efi_mem_type (unsigned long phys_addr)
-{
-	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-
-	if (md)
-		return md->type;
-	return -EINVAL;
-}
-
-u64
-efi_mem_attributes (unsigned long phys_addr)
-{
-	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-
-	if (md)
-		return md->attribute;
-	return 0;
-}
-EXPORT_SYMBOL(efi_mem_attributes);
-
-u64
-efi_mem_attribute (unsigned long phys_addr, unsigned long size)
-{
-	unsigned long end = phys_addr + size;
-	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
-	u64 attr;
-
-	if (!md)
-		return 0;
-
-	/*
-	 * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells
-	 * the kernel that firmware needs this region mapped.
-	 */
-	attr = md->attribute & ~EFI_MEMORY_RUNTIME;
-	do {
-		unsigned long md_end = efi_md_end(md);
-
-		if (end <= md_end)
-			return attr;
-
-		md = efi_memory_descriptor(md_end);
-		if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr)
-			return 0;
-	} while (md);
-	return 0;	/* never reached */
-}
-
-u64
-kern_mem_attribute (unsigned long phys_addr, unsigned long size)
-{
-	unsigned long end = phys_addr + size;
-	struct kern_memdesc *md;
-	u64 attr;
-
-	/*
-	 * This is a hack for ioremap calls before we set up kern_memmap.
-	 * Maybe we should do efi_memmap_init() earlier instead.
-	 */
-	if (!kern_memmap) {
-		attr = efi_mem_attribute(phys_addr, size);
-		if (attr & EFI_MEMORY_WB)
-			return EFI_MEMORY_WB;
-		return 0;
-	}
-
-	md = kern_memory_descriptor(phys_addr);
-	if (!md)
-		return 0;
-
-	attr = md->attribute;
-	do {
-		unsigned long md_end = kmd_end(md);
-
-		if (end <= md_end)
-			return attr;
-
-		md = kern_memory_descriptor(md_end);
-		if (!md || md->attribute != attr)
-			return 0;
-	} while (md);
-	return 0;	/* never reached */
-}
-
-int
-valid_phys_addr_range (phys_addr_t phys_addr, unsigned long size)
-{
-	u64 attr;
-
-	/*
-	 * /dev/mem reads and writes use copy_to_user(), which implicitly
-	 * uses a granule-sized kernel identity mapping.  It's really
-	 * only safe to do this for regions in kern_memmap.  For more
-	 * details, see Documentation/arch/ia64/aliasing.rst.
-	 */
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
-		return 1;
-	return 0;
-}
-
-int
-valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size)
-{
-	unsigned long phys_addr = pfn << PAGE_SHIFT;
-	u64 attr;
-
-	attr = efi_mem_attribute(phys_addr, size);
-
-	/*
-	 * /dev/mem mmap uses normal user pages, so we don't need the entire
-	 * granule, but the entire region we're mapping must support the same
-	 * attribute.
-	 */
-	if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC)
-		return 1;
-
-	/*
-	 * Intel firmware doesn't tell us about all the MMIO regions, so
-	 * in general we have to allow mmap requests.  But if EFI *does*
-	 * tell us about anything inside this region, we should deny it.
-	 * The user can always map a smaller region to avoid the overlap.
-	 */
-	if (efi_memmap_intersects(phys_addr, size))
-		return 0;
-
-	return 1;
-}
-
-pgprot_t
-phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size,
-		     pgprot_t vma_prot)
-{
-	unsigned long phys_addr = pfn << PAGE_SHIFT;
-	u64 attr;
-
-	/*
-	 * For /dev/mem mmap, we use user mappings, but if the region is
-	 * in kern_memmap (and hence may be covered by a kernel mapping),
-	 * we must use the same attribute as the kernel mapping.
-	 */
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB)
-		return pgprot_cacheable(vma_prot);
-	else if (attr & EFI_MEMORY_UC)
-		return pgprot_noncached(vma_prot);
-
-	/*
-	 * Some chipsets don't support UC access to memory.  If
-	 * WB is supported, we prefer that.
-	 */
-	if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
-		return pgprot_cacheable(vma_prot);
-
-	return pgprot_noncached(vma_prot);
-}
-
-int __init
-efi_uart_console_only(void)
-{
-	efi_status_t status;
-	char *s, name[] = "ConOut";
-	efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID;
-	efi_char16_t *utf16, name_utf16[32];
-	unsigned char data[1024];
-	unsigned long size = sizeof(data);
-	struct efi_generic_dev_path *hdr, *end_addr;
-	int uart = 0;
-
-	/* Convert to UTF-16 */
-	utf16 = name_utf16;
-	s = name;
-	while (*s)
-		*utf16++ = *s++ & 0x7f;
-	*utf16 = 0;
-
-	status = efi.get_variable(name_utf16, &guid, NULL, &size, data);
-	if (status != EFI_SUCCESS) {
-		printk(KERN_ERR "No EFI %s variable?\n", name);
-		return 0;
-	}
-
-	hdr = (struct efi_generic_dev_path *) data;
-	end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size);
-	while (hdr < end_addr) {
-		if (hdr->type == EFI_DEV_MSG &&
-		    hdr->sub_type == EFI_DEV_MSG_UART)
-			uart = 1;
-		else if (hdr->type == EFI_DEV_END_PATH ||
-			  hdr->type == EFI_DEV_END_PATH2) {
-			if (!uart)
-				return 0;
-			if (hdr->sub_type == EFI_DEV_END_ENTIRE)
-				return 1;
-			uart = 0;
-		}
-		hdr = (struct efi_generic_dev_path *)((u8 *) hdr + hdr->length);
-	}
-	printk(KERN_ERR "Malformed %s value\n", name);
-	return 0;
-}
-
-/*
- * Look for the first granule aligned memory descriptor memory
- * that is big enough to hold EFI memory map. Make sure this
- * descriptor is at least granule sized so it does not get trimmed
- */
-struct kern_memdesc *
-find_memmap_space (void)
-{
-	u64	contig_low=0, contig_high=0;
-	u64	as = 0, ae;
-	void *efi_map_start, *efi_map_end, *p, *q;
-	efi_memory_desc_t *md, *pmd = NULL, *check_md;
-	u64	space_needed, efi_desc_size;
-	unsigned long total_mem = 0;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	/*
-	 * Worst case: we need 3 kernel descriptors for each efi descriptor
-	 * (if every entry has a WB part in the middle, and UC head and tail),
-	 * plus one for the end marker.
-	 */
-	space_needed = sizeof(kern_memdesc_t) *
-		(3 * (ia64_boot_param->efi_memmap_size/efi_desc_size) + 1);
-
-	for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) {
-		md = p;
-		if (!efi_wb(md)) {
-			continue;
-		}
-		if (pmd == NULL || !efi_wb(pmd) ||
-		    efi_md_end(pmd) != md->phys_addr) {
-			contig_low = GRANULEROUNDUP(md->phys_addr);
-			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end;
-			     q += efi_desc_size) {
-				check_md = q;
-				if (!efi_wb(check_md))
-					break;
-				if (contig_high != check_md->phys_addr)
-					break;
-				contig_high = efi_md_end(check_md);
-			}
-			contig_high = GRANULEROUNDDOWN(contig_high);
-		}
-		if (!is_memory_available(md) || md->type == EFI_LOADER_DATA)
-			continue;
-
-		/* Round ends inward to granule boundaries */
-		as = max(contig_low, md->phys_addr);
-		ae = min(contig_high, efi_md_end(md));
-
-		/* keep within max_addr= and min_addr= command line arg */
-		as = max(as, min_addr);
-		ae = min(ae, max_addr);
-		if (ae <= as)
-			continue;
-
-		/* avoid going over mem= command line arg */
-		if (total_mem + (ae - as) > mem_limit)
-			ae -= total_mem + (ae - as) - mem_limit;
-
-		if (ae <= as)
-			continue;
-
-		if (ae - as > space_needed)
-			break;
-	}
-	if (p >= efi_map_end)
-		panic("Can't allocate space for kernel memory descriptors");
-
-	return __va(as);
-}
-
-/*
- * Walk the EFI memory map and gather all memory available for kernel
- * to use.  We can allocate partial granules only if the unavailable
- * parts exist, and are WB.
- */
-unsigned long
-efi_memmap_init(u64 *s, u64 *e)
-{
-	struct kern_memdesc *k, *prev = NULL;
-	u64	contig_low=0, contig_high=0;
-	u64	as, ae, lim;
-	void *efi_map_start, *efi_map_end, *p, *q;
-	efi_memory_desc_t *md, *pmd = NULL, *check_md;
-	u64	efi_desc_size;
-	unsigned long total_mem = 0;
-
-	k = kern_memmap = find_memmap_space();
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) {
-		md = p;
-		if (!efi_wb(md)) {
-			if (efi_uc(md) &&
-			    (md->type == EFI_CONVENTIONAL_MEMORY ||
-			     md->type == EFI_BOOT_SERVICES_DATA)) {
-				k->attribute = EFI_MEMORY_UC;
-				k->start = md->phys_addr;
-				k->num_pages = md->num_pages;
-				k++;
-			}
-			continue;
-		}
-		if (pmd == NULL || !efi_wb(pmd) ||
-		    efi_md_end(pmd) != md->phys_addr) {
-			contig_low = GRANULEROUNDUP(md->phys_addr);
-			contig_high = efi_md_end(md);
-			for (q = p + efi_desc_size; q < efi_map_end;
-			     q += efi_desc_size) {
-				check_md = q;
-				if (!efi_wb(check_md))
-					break;
-				if (contig_high != check_md->phys_addr)
-					break;
-				contig_high = efi_md_end(check_md);
-			}
-			contig_high = GRANULEROUNDDOWN(contig_high);
-		}
-		if (!is_memory_available(md))
-			continue;
-
-		/*
-		 * Round ends inward to granule boundaries
-		 * Give trimmings to uncached allocator
-		 */
-		if (md->phys_addr < contig_low) {
-			lim = min(efi_md_end(md), contig_low);
-			if (efi_uc(md)) {
-				if (k > kern_memmap &&
-				    (k-1)->attribute == EFI_MEMORY_UC &&
-				    kmd_end(k-1) == md->phys_addr) {
-					(k-1)->num_pages +=
-						(lim - md->phys_addr)
-						>> EFI_PAGE_SHIFT;
-				} else {
-					k->attribute = EFI_MEMORY_UC;
-					k->start = md->phys_addr;
-					k->num_pages = (lim - md->phys_addr)
-						>> EFI_PAGE_SHIFT;
-					k++;
-				}
-			}
-			as = contig_low;
-		} else
-			as = md->phys_addr;
-
-		if (efi_md_end(md) > contig_high) {
-			lim = max(md->phys_addr, contig_high);
-			if (efi_uc(md)) {
-				if (lim == md->phys_addr && k > kern_memmap &&
-				    (k-1)->attribute == EFI_MEMORY_UC &&
-				    kmd_end(k-1) == md->phys_addr) {
-					(k-1)->num_pages += md->num_pages;
-				} else {
-					k->attribute = EFI_MEMORY_UC;
-					k->start = lim;
-					k->num_pages = (efi_md_end(md) - lim)
-						>> EFI_PAGE_SHIFT;
-					k++;
-				}
-			}
-			ae = contig_high;
-		} else
-			ae = efi_md_end(md);
-
-		/* keep within max_addr= and min_addr= command line arg */
-		as = max(as, min_addr);
-		ae = min(ae, max_addr);
-		if (ae <= as)
-			continue;
-
-		/* avoid going over mem= command line arg */
-		if (total_mem + (ae - as) > mem_limit)
-			ae -= total_mem + (ae - as) - mem_limit;
-
-		if (ae <= as)
-			continue;
-		if (prev && kmd_end(prev) == md->phys_addr) {
-			prev->num_pages += (ae - as) >> EFI_PAGE_SHIFT;
-			total_mem += ae - as;
-			continue;
-		}
-		k->attribute = EFI_MEMORY_WB;
-		k->start = as;
-		k->num_pages = (ae - as) >> EFI_PAGE_SHIFT;
-		total_mem += ae - as;
-		prev = k++;
-	}
-	k->start = ~0L; /* end-marker */
-
-	/* reserve the memory we are using for kern_memmap */
-	*s = (u64)kern_memmap;
-	*e = (u64)++k;
-
-	return total_mem;
-}
-
-void
-efi_initialize_iomem_resources(struct resource *code_resource,
-			       struct resource *data_resource,
-			       struct resource *bss_resource)
-{
-	struct resource *res;
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	char *name;
-	unsigned long flags, desc;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	res = NULL;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-
-		if (md->num_pages == 0) /* should not happen */
-			continue;
-
-		flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		desc = IORES_DESC_NONE;
-
-		switch (md->type) {
-
-			case EFI_MEMORY_MAPPED_IO:
-			case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
-				continue;
-
-			case EFI_LOADER_CODE:
-			case EFI_LOADER_DATA:
-			case EFI_BOOT_SERVICES_DATA:
-			case EFI_BOOT_SERVICES_CODE:
-			case EFI_CONVENTIONAL_MEMORY:
-				if (md->attribute & EFI_MEMORY_WP) {
-					name = "System ROM";
-					flags |= IORESOURCE_READONLY;
-				} else if (md->attribute == EFI_MEMORY_UC) {
-					name = "Uncached RAM";
-				} else {
-					name = "System RAM";
-					flags |= IORESOURCE_SYSRAM;
-				}
-				break;
-
-			case EFI_ACPI_MEMORY_NVS:
-				name = "ACPI Non-volatile Storage";
-				desc = IORES_DESC_ACPI_NV_STORAGE;
-				break;
-
-			case EFI_UNUSABLE_MEMORY:
-				name = "reserved";
-				flags |= IORESOURCE_DISABLED;
-				break;
-
-			case EFI_PERSISTENT_MEMORY:
-				name = "Persistent Memory";
-				desc = IORES_DESC_PERSISTENT_MEMORY;
-				break;
-
-			case EFI_RESERVED_TYPE:
-			case EFI_RUNTIME_SERVICES_CODE:
-			case EFI_RUNTIME_SERVICES_DATA:
-			case EFI_ACPI_RECLAIM_MEMORY:
-			default:
-				name = "reserved";
-				break;
-		}
-
-		if ((res = kzalloc(sizeof(struct resource),
-				   GFP_KERNEL)) == NULL) {
-			printk(KERN_ERR
-			       "failed to allocate resource for iomem\n");
-			return;
-		}
-
-		res->name = name;
-		res->start = md->phys_addr;
-		res->end = md->phys_addr + efi_md_size(md) - 1;
-		res->flags = flags;
-		res->desc = desc;
-
-		if (insert_resource(&iomem_resource, res) < 0)
-			kfree(res);
-		else {
-			/*
-			 * We don't know which region contains
-			 * kernel data so we try it repeatedly and
-			 * let the resource manager test it.
-			 */
-			insert_resource(res, code_resource);
-			insert_resource(res, data_resource);
-			insert_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
-                        insert_resource(res, &efi_memmap_res);
-                        insert_resource(res, &boot_param_res);
-			if (crashk_res.end > crashk_res.start)
-				insert_resource(res, &crashk_res);
-#endif
-		}
-	}
-}
-
-#ifdef CONFIG_KEXEC
-/* find a block of memory aligned to 64M exclude reserved regions
-   rsvd_regions are sorted
- */
-unsigned long __init
-kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n)
-{
-	int i;
-	u64 start, end;
-	u64 alignment = 1UL << _PAGE_SIZE_64M;
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (!efi_wb(md))
-			continue;
-		start = ALIGN(md->phys_addr, alignment);
-		end = efi_md_end(md);
-		for (i = 0; i < n; i++) {
-			if (__pa(r[i].start) >= start && __pa(r[i].end) < end) {
-				if (__pa(r[i].start) > start + size)
-					return start;
-				start = ALIGN(__pa(r[i].end), alignment);
-				if (i < n-1 &&
-				    __pa(r[i+1].start) < start + size)
-					continue;
-				else
-					break;
-			}
-		}
-		if (end > start + size)
-			return start;
-	}
-
-	printk(KERN_WARNING
-	       "Cannot reserve 0x%lx byte of memory for crashdump\n", size);
-	return ~0UL;
-}
-#endif
-
-#ifdef CONFIG_CRASH_DUMP
-/* locate the size find a the descriptor at a certain address */
-unsigned long __init
-vmcore_find_descriptor_size (unsigned long address)
-{
-	void *efi_map_start, *efi_map_end, *p;
-	efi_memory_desc_t *md;
-	u64 efi_desc_size;
-	unsigned long ret = 0;
-
-	efi_map_start = __va(ia64_boot_param->efi_memmap);
-	efi_map_end   = efi_map_start + ia64_boot_param->efi_memmap_size;
-	efi_desc_size = ia64_boot_param->efi_memdesc_size;
-
-	for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) {
-		md = p;
-		if (efi_wb(md) && md->type == EFI_LOADER_DATA
-		    && md->phys_addr == address) {
-			ret = efi_md_size(md);
-			break;
-		}
-	}
-
-	if (ret == 0)
-		printk(KERN_WARNING "Cannot locate EFI vmcore descriptor\n");
-
-	return ret;
-}
-#endif
-
-char *efi_systab_show_arch(char *str)
-{
-	if (mps_phys != EFI_INVALID_TABLE_ADDR)
-		str += sprintf(str, "MPS=0x%lx\n", mps_phys);
-	if (hcdp_phys != EFI_INVALID_TABLE_ADDR)
-		str += sprintf(str, "HCDP=0x%lx\n", hcdp_phys);
-	return str;
-}
diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S
deleted file mode 100644
index 1fd61b78fb29..000000000000
--- a/arch/ia64/kernel/efi_stub.S
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * EFI call stub.
- *
- * Copyright (C) 1999-2001 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off.  We need this because we can't call SetVirtualMap() until
- * the kernel has booted far enough to allow allocation of struct vm_area_struct
- * entries (which we would need to map stuff with memory attributes other
- * than uncached or writeback...).  Since the GetTime() service gets called
- * earlier than that, we need to be able to make physical mode EFI calls from
- * the kernel.
- */
-
-/*
- * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
- * Abstraction Layer Specification", revision 2.6e).  Note that
- * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
- * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
- * (the br.ia instruction fails unless psr.dfl and psr.dfh are
- * cleared).  Fortunately, SAL promises not to touch the floating
- * point regs, so at least we don't have to save f2-f127.
- */
-#define PSR_BITS_TO_CLEAR						\
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT |		\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	\
-	 IA64_PSR_DFL | IA64_PSR_DFH)
-
-#define PSR_BITS_TO_SET							\
-	(IA64_PSR_BN)
-
-#include <asm/processor.h>
-#include <asm/asmmacro.h>
-
-/*
- * Inputs:
- *	in0 = address of function descriptor of EFI routine to call
- *	in1..in7 = arguments to routine
- *
- * Outputs:
- *	r8 = EFI_STATUS returned by called function
- */
-
-GLOBAL_ENTRY(efi_call_phys)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,7,7,0
-	ld8 r2=[in0],8			// load EFI function's entry point
-	mov loc0=rp
-	.body
-	;;
-	mov loc2=gp			// save global pointer
-	mov loc4=ar.rsc			// save RSE configuration
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	;;
-	ld8 gp=[in0]			// load EFI function's global pointer
-	movl r16=PSR_BITS_TO_CLEAR
-	mov loc3=psr			// save processor status word
-	movl r17=PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17
-	mov b6=r2
-	;;
-	andcm r16=loc3,r16	// get psr with IT, DT, and RT bits cleared
-	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret0:	mov out4=in5
-	mov out0=in1
-	mov out1=in2
-	mov out2=in3
-	mov out3=in4
-	mov out5=in6
-	mov out6=in7
-	mov loc5=r19
-	mov loc6=r20
-	br.call.sptk.many rp=b6		// call the EFI function
-.ret1:	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3
-	mov r19=loc5
-	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-.ret2:	mov ar.rsc=loc4			// restore RSE configuration
-	mov ar.pfs=loc1
-	mov rp=loc0
-	mov gp=loc2
-	br.ret.sptk.many rp
-END(efi_call_phys)
diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c
deleted file mode 100644
index 8895df121540..000000000000
--- a/arch/ia64/kernel/elfcore.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/elf.h>
-#include <linux/coredump.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-
-#include <asm/elf.h>
-
-
-Elf64_Half elf_core_extra_phdrs(struct coredump_params *cprm)
-{
-	return GATE_EHDR->e_phnum;
-}
-
-int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
-	const struct elf_phdr *const gate_phdrs =
-		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-	Elf64_Off ofs = 0;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
-		struct elf_phdr phdr = gate_phdrs[i];
-
-		if (phdr.p_type == PT_LOAD) {
-			phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz);
-			phdr.p_filesz = phdr.p_memsz;
-			if (ofs == 0) {
-				ofs = phdr.p_offset = offset;
-				offset += phdr.p_filesz;
-			} else {
-				phdr.p_offset = ofs;
-			}
-		} else {
-			phdr.p_offset += ofs;
-		}
-		phdr.p_paddr = 0; /* match other core phdrs */
-		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
-			return 0;
-	}
-	return 1;
-}
-
-int elf_core_write_extra_data(struct coredump_params *cprm)
-{
-	const struct elf_phdr *const gate_phdrs =
-		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
-		if (gate_phdrs[i].p_type == PT_LOAD) {
-			void *addr = (void *)gate_phdrs[i].p_vaddr;
-			size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz);
-
-			if (!dump_emit(cprm, addr, memsz))
-				return 0;
-			break;
-		}
-	}
-	return 1;
-}
-
-size_t elf_core_extra_data_size(struct coredump_params *cprm)
-{
-	const struct elf_phdr *const gate_phdrs =
-		(const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-	size_t size = 0;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i) {
-		if (gate_phdrs[i].p_type == PT_LOAD) {
-			size += PAGE_ALIGN(gate_phdrs[i].p_memsz);
-			break;
-		}
-	}
-	return size;
-}
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
deleted file mode 100644
index ac06d44b9b27..000000000000
--- a/arch/ia64/kernel/entry.S
+++ /dev/null
@@ -1,1427 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/ia64/kernel/entry.S
- *
- * Kernel entry points.
- *
- * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999, 2002-2003
- *	Asit Mallick <Asit.K.Mallick@intel.com>
- * 	Don Dugger <Don.Dugger@intel.com>
- *	Suresh Siddha <suresh.b.siddha@intel.com>
- *	Fenghua Yu <fenghua.yu@intel.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- */
-/*
- * ia64_switch_to now places correct virtual mapping in in TR2 for
- * kernel stack. This allows us to handle interrupts without changing
- * to physical mode.
- *
- * Jonathan Nicklin	<nicklin@missioncriticallinux.com>
- * Patrick O'Rourke	<orourke@missioncriticallinux.com>
- * 11/07/2000
- */
-/*
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *                    pv_ops.
- */
-/*
- * Global (preserved) predicate usage on syscall entry/exit path:
- *
- *	pKStk:		See entry.h.
- *	pUStk:		See entry.h.
- *	pSys:		See entry.h.
- *	pNonSys:	!pSys
- */
-
-#include <linux/export.h>
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/cache.h>
-#include <asm/errno.h>
-#include <asm/kregs.h>
-#include <asm/asm-offsets.h>
-#include <asm/percpu.h>
-#include <asm/processor.h>
-#include <asm/thread_info.h>
-#include <asm/unistd.h>
-#include <asm/ftrace.h>
-
-#include "minstate.h"
-
-	/*
-	 * execve() is special because in case of success, we need to
-	 * setup a null register window frame.
-	 */
-ENTRY(ia64_execve)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,3,0
-	mov loc0=rp
-	.body
-	mov out0=in0			// filename
-	;;				// stop bit between alloc and call
-	mov out1=in1			// argv
-	mov out2=in2			// envp
-	br.call.sptk.many rp=sys_execve
-.ret0:
-	cmp4.ge p6,p7=r8,r0
-	mov ar.pfs=loc1			// restore ar.pfs
-	sxt4 r8=r8			// return 64-bit result
-	;;
-	stf.spill [sp]=f0
-	mov rp=loc0
-(p6)	mov ar.pfs=r0			// clear ar.pfs on success
-(p7)	br.ret.sptk.many rp
-
-	/*
-	 * In theory, we'd have to zap this state only to prevent leaking of
-	 * security sensitive state (e.g., if current->mm->dumpable is zero).  However,
-	 * this executes in less than 20 cycles even on Itanium, so it's not worth
-	 * optimizing for...).
-	 */
-	mov ar.unat=0; 		mov ar.lc=0
-	mov r4=0;		mov f2=f0;		mov b1=r0
-	mov r5=0;		mov f3=f0;		mov b2=r0
-	mov r6=0;		mov f4=f0;		mov b3=r0
-	mov r7=0;		mov f5=f0;		mov b4=r0
-	ldf.fill f12=[sp];	mov f13=f0;		mov b5=r0
-	ldf.fill f14=[sp];	ldf.fill f15=[sp];	mov f16=f0
-	ldf.fill f17=[sp];	ldf.fill f18=[sp];	mov f19=f0
-	ldf.fill f20=[sp];	ldf.fill f21=[sp];	mov f22=f0
-	ldf.fill f23=[sp];	ldf.fill f24=[sp];	mov f25=f0
-	ldf.fill f26=[sp];	ldf.fill f27=[sp];	mov f28=f0
-	ldf.fill f29=[sp];	ldf.fill f30=[sp];	mov f31=f0
-	br.ret.sptk.many rp
-END(ia64_execve)
-
-/*
- * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr,
- *	      u64 tls)
- */
-GLOBAL_ENTRY(sys_clone2)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc r16=ar.pfs,8,2,6,0
-	DO_SAVE_SWITCH_STACK
-	mov loc0=rp
-	mov loc1=r16                             // save ar.pfs across ia64_clone
-	.body
-	mov out0=in0
-	mov out1=in1
-	mov out2=in2
-	mov out3=in3
-	mov out4=in4
-	mov out5=in5
-	br.call.sptk.many rp=ia64_clone
-.ret1:	.restore sp
-	adds sp=IA64_SWITCH_STACK_SIZE,sp	// pop the switch stack
-	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(sys_clone2)
-
-/*
- * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls)
- *	Deprecated.  Use sys_clone2() instead.
- */
-GLOBAL_ENTRY(sys_clone)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc r16=ar.pfs,8,2,6,0
-	DO_SAVE_SWITCH_STACK
-	mov loc0=rp
-	mov loc1=r16                             // save ar.pfs across ia64_clone
-	.body
-	mov out0=in0
-	mov out1=in1
-	mov out2=16				// stacksize (compensates for 16-byte scratch area)
-	mov out3=in3
-	mov out4=in4
-	mov out5=in5
-	br.call.sptk.many rp=ia64_clone
-.ret2:	.restore sp
-	adds sp=IA64_SWITCH_STACK_SIZE,sp	// pop the switch stack
-	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(sys_clone)
-
-/*
- * prev_task <- ia64_switch_to(struct task_struct *next)
- *	With Ingo's new scheduler, interrupts are disabled when this routine gets
- *	called.  The code starting at .map relies on this.  The rest of the code
- *	doesn't care about the interrupt masking status.
- */
-GLOBAL_ENTRY(ia64_switch_to)
-	.prologue
-	alloc r16=ar.pfs,1,0,0,0
-	DO_SAVE_SWITCH_STACK
-	.body
-
-	adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
-	movl r25=init_task
-	mov r27=IA64_KR(CURRENT_STACK)
-	adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
-	dep r20=0,in0,61,3		// physical address of "next"
-	;;
-	st8 [r22]=sp			// save kernel stack pointer of old task
-	shr.u r26=r20,IA64_GRANULE_SHIFT
-	cmp.eq p7,p6=r25,in0
-	;;
-	/*
-	 * If we've already mapped this task's page, we can skip doing it again.
-	 */
-(p6)	cmp.eq p7,p6=r26,r27
-(p6)	br.cond.dpnt .map
-	;;
-.done:
-	ld8 sp=[r21]			// load kernel stack pointer of new task
-	MOV_TO_KR(CURRENT, in0, r8, r9)		// update "current" application register
-	mov r8=r13			// return pointer to previously running task
-	mov r13=in0			// set "current" pointer
-	;;
-	DO_LOAD_SWITCH_STACK
-
-#ifdef CONFIG_SMP
-	sync.i				// ensure "fc"s done by this CPU are visible on other CPUs
-#endif
-	br.ret.sptk.many rp		// boogie on out in new context
-
-.map:
-	RSM_PSR_IC(r25)			// interrupts (psr.i) are already disabled here
-	movl r25=PAGE_KERNEL
-	;;
-	srlz.d
-	or r23=r25,r20			// construct PA | page properties
-	mov r25=IA64_GRANULE_SHIFT<<2
-	;;
-	MOV_TO_ITIR(p0, r25, r8)
-	MOV_TO_IFA(in0, r8)		// VA of next task...
-	;;
-	mov r25=IA64_TR_CURRENT_STACK
-	MOV_TO_KR(CURRENT_STACK, r26, r8, r9)	// remember last page we mapped...
-	;;
-	itr.d dtr[r25]=r23		// wire in new mapping...
-	SSM_PSR_IC_AND_SRLZ_D(r8, r9)	// reenable the psr.ic bit
-	br.cond.sptk .done
-END(ia64_switch_to)
-
-/*
- * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
- * means that we may get an interrupt with "sp" pointing to the new kernel stack while
- * ar.bspstore is still pointing to the old kernel backing store area.  Since ar.rsc,
- * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a
- * problem.  Also, we don't need to specify unwind information for preserved registers
- * that are not modified in save_switch_stack as the right unwind information is already
- * specified at the call-site of save_switch_stack.
- */
-
-/*
- * save_switch_stack:
- *	- r16 holds ar.pfs
- *	- b7 holds address to return to
- *	- rp (b0) holds return address to save
- */
-GLOBAL_ENTRY(save_switch_stack)
-	.prologue
-	.altrp b7
-	flushrs			// flush dirty regs to backing store (must be first in insn group)
-	.save @priunat,r17
-	mov r17=ar.unat		// preserve caller's
-	.body
-#ifdef CONFIG_ITANIUM
-	adds r2=16+128,sp
-	adds r3=16+64,sp
-	adds r14=SW(R4)+16,sp
-	;;
-	st8.spill [r14]=r4,16		// spill r4
-	lfetch.fault.excl.nt1 [r3],128
-	;;
-	lfetch.fault.excl.nt1 [r2],128
-	lfetch.fault.excl.nt1 [r3],128
-	;;
-	lfetch.fault.excl [r2]
-	lfetch.fault.excl [r3]
-	adds r15=SW(R5)+16,sp
-#else
-	add r2=16+3*128,sp
-	add r3=16,sp
-	add r14=SW(R4)+16,sp
-	;;
-	st8.spill [r14]=r4,SW(R6)-SW(R4)	// spill r4 and prefetch offset 0x1c0
-	lfetch.fault.excl.nt1 [r3],128	//		prefetch offset 0x010
-	;;
-	lfetch.fault.excl.nt1 [r3],128	//		prefetch offset 0x090
-	lfetch.fault.excl.nt1 [r2],128	//		prefetch offset 0x190
-	;;
-	lfetch.fault.excl.nt1 [r3]	//		prefetch offset 0x110
-	lfetch.fault.excl.nt1 [r2]	//		prefetch offset 0x210
-	adds r15=SW(R5)+16,sp
-#endif
-	;;
-	st8.spill [r15]=r5,SW(R7)-SW(R5)	// spill r5
-	mov.m ar.rsc=0			// put RSE in mode: enforced lazy, little endian, pl 0
-	add r2=SW(F2)+16,sp		// r2 = &sw->f2
-	;;
-	st8.spill [r14]=r6,SW(B0)-SW(R6)	// spill r6
-	mov.m r18=ar.fpsr		// preserve fpsr
-	add r3=SW(F3)+16,sp		// r3 = &sw->f3
-	;;
-	stf.spill [r2]=f2,32
-	mov.m r19=ar.rnat
-	mov r21=b0
-
-	stf.spill [r3]=f3,32
-	st8.spill [r15]=r7,SW(B2)-SW(R7)	// spill r7
-	mov r22=b1
-	;;
-	// since we're done with the spills, read and save ar.unat:
-	mov.m r29=ar.unat
-	mov.m r20=ar.bspstore
-	mov r23=b2
-	stf.spill [r2]=f4,32
-	stf.spill [r3]=f5,32
-	mov r24=b3
-	;;
-	st8 [r14]=r21,SW(B1)-SW(B0)		// save b0
-	st8 [r15]=r23,SW(B3)-SW(B2)		// save b2
-	mov r25=b4
-	mov r26=b5
-	;;
-	st8 [r14]=r22,SW(B4)-SW(B1)		// save b1
-	st8 [r15]=r24,SW(AR_PFS)-SW(B3)		// save b3
-	mov r21=ar.lc		// I-unit
-	stf.spill [r2]=f12,32
-	stf.spill [r3]=f13,32
-	;;
-	st8 [r14]=r25,SW(B5)-SW(B4)		// save b4
-	st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS)	// save ar.pfs
-	stf.spill [r2]=f14,32
-	stf.spill [r3]=f15,32
-	;;
-	st8 [r14]=r26				// save b5
-	st8 [r15]=r21				// save ar.lc
-	stf.spill [r2]=f16,32
-	stf.spill [r3]=f17,32
-	;;
-	stf.spill [r2]=f18,32
-	stf.spill [r3]=f19,32
-	;;
-	stf.spill [r2]=f20,32
-	stf.spill [r3]=f21,32
-	;;
-	stf.spill [r2]=f22,32
-	stf.spill [r3]=f23,32
-	;;
-	stf.spill [r2]=f24,32
-	stf.spill [r3]=f25,32
-	;;
-	stf.spill [r2]=f26,32
-	stf.spill [r3]=f27,32
-	;;
-	stf.spill [r2]=f28,32
-	stf.spill [r3]=f29,32
-	;;
-	stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30)
-	stf.spill [r3]=f31,SW(PR)-SW(F31)
-	add r14=SW(CALLER_UNAT)+16,sp
-	;;
-	st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT)	// save ar.unat
-	st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat
-	mov r21=pr
-	;;
-	st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat
-	st8 [r3]=r21				// save predicate registers
-	;;
-	st8 [r2]=r20				// save ar.bspstore
-	st8 [r14]=r18				// save fpsr
-	mov ar.rsc=3		// put RSE back into eager mode, pl 0
-	br.cond.sptk.many b7
-END(save_switch_stack)
-
-/*
- * load_switch_stack:
- *	- "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK)
- *	- b7 holds address to return to
- *	- must not touch r8-r11
- */
-GLOBAL_ENTRY(load_switch_stack)
-	.prologue
-	.altrp b7
-
-	.body
-	lfetch.fault.nt1 [sp]
-	adds r2=SW(AR_BSPSTORE)+16,sp
-	adds r3=SW(AR_UNAT)+16,sp
-	mov ar.rsc=0						// put RSE into enforced lazy mode
-	adds r14=SW(CALLER_UNAT)+16,sp
-	adds r15=SW(AR_FPSR)+16,sp
-	;;
-	ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE))	// bspstore
-	ld8 r29=[r3],(SW(B1)-SW(AR_UNAT))	// unat
-	;;
-	ld8 r21=[r2],16		// restore b0
-	ld8 r22=[r3],16		// restore b1
-	;;
-	ld8 r23=[r2],16		// restore b2
-	ld8 r24=[r3],16		// restore b3
-	;;
-	ld8 r25=[r2],16		// restore b4
-	ld8 r26=[r3],16		// restore b5
-	;;
-	ld8 r16=[r2],(SW(PR)-SW(AR_PFS))	// ar.pfs
-	ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC))	// ar.lc
-	;;
-	ld8 r28=[r2]		// restore pr
-	ld8 r30=[r3]		// restore rnat
-	;;
-	ld8 r18=[r14],16	// restore caller's unat
-	ld8 r19=[r15],24	// restore fpsr
-	;;
-	ldf.fill f2=[r14],32
-	ldf.fill f3=[r15],32
-	;;
-	ldf.fill f4=[r14],32
-	ldf.fill f5=[r15],32
-	;;
-	ldf.fill f12=[r14],32
-	ldf.fill f13=[r15],32
-	;;
-	ldf.fill f14=[r14],32
-	ldf.fill f15=[r15],32
-	;;
-	ldf.fill f16=[r14],32
-	ldf.fill f17=[r15],32
-	;;
-	ldf.fill f18=[r14],32
-	ldf.fill f19=[r15],32
-	mov b0=r21
-	;;
-	ldf.fill f20=[r14],32
-	ldf.fill f21=[r15],32
-	mov b1=r22
-	;;
-	ldf.fill f22=[r14],32
-	ldf.fill f23=[r15],32
-	mov b2=r23
-	;;
-	mov ar.bspstore=r27
-	mov ar.unat=r29		// establish unat holding the NaT bits for r4-r7
-	mov b3=r24
-	;;
-	ldf.fill f24=[r14],32
-	ldf.fill f25=[r15],32
-	mov b4=r25
-	;;
-	ldf.fill f26=[r14],32
-	ldf.fill f27=[r15],32
-	mov b5=r26
-	;;
-	ldf.fill f28=[r14],32
-	ldf.fill f29=[r15],32
-	mov ar.pfs=r16
-	;;
-	ldf.fill f30=[r14],32
-	ldf.fill f31=[r15],24
-	mov ar.lc=r17
-	;;
-	ld8.fill r4=[r14],16
-	ld8.fill r5=[r15],16
-	mov pr=r28,-1
-	;;
-	ld8.fill r6=[r14],16
-	ld8.fill r7=[r15],16
-
-	mov ar.unat=r18				// restore caller's unat
-	mov ar.rnat=r30				// must restore after bspstore but before rsc!
-	mov ar.fpsr=r19				// restore fpsr
-	mov ar.rsc=3				// put RSE back into eager mode, pl 0
-	br.cond.sptk.many b7
-END(load_switch_stack)
-
-	/*
-	 * Invoke a system call, but do some tracing before and after the call.
-	 * We MUST preserve the current register frame throughout this routine
-	 * because some system calls (such as ia64_execve) directly
-	 * manipulate ar.pfs.
-	 */
-GLOBAL_ENTRY(ia64_trace_syscall)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * We need to preserve the scratch registers f6-f11 in case the system
-	 * call is sigreturn.
-	 */
-	adds r16=PT(F6)+16,sp
-	adds r17=PT(F7)+16,sp
-	;;
- 	stf.spill [r16]=f6,32
- 	stf.spill [r17]=f7,32
-	;;
- 	stf.spill [r16]=f8,32
- 	stf.spill [r17]=f9,32
-	;;
- 	stf.spill [r16]=f10
- 	stf.spill [r17]=f11
-	br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args
-	cmp.lt p6,p0=r8,r0			// check tracehook
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	mov r10=0
-(p6)	br.cond.sptk strace_error		// syscall failed ->
-	adds r16=PT(F6)+16,sp
-	adds r17=PT(F7)+16,sp
-	;;
-	ldf.fill f6=[r16],32
-	ldf.fill f7=[r17],32
-	;;
-	ldf.fill f8=[r16],32
-	ldf.fill f9=[r17],32
-	;;
-	ldf.fill f10=[r16]
-	ldf.fill f11=[r17]
-	// the syscall number may have changed, so re-load it and re-calculate the
-	// syscall entry-point:
-	adds r15=PT(R15)+16,sp			// r15 = &pt_regs.r15 (syscall #)
-	;;
-	ld8 r15=[r15]
-	mov r3=NR_syscalls - 1
-	;;
-	adds r15=-1024,r15
-	movl r16=sys_call_table
-	;;
-	shladd r20=r15,3,r16			// r20 = sys_call_table + 8*(syscall-1024)
-	cmp.leu p6,p7=r15,r3
-	;;
-(p6)	ld8 r20=[r20]				// load address of syscall entry point
-(p7)	movl r20=sys_ni_syscall
-	;;
-	mov b6=r20
-	br.call.sptk.many rp=b6			// do the syscall
-.strace_check_retval:
-	cmp.lt p6,p0=r8,r0			// syscall failed?
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	adds r3=PT(R10)+16,sp			// r3 = &pt_regs.r10
-	mov r10=0
-(p6)	br.cond.sptk strace_error		// syscall failed ->
-	;;					// avoid RAW on r10
-.strace_save_retval:
-.mem.offset 0,0; st8.spill [r2]=r8		// store return value in slot for r8
-.mem.offset 8,0; st8.spill [r3]=r10		// clear error indication in slot for r10
-	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-.ret3:
-(pUStk)	cmp.eq.unc p6,p0=r0,r0			// p6 <- pUStk
-(pUStk)	rsm psr.i				// disable interrupts
-	br.cond.sptk ia64_work_pending_syscall_end
-
-strace_error:
-	ld8 r3=[r2]				// load pt_regs.r8
-	sub r9=0,r8				// negate return value to get errno value
-	;;
-	cmp.ne p6,p0=r3,r0			// is pt_regs.r8!=0?
-	adds r3=16,r2				// r3=&pt_regs.r10
-	;;
-(p6)	mov r10=-1
-(p6)	mov r8=r9
-	br.cond.sptk .strace_save_retval
-END(ia64_trace_syscall)
-
-	/*
-	 * When traced and returning from sigreturn, we invoke syscall_trace but then
-	 * go straight to ia64_leave_kernel rather than ia64_leave_syscall.
-	 */
-GLOBAL_ENTRY(ia64_strace_leave_kernel)
-	PT_REGS_UNWIND_INFO(0)
-{	/*
-	 * Some versions of gas generate bad unwind info if the first instruction of a
-	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
-	 */
-	nop.m 0
-	nop.i 0
-	br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value
-}
-.ret4:	br.cond.sptk ia64_leave_kernel
-END(ia64_strace_leave_kernel)
-
-ENTRY(call_payload)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(0)
-	/* call the kernel_thread payload; fn is in r4, arg - in r5 */
-	alloc loc1=ar.pfs,0,3,1,0
-	mov loc0=rp
-	mov loc2=gp
-	mov out0=r5		// arg
-	ld8 r14 = [r4], 8	// fn.address
-	;;
-	mov b6 = r14
-	ld8 gp = [r4]		// fn.gp
-	;;
-	br.call.sptk.many rp=b6	// fn(arg)
-.ret12:	mov gp=loc2
-	mov rp=loc0
-	mov ar.pfs=loc1
-	/* ... and if it has returned, we are going to userland */
-	cmp.ne pKStk,pUStk=r0,r0
-	br.ret.sptk.many rp
-END(call_payload)
-
-GLOBAL_ENTRY(ia64_ret_from_clone)
-	PT_REGS_UNWIND_INFO(0)
-{	/*
-	 * Some versions of gas generate bad unwind info if the first instruction of a
-	 * procedure doesn't go into the first slot of a bundle.  This is a workaround.
-	 */
-	nop.m 0
-	nop.i 0
-	/*
-	 * We need to call schedule_tail() to complete the scheduling process.
-	 * Called by ia64_switch_to() after ia64_clone()->copy_thread().  r8 contains the
-	 * address of the previously executing task.
-	 */
-	br.call.sptk.many rp=ia64_invoke_schedule_tail
-}
-.ret8:
-(pKStk)	br.call.sptk.many rp=call_payload
-	adds r2=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-	ld4 r2=[r2]
-	;;
-	mov r8=0
-	and r2=_TIF_SYSCALL_TRACEAUDIT,r2
-	;;
-	cmp.ne p6,p0=r2,r0
-(p6)	br.cond.spnt .strace_check_retval
-	;;					// added stop bits to prevent r8 dependency
-END(ia64_ret_from_clone)
-	// fall through
-GLOBAL_ENTRY(ia64_ret_from_syscall)
-	PT_REGS_UNWIND_INFO(0)
-	cmp.ge p6,p7=r8,r0			// syscall executed successfully?
-	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
-	mov r10=r0				// clear error indication in r10
-(p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
-END(ia64_ret_from_syscall)
-	// fall through
-
-/*
- * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
- *	need to switch to bank 0 and doesn't restore the scratch registers.
- *	To avoid leaking kernel bits, the scratch registers are set to
- *	the following known-to-be-safe values:
- *
- *		  r1: restored (global pointer)
- *		  r2: cleared
- *		  r3: 1 (when returning to user-level)
- *	      r8-r11: restored (syscall return value(s))
- *		 r12: restored (user-level stack pointer)
- *		 r13: restored (user-level thread pointer)
- *		 r14: set to __kernel_syscall_via_epc
- *		 r15: restored (syscall #)
- *	     r16-r17: cleared
- *		 r18: user-level b6
- *		 r19: cleared
- *		 r20: user-level ar.fpsr
- *		 r21: user-level b0
- *		 r22: cleared
- *		 r23: user-level ar.bspstore
- *		 r24: user-level ar.rnat
- *		 r25: user-level ar.unat
- *		 r26: user-level ar.pfs
- *		 r27: user-level ar.rsc
- *		 r28: user-level ip
- *		 r29: user-level psr
- *		 r30: user-level cfm
- *		 r31: user-level pr
- *	      f6-f11: cleared
- *		  pr: restored (user-level pr)
- *		  b0: restored (user-level rp)
- *	          b6: restored
- *		  b7: set to __kernel_syscall_via_epc
- *	     ar.unat: restored (user-level ar.unat)
- *	      ar.pfs: restored (user-level ar.pfs)
- *	      ar.rsc: restored (user-level ar.rsc)
- *	     ar.rnat: restored (user-level ar.rnat)
- *	 ar.bspstore: restored (user-level ar.bspstore)
- *	     ar.fpsr: restored (user-level ar.fpsr)
- *	      ar.ccv: cleared
- *	      ar.csd: cleared
- *	      ar.ssd: cleared
- */
-GLOBAL_ENTRY(ia64_leave_syscall)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
-	 * user- or fsys-mode, hence we disable interrupts early on.
-	 *
-	 * p6 controls whether current_thread_info()->flags needs to be check for
-	 * extra work.  We always check for extra work when returning to user-level.
-	 * With CONFIG_PREEMPTION, we also check for extra work when the preempt_count
-	 * is 0.  After extra work processing has been completed, execution
-	 * resumes at ia64_work_processed_syscall with p6 set to 1 if the extra-work-check
-	 * needs to be redone.
-	 */
-#ifdef CONFIG_PREEMPTION
-	RSM_PSR_I(p0, r2, r18)			// disable interrupts
-	cmp.eq pLvSys,p0=r0,r0			// pLvSys=1: leave from syscall
-(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
-	;;
-	.pred.rel.mutex pUStk,pKStk
-(pKStk) ld4 r21=[r20]			// r21 <- preempt_count
-(pUStk)	mov r21=0			// r21 <- 0
-	;;
-	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
-#else /* !CONFIG_PREEMPTION */
-	RSM_PSR_I(pUStk, r2, r18)
-	cmp.eq pLvSys,p0=r0,r0		// pLvSys=1: leave from syscall
-(pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
-#endif
-.global ia64_work_processed_syscall;
-ia64_work_processed_syscall:
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	adds r2=PT(LOADRS)+16,r12
-	MOV_FROM_ITC(pUStk, p9, r22, r19)	// fetch time at leave
-	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-(p6)	ld4 r31=[r18]				// load current_thread_info()->flags
-	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
-	adds r3=PT(AR_BSPSTORE)+16,r12		// deferred
-	;;
-#else
-	adds r2=PT(LOADRS)+16,r12
-	adds r3=PT(AR_BSPSTORE)+16,r12
-	adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-(p6)	ld4 r31=[r18]				// load current_thread_info()->flags
-	ld8 r19=[r2],PT(B6)-PT(LOADRS)		// load ar.rsc value for "loadrs"
-	nop.i 0
-	;;
-#endif
-	mov r16=ar.bsp				// M2  get existing backing store pointer
-	ld8 r18=[r2],PT(R9)-PT(B6)		// load b6
-(p6)	and r15=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
-	;;
-	ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE)	// load ar.bspstore (may be garbage)
-(p6)	cmp4.ne.unc p6,p0=r15, r0		// any special work pending?
-(p6)	br.cond.spnt .work_pending_syscall
-	;;
-	// start restoring the state saved on the kernel stack (struct pt_regs):
-	ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
-	ld8 r11=[r3],PT(CR_IIP)-PT(R11)
-(pNonSys) break 0		//      bug check: we shouldn't be here if pNonSys is TRUE!
-	;;
-	invala			// M0|1 invalidate ALAT
-	RSM_PSR_I_IC(r28, r29, r30)	// M2   turn off interrupts and interruption collection
-	cmp.eq p9,p0=r0,r0	// A    set p9 to indicate that we should restore cr.ifs
-
-	ld8 r29=[r2],16		// M0|1 load cr.ipsr
-	ld8 r28=[r3],16		// M0|1 load cr.iip
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13
-	;;
-	ld8 r30=[r2],16		// M0|1 load cr.ifs
-	ld8 r25=[r3],16		// M0|1 load ar.unat
-(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
-	;;
-#else
-	mov r22=r0		// A    clear r22
-	;;
-	ld8 r30=[r2],16		// M0|1 load cr.ifs
-	ld8 r25=[r3],16		// M0|1 load ar.unat
-(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
-	;;
-#endif
-	ld8 r26=[r2],PT(B0)-PT(AR_PFS)	// M0|1 load ar.pfs
-	MOV_FROM_PSR(pKStk, r22, r21)	// M2   read PSR now that interrupts are disabled
-	nop 0
-	;;
-	ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0
-	ld8 r27=[r3],PT(PR)-PT(AR_RSC)	// M0|1 load ar.rsc
-	mov f6=f0			// F    clear f6
-	;;
-	ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT)	// M0|1 load ar.rnat (may be garbage)
-	ld8 r31=[r3],PT(R1)-PT(PR)		// M0|1 load predicates
-	mov f7=f0				// F    clear f7
-	;;
-	ld8 r20=[r2],PT(R12)-PT(AR_FPSR)	// M0|1 load ar.fpsr
-	ld8.fill r1=[r3],16			// M0|1 load r1
-(pUStk) mov r17=1				// A
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk) st1 [r15]=r17				// M2|3
-#else
-(pUStk) st1 [r14]=r17				// M2|3
-#endif
-	ld8.fill r13=[r3],16			// M0|1
-	mov f8=f0				// F    clear f8
-	;;
-	ld8.fill r12=[r2]			// M0|1 restore r12 (sp)
-	ld8.fill r15=[r3]			// M0|1 restore r15
-	mov b6=r18				// I0   restore b6
-
-	LOAD_PHYS_STACK_REG_SIZE(r17)
-	mov f9=f0					// F    clear f9
-(pKStk) br.cond.dpnt.many skip_rbs_switch		// B
-
-	srlz.d				// M0   ensure interruption collection is off (for cover)
-	shr.u r18=r19,16		// I0|1 get byte size of existing "dirty" partition
-	COVER				// B    add current frame into dirty partition & set cr.ifs
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	mov r19=ar.bsp			// M2   get new backing store pointer
-	st8 [r14]=r22			// M	save time at leave
-	mov f10=f0			// F    clear f10
-
-	mov r22=r0			// A	clear r22
-	movl r14=__kernel_syscall_via_epc // X
-	;;
-#else
-	mov r19=ar.bsp			// M2   get new backing store pointer
-	mov f10=f0			// F    clear f10
-
-	nop.m 0
-	movl r14=__kernel_syscall_via_epc // X
-	;;
-#endif
-	mov.m ar.csd=r0			// M2   clear ar.csd
-	mov.m ar.ccv=r0			// M2   clear ar.ccv
-	mov b7=r14			// I0   clear b7 (hint with __kernel_syscall_via_epc)
-
-	mov.m ar.ssd=r0			// M2   clear ar.ssd
-	mov f11=f0			// F    clear f11
-	br.cond.sptk.many rbs_switch	// B
-END(ia64_leave_syscall)
-
-GLOBAL_ENTRY(ia64_leave_kernel)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
-	 * user- or fsys-mode, hence we disable interrupts early on.
-	 *
-	 * p6 controls whether current_thread_info()->flags needs to be check for
-	 * extra work.  We always check for extra work when returning to user-level.
-	 * With CONFIG_PREEMPTION, we also check for extra work when the preempt_count
-	 * is 0.  After extra work processing has been completed, execution
-	 * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check
-	 * needs to be redone.
-	 */
-#ifdef CONFIG_PREEMPTION
-	RSM_PSR_I(p0, r17, r31)			// disable interrupts
-	cmp.eq p0,pLvSys=r0,r0			// pLvSys=0: leave from kernel
-(pKStk)	adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
-	;;
-	.pred.rel.mutex pUStk,pKStk
-(pKStk)	ld4 r21=[r20]			// r21 <- preempt_count
-(pUStk)	mov r21=0			// r21 <- 0
-	;;
-	cmp.eq p6,p0=r21,r0		// p6 <- pUStk || (preempt_count == 0)
-#else
-	RSM_PSR_I(pUStk, r17, r31)
-	cmp.eq p0,pLvSys=r0,r0		// pLvSys=0: leave from kernel
-(pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
-#endif
-.work_processed_kernel:
-	adds r17=TI_FLAGS+IA64_TASK_SIZE,r13
-	;;
-(p6)	ld4 r31=[r17]				// load current_thread_info()->flags
-	adds r21=PT(PR)+16,r12
-	;;
-
-	lfetch [r21],PT(CR_IPSR)-PT(PR)
-	adds r2=PT(B6)+16,r12
-	adds r3=PT(R16)+16,r12
-	;;
-	lfetch [r21]
-	ld8 r28=[r2],8		// load b6
-	adds r29=PT(R24)+16,r12
-
-	ld8.fill r16=[r3],PT(AR_CSD)-PT(R16)
-	adds r30=PT(AR_CCV)+16,r12
-(p6)	and r19=TIF_WORK_MASK,r31		// any work other than TIF_SYSCALL_TRACE?
-	;;
-	ld8.fill r24=[r29]
-	ld8 r15=[r30]		// load ar.ccv
-(p6)	cmp4.ne.unc p6,p0=r19, r0		// any special work pending?
-	;;
-	ld8 r29=[r2],16		// load b7
-	ld8 r30=[r3],16		// load ar.csd
-(p6)	br.cond.spnt .work_pending
-	;;
-	ld8 r31=[r2],16		// load ar.ssd
-	ld8.fill r8=[r3],16
-	;;
-	ld8.fill r9=[r2],16
-	ld8.fill r10=[r3],PT(R17)-PT(R10)
-	;;
-	ld8.fill r11=[r2],PT(R18)-PT(R11)
-	ld8.fill r17=[r3],16
-	;;
-	ld8.fill r18=[r2],16
-	ld8.fill r19=[r3],16
-	;;
-	ld8.fill r20=[r2],16
-	ld8.fill r21=[r3],16
-	mov ar.csd=r30
-	mov ar.ssd=r31
-	;;
-	RSM_PSR_I_IC(r23, r22, r25)	// initiate turning off of interrupt and interruption collection
-	invala			// invalidate ALAT
-	;;
-	ld8.fill r22=[r2],24
-	ld8.fill r23=[r3],24
-	mov b6=r28
-	;;
-	ld8.fill r25=[r2],16
-	ld8.fill r26=[r3],16
-	mov b7=r29
-	;;
-	ld8.fill r27=[r2],16
-	ld8.fill r28=[r3],16
-	;;
-	ld8.fill r29=[r2],16
-	ld8.fill r30=[r3],24
-	;;
-	ld8.fill r31=[r2],PT(F9)-PT(R31)
-	adds r3=PT(F10)-PT(F6),r3
-	;;
-	ldf.fill f9=[r2],PT(F6)-PT(F9)
-	ldf.fill f10=[r3],PT(F8)-PT(F10)
-	;;
-	ldf.fill f6=[r2],PT(F7)-PT(F6)
-	;;
-	ldf.fill f7=[r2],PT(F11)-PT(F7)
-	ldf.fill f8=[r3],32
-	;;
-	srlz.d	// ensure that inter. collection is off (VHPT is don't care, since text is pinned)
-	mov ar.ccv=r15
-	;;
-	ldf.fill f11=[r2]
-	BSW_0(r2, r3, r15)	// switch back to bank 0 (no stop bit required beforehand...)
-	;;
-(pUStk)	mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency)
-	adds r16=PT(CR_IPSR)+16,r12
-	adds r17=PT(CR_IIP)+16,r12
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	.pred.rel.mutex pUStk,pKStk
-	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
-	MOV_FROM_ITC(pUStk, p9, r22, r29)	// M  fetch time at leave
-	nop.i 0
-	;;
-#else
-	MOV_FROM_PSR(pKStk, r22, r29)	// M2 read PSR now that interrupts are disabled
-	nop.i 0
-	nop.i 0
-	;;
-#endif
-	ld8 r29=[r16],16	// load cr.ipsr
-	ld8 r28=[r17],16	// load cr.iip
-	;;
-	ld8 r30=[r16],16	// load cr.ifs
-	ld8 r25=[r17],16	// load ar.unat
-	;;
-	ld8 r26=[r16],16	// load ar.pfs
-	ld8 r27=[r17],16	// load ar.rsc
-	cmp.eq p9,p0=r0,r0	// set p9 to indicate that we should restore cr.ifs
-	;;
-	ld8 r24=[r16],16	// load ar.rnat (may be garbage)
-	ld8 r23=[r17],16	// load ar.bspstore (may be garbage)
-	;;
-	ld8 r31=[r16],16	// load predicates
-	ld8 r21=[r17],16	// load b0
-	;;
-	ld8 r19=[r16],16	// load ar.rsc value for "loadrs"
-	ld8.fill r1=[r17],16	// load r1
-	;;
-	ld8.fill r12=[r16],16
-	ld8.fill r13=[r17],16
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk)	adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18
-#else
-(pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18
-#endif
-	;;
-	ld8 r20=[r16],16	// ar.fpsr
-	ld8.fill r15=[r17],16
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-(pUStk)	adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18	// deferred
-#endif
-	;;
-	ld8.fill r14=[r16],16
-	ld8.fill r2=[r17]
-(pUStk)	mov r17=1
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	//  mmi_ :  ld8 st1 shr;;         mmi_ : st8 st1 shr;;
-	//  mib  :  mov add br        ->  mib  : ld8 add br
-	//  bbb_ :  br  nop cover;;       mbb_ : mov br  cover;;
-	//
-	//  no one require bsp in r16 if (pKStk) branch is selected.
-(pUStk)	st8 [r3]=r22		// save time at leave
-(pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
-	shr.u r18=r19,16	// get byte size of existing "dirty" partition
-	;;
-	ld8.fill r3=[r16]	// deferred
-	LOAD_PHYS_STACK_REG_SIZE(r17)
-(pKStk)	br.cond.dpnt skip_rbs_switch
-	mov r16=ar.bsp		// get existing backing store pointer
-#else
-	ld8.fill r3=[r16]
-(pUStk)	st1 [r18]=r17		// restore current->thread.on_ustack
-	shr.u r18=r19,16	// get byte size of existing "dirty" partition
-	;;
-	mov r16=ar.bsp		// get existing backing store pointer
-	LOAD_PHYS_STACK_REG_SIZE(r17)
-(pKStk)	br.cond.dpnt skip_rbs_switch
-#endif
-
-	/*
-	 * Restore user backing store.
-	 *
-	 * NOTE: alloc, loadrs, and cover can't be predicated.
-	 */
-(pNonSys) br.cond.dpnt dont_preserve_current_frame
-	COVER				// add current frame into dirty partition and set cr.ifs
-	;;
-	mov r19=ar.bsp			// get new backing store pointer
-rbs_switch:
-	sub r16=r16,r18			// krbs = old bsp - size of dirty partition
-	cmp.ne p9,p0=r0,r0		// clear p9 to skip restore of cr.ifs
-	;;
-	sub r19=r19,r16			// calculate total byte size of dirty partition
-	add r18=64,r18			// don't force in0-in7 into memory...
-	;;
-	shl r19=r19,16			// shift size of dirty partition into loadrs position
-	;;
-dont_preserve_current_frame:
-	/*
-	 * To prevent leaking bits between the kernel and user-space,
-	 * we must clear the stacked registers in the "invalid" partition here.
-	 * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium,
-	 * 5 registers/cycle on McKinley).
-	 */
-#	define pRecurse	p6
-#	define pReturn	p7
-#ifdef CONFIG_ITANIUM
-#	define Nregs	10
-#else
-#	define Nregs	14
-#endif
-	alloc loc0=ar.pfs,2,Nregs-2,2,0
-	shr.u loc1=r18,9		// RNaTslots <= floor(dirtySize / (64*8))
-	sub r17=r17,r18			// r17 = (physStackedSize + 8) - dirtySize
-	;;
-	mov ar.rsc=r19			// load ar.rsc to be used for "loadrs"
-	shladd in0=loc1,3,r17
-	mov in1=0
-	;;
-	TEXT_ALIGN(32)
-rse_clear_invalid:
-#ifdef CONFIG_ITANIUM
-	// cycle 0
- { .mii
-	alloc loc0=ar.pfs,2,Nregs-2,2,0
-	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
-	add out0=-Nregs*8,in0
-}{ .mfb
-	add out1=1,in1			// increment recursion count
-	nop.f 0
-	nop.b 0				// can't do br.call here because of alloc (WAW on CFM)
-	;;
-}{ .mfi	// cycle 1
-	mov loc1=0
-	nop.f 0
-	mov loc2=0
-}{ .mib
-	mov loc3=0
-	mov loc4=0
-(pRecurse) br.call.sptk.many b0=rse_clear_invalid
-
-}{ .mfi	// cycle 2
-	mov loc5=0
-	nop.f 0
-	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
-}{ .mib
-	mov loc6=0
-	mov loc7=0
-(pReturn) br.ret.sptk.many b0
-}
-#else /* !CONFIG_ITANIUM */
-	alloc loc0=ar.pfs,2,Nregs-2,2,0
-	cmp.lt pRecurse,p0=Nregs*8,in0	// if more than Nregs regs left to clear, (re)curse
-	add out0=-Nregs*8,in0
-	add out1=1,in1			// increment recursion count
-	mov loc1=0
-	mov loc2=0
-	;;
-	mov loc3=0
-	mov loc4=0
-	mov loc5=0
-	mov loc6=0
-	mov loc7=0
-(pRecurse) br.call.dptk.few b0=rse_clear_invalid
-	;;
-	mov loc8=0
-	mov loc9=0
-	cmp.ne pReturn,p0=r0,in1	// if recursion count != 0, we need to do a br.ret
-	mov loc10=0
-	mov loc11=0
-(pReturn) br.ret.dptk.many b0
-#endif /* !CONFIG_ITANIUM */
-#	undef pRecurse
-#	undef pReturn
-	;;
-	alloc r17=ar.pfs,0,0,0,0	// drop current register frame
-	;;
-	loadrs
-	;;
-skip_rbs_switch:
-	mov ar.unat=r25		// M2
-(pKStk)	extr.u r22=r22,21,1	// I0 extract current value of psr.pp from r22
-(pLvSys)mov r19=r0		// A  clear r19 for leave_syscall, no-op otherwise
-	;;
-(pUStk)	mov ar.bspstore=r23	// M2
-(pKStk)	dep r29=r22,r29,21,1	// I0 update ipsr.pp with psr.pp
-(pLvSys)mov r16=r0		// A  clear r16 for leave_syscall, no-op otherwise
-	;;
-	MOV_TO_IPSR(p0, r29, r25)	// M2
-	mov ar.pfs=r26		// I0
-(pLvSys)mov r17=r0		// A  clear r17 for leave_syscall, no-op otherwise
-
-	MOV_TO_IFS(p9, r30, r25)// M2
-	mov b0=r21		// I0
-(pLvSys)mov r18=r0		// A  clear r18 for leave_syscall, no-op otherwise
-
-	mov ar.fpsr=r20		// M2
-	MOV_TO_IIP(r28, r25)	// M2
-	nop 0
-	;;
-(pUStk)	mov ar.rnat=r24		// M2 must happen with RSE in lazy mode
-	nop 0
-(pLvSys)mov r2=r0
-
-	mov ar.rsc=r27		// M2
-	mov pr=r31,-1		// I0
-	RFI			// B
-
-	/*
-	 * On entry:
-	 *	r20 = &current->thread_info->pre_count (if CONFIG_PREEMPTION)
-	 *	r31 = current->thread_info->flags
-	 * On exit:
-	 *	p6 = TRUE if work-pending-check needs to be redone
-	 *
-	 * Interrupts are disabled on entry, reenabled depend on work, and
-	 * disabled on exit.
-	 */
-.work_pending_syscall:
-	add r2=-8,r2
-	add r3=-8,r3
-	;;
-	st8 [r2]=r8
-	st8 [r3]=r10
-.work_pending:
-	tbit.z p6,p0=r31,TIF_NEED_RESCHED	// is resched not needed?
-(p6)	br.cond.sptk.few .notify
-	br.call.spnt.many rp=preempt_schedule_irq
-.ret9:	cmp.eq p6,p0=r0,r0	// p6 <- 1 (re-check)
-(pLvSys)br.cond.sptk.few  ia64_work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel
-
-.notify:
-(pUStk)	br.call.spnt.many rp=notify_resume_user
-.ret10:	cmp.ne p6,p0=r0,r0	// p6 <- 0 (don't re-check)
-(pLvSys)br.cond.sptk.few  ia64_work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel
-
-.global ia64_work_pending_syscall_end;
-ia64_work_pending_syscall_end:
-	adds r2=PT(R8)+16,r12
-	adds r3=PT(R10)+16,r12
-	;;
-	ld8 r8=[r2]
-	ld8 r10=[r3]
-	br.cond.sptk.many ia64_work_processed_syscall
-END(ia64_leave_kernel)
-
-ENTRY(handle_syscall_error)
-	/*
-	 * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
-	 * lead us to mistake a negative return value as a failed syscall.  Those syscall
-	 * must deposit a non-zero value in pt_regs.r8 to indicate an error.  If
-	 * pt_regs.r8 is zero, we assume that the call completed successfully.
-	 */
-	PT_REGS_UNWIND_INFO(0)
-	ld8 r3=[r2]		// load pt_regs.r8
-	;;
-	cmp.eq p6,p7=r3,r0	// is pt_regs.r8==0?
-	;;
-(p7)	mov r10=-1
-(p7)	sub r8=0,r8		// negate return value to get errno
-	br.cond.sptk ia64_leave_syscall
-END(handle_syscall_error)
-
-	/*
-	 * Invoke schedule_tail(task) while preserving in0-in7, which may be needed
-	 * in case a system call gets restarted.
-	 */
-GLOBAL_ENTRY(ia64_invoke_schedule_tail)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,1,0
-	mov loc0=rp
-	mov out0=r8				// Address of previous task
-	;;
-	br.call.sptk.many rp=schedule_tail
-.ret11:	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(ia64_invoke_schedule_tail)
-
-	/*
-	 * Setup stack and call do_notify_resume_user(), keeping interrupts
-	 * disabled.
-	 *
-	 * Note that pSys and pNonSys need to be set up by the caller.
-	 * We declare 8 input registers so the system call args get preserved,
-	 * in case we need to restart a system call.
-	 */
-GLOBAL_ENTRY(notify_resume_user)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
-	alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
-	mov r9=ar.unat
-	mov loc0=rp				// save return address
-	mov out0=0				// there is no "oldset"
-	adds out1=8,sp				// out1=&sigscratch->ar_pfs
-(pSys)	mov out2=1				// out2==1 => we're in a syscall
-	;;
-(pNonSys) mov out2=0				// out2==0 => not a syscall
-	.fframe 16
-	.spillsp ar.unat, 16
-	st8 [sp]=r9,-16				// allocate space for ar.unat and save it
-	st8 [out1]=loc1,-8			// save ar.pfs, out1=&sigscratch
-	.body
-	br.call.sptk.many rp=do_notify_resume_user
-.ret15:	.restore sp
-	adds sp=16,sp				// pop scratch stack space
-	;;
-	ld8 r9=[sp]				// load new unat from sigscratch->scratch_unat
-	mov rp=loc0
-	;;
-	mov ar.unat=r9
-	mov ar.pfs=loc1
-	br.ret.sptk.many rp
-END(notify_resume_user)
-
-ENTRY(sys_rt_sigreturn)
-	PT_REGS_UNWIND_INFO(0)
-	/*
-	 * Allocate 8 input registers since ptrace() may clobber them
-	 */
-	alloc r2=ar.pfs,8,0,1,0
-	.prologue
-	PT_REGS_SAVES(16)
-	adds sp=-16,sp
-	.body
-	cmp.eq pNonSys,pSys=r0,r0		// sigreturn isn't a normal syscall...
-	;;
-	/*
-	 * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined
-	 * syscall-entry path does not save them we save them here instead.  Note: we
-	 * don't need to save any other registers that are not saved by the stream-lined
-	 * syscall path, because restore_sigcontext() restores them.
-	 */
-	adds r16=PT(F6)+32,sp
-	adds r17=PT(F7)+32,sp
-	;;
- 	stf.spill [r16]=f6,32
- 	stf.spill [r17]=f7,32
-	;;
- 	stf.spill [r16]=f8,32
- 	stf.spill [r17]=f9,32
-	;;
- 	stf.spill [r16]=f10
- 	stf.spill [r17]=f11
-	adds out0=16,sp				// out0 = &sigscratch
-	br.call.sptk.many rp=ia64_rt_sigreturn
-.ret19:	.restore sp,0
-	adds sp=16,sp
-	;;
-	ld8 r9=[sp]				// load new ar.unat
-	mov.sptk b7=r8,ia64_leave_kernel
-	;;
-	mov ar.unat=r9
-	br.many b7
-END(sys_rt_sigreturn)
-
-GLOBAL_ENTRY(ia64_prepare_handle_unaligned)
-	.prologue
-	/*
-	 * r16 = fake ar.pfs, we simply need to make sure privilege is still 0
-	 */
-	mov r16=r0
-	DO_SAVE_SWITCH_STACK
-	br.call.sptk.many rp=ia64_handle_unaligned	// stack frame setup in ivt
-.ret21:	.body
-	DO_LOAD_SWITCH_STACK
-	br.cond.sptk.many rp				// goes to ia64_leave_kernel
-END(ia64_prepare_handle_unaligned)
-
-	//
-	// unw_init_running(void (*callback)(info, arg), void *arg)
-	//
-#	define EXTRA_FRAME_SIZE	((UNW_FRAME_INFO_SIZE+15)&~15)
-
-GLOBAL_ENTRY(unw_init_running)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
-	alloc loc1=ar.pfs,2,3,3,0
-	;;
-	ld8 loc2=[in0],8
-	mov loc0=rp
-	mov r16=loc1
-	DO_SAVE_SWITCH_STACK
-	.body
-
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
-	.fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE
-	SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE)
-	adds sp=-EXTRA_FRAME_SIZE,sp
-	.body
-	;;
-	adds out0=16,sp				// &info
-	mov out1=r13				// current
-	adds out2=16+EXTRA_FRAME_SIZE,sp	// &switch_stack
-	br.call.sptk.many rp=unw_init_frame_info
-1:	adds out0=16,sp				// &info
-	mov b6=loc2
-	mov loc2=gp				// save gp across indirect function call
-	;;
-	ld8 gp=[in0]
-	mov out1=in1				// arg
-	br.call.sptk.many rp=b6			// invoke the callback function
-1:	mov gp=loc2				// restore gp
-
-	// For now, we don't allow changing registers from within
-	// unw_init_running; if we ever want to allow that, we'd
-	// have to do a load_switch_stack here:
-	.restore sp
-	adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp
-
-	mov ar.pfs=loc1
-	mov rp=loc0
-	br.ret.sptk.many rp
-END(unw_init_running)
-EXPORT_SYMBOL(unw_init_running)
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-GLOBAL_ENTRY(_mcount)
-	br ftrace_stub
-END(_mcount)
-EXPORT_SYMBOL(_mcount)
-
-.here:
-	br.ret.sptk.many b0
-
-GLOBAL_ENTRY(ftrace_caller)
-	alloc out0 = ar.pfs, 8, 0, 4, 0
-	mov out3 = r0
-	;;
-	mov out2 = b0
-	add r3 = 0x20, r3
-	mov out1 = r1;
-	br.call.sptk.many b0 = ftrace_patch_gp
-	//this might be called from module, so we must patch gp
-ftrace_patch_gp:
-	movl gp=__gp
-	mov b0 = r3
-	;;
-.global ftrace_call;
-ftrace_call:
-{
-	.mlx
-	nop.m 0x0
-	movl r3 = .here;;
-}
-	alloc loc0 = ar.pfs, 4, 4, 2, 0
-	;;
-	mov loc1 = b0
-	mov out0 = b0
-	mov loc2 = r8
-	mov loc3 = r15
-	;;
-	adds out0 = -MCOUNT_INSN_SIZE, out0
-	mov out1 = in2
-	mov b6 = r3
-
-	br.call.sptk.many b0 = b6
-	;;
-	mov ar.pfs = loc0
-	mov b0 = loc1
-	mov r8 = loc2
-	mov r15 = loc3
-	br ftrace_stub
-	;;
-END(ftrace_caller)
-
-#else
-GLOBAL_ENTRY(_mcount)
-	movl r2 = ftrace_stub
-	movl r3 = ftrace_trace_function;;
-	ld8 r3 = [r3];;
-	ld8 r3 = [r3];;
-	cmp.eq p7,p0 = r2, r3
-(p7)	br.sptk.many ftrace_stub
-	;;
-
-	alloc loc0 = ar.pfs, 4, 4, 2, 0
-	;;
-	mov loc1 = b0
-	mov out0 = b0
-	mov loc2 = r8
-	mov loc3 = r15
-	;;
-	adds out0 = -MCOUNT_INSN_SIZE, out0
-	mov out1 = in2
-	mov b6 = r3
-
-	br.call.sptk.many b0 = b6
-	;;
-	mov ar.pfs = loc0
-	mov b0 = loc1
-	mov r8 = loc2
-	mov r15 = loc3
-	br ftrace_stub
-	;;
-END(_mcount)
-#endif
-
-GLOBAL_ENTRY(ftrace_stub)
-	mov r3 = b0
-	movl r2 = _mcount_ret_helper
-	;;
-	mov b6 = r2
-	mov b7 = r3
-	br.ret.sptk.many b6
-
-_mcount_ret_helper:
-	mov b0 = r42
-	mov r1 = r41
-	mov ar.pfs = r40
-	br b7
-END(ftrace_stub)
-
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#define __SYSCALL(nr, entry) data8 entry
-	.rodata
-	.align 8
-	.globl sys_call_table
-sys_call_table:
-#include <asm/syscall_table.h>
diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h
deleted file mode 100644
index 6463dc316263..000000000000
--- a/arch/ia64/kernel/entry.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/*
- * Preserved registers that are shared between code in ivt.S and
- * entry.S.  Be careful not to step on these!
- */
-#define PRED_LEAVE_SYSCALL	1 /* TRUE iff leave from syscall */
-#define PRED_KERNEL_STACK	2 /* returning to kernel-stacks? */
-#define PRED_USER_STACK		3 /* returning to user-stacks? */
-#define PRED_SYSCALL		4 /* inside a system call? */
-#define PRED_NON_SYSCALL	5 /* complement of PRED_SYSCALL */
-
-#ifdef __ASSEMBLY__
-# define PASTE2(x,y)	x##y
-# define PASTE(x,y)	PASTE2(x,y)
-
-# define pLvSys		PASTE(p,PRED_LEAVE_SYSCALL)
-# define pKStk		PASTE(p,PRED_KERNEL_STACK)
-# define pUStk		PASTE(p,PRED_USER_STACK)
-# define pSys		PASTE(p,PRED_SYSCALL)
-# define pNonSys	PASTE(p,PRED_NON_SYSCALL)
-#endif
-
-#define PT(f)		(IA64_PT_REGS_##f##_OFFSET)
-#define SW(f)		(IA64_SWITCH_STACK_##f##_OFFSET)
-#define SOS(f)		(IA64_SAL_OS_STATE_##f##_OFFSET)
-
-#define PT_REGS_SAVES(off)			\
-	.unwabi 3, 'i';				\
-	.fframe IA64_PT_REGS_SIZE+16+(off);	\
-	.spillsp rp, PT(CR_IIP)+16+(off);	\
-	.spillsp ar.pfs, PT(CR_IFS)+16+(off);	\
-	.spillsp ar.unat, PT(AR_UNAT)+16+(off);	\
-	.spillsp ar.fpsr, PT(AR_FPSR)+16+(off);	\
-	.spillsp pr, PT(PR)+16+(off);
-
-#define PT_REGS_UNWIND_INFO(off)		\
-	.prologue;				\
-	PT_REGS_SAVES(off);			\
-	.body
-
-#define SWITCH_STACK_SAVES(off)							\
-	.savesp ar.unat,SW(CALLER_UNAT)+16+(off);				\
-	.savesp ar.fpsr,SW(AR_FPSR)+16+(off);					\
-	.spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off);		\
-	.spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off);		\
-	.spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off);		\
-	.spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off);		\
-	.spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off);		\
-	.spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off);		\
-	.spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off);		\
-	.spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off);		\
-	.spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off);		\
-	.spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off);		\
-	.spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off);		\
-	.spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off);		\
-	.spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off);		\
-	.spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off);		\
-	.spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off);		\
-	.spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off);	\
-	.spillsp @priunat,SW(AR_UNAT)+16+(off);					\
-	.spillsp ar.rnat,SW(AR_RNAT)+16+(off);					\
-	.spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off);				\
-	.spillsp pr,SW(PR)+16+(off)
-
-#define DO_SAVE_SWITCH_STACK			\
-	movl r28=1f;				\
-	;;					\
-	.fframe IA64_SWITCH_STACK_SIZE;		\
-	adds sp=-IA64_SWITCH_STACK_SIZE,sp;	\
-	mov.ret.sptk b7=r28,1f;			\
-	SWITCH_STACK_SAVES(0);			\
-	br.cond.sptk.many save_switch_stack;	\
-1:
-
-#define DO_LOAD_SWITCH_STACK			\
-	movl r28=1f;				\
-	;;					\
-	invala;					\
-	mov.ret.sptk b7=r28,1f;			\
-	br.cond.sptk.many load_switch_stack;	\
-1:	.restore sp;				\
-	adds sp=IA64_SWITCH_STACK_SIZE,sp
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
deleted file mode 100644
index dd5bfed52031..000000000000
--- a/arch/ia64/kernel/err_inject.c
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * err_inject.c -
- *	1.) Inject errors to a processor.
- *	2.) Query error injection capabilities.
- * This driver along with user space code can be acting as an error
- * injection tool.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Written by: Fenghua Yu <fenghua.yu@intel.com>, Intel Corporation
- * Copyright (C) 2006, Intel Corp.  All rights reserved.
- *
- */
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#define ERR_INJ_DEBUG
-
-#define ERR_DATA_BUFFER_SIZE 3 		// Three 8-byte;
-
-#define define_one_ro(name) 						\
-static DEVICE_ATTR(name, 0444, show_##name, NULL)
-
-#define define_one_rw(name) 						\
-static DEVICE_ATTR(name, 0644, show_##name, store_##name)
-
-static u64 call_start[NR_CPUS];
-static u64 phys_addr[NR_CPUS];
-static u64 err_type_info[NR_CPUS];
-static u64 err_struct_info[NR_CPUS];
-static struct {
-	u64 data1;
-	u64 data2;
-	u64 data3;
-} __attribute__((__aligned__(16))) err_data_buffer[NR_CPUS];
-static s64 status[NR_CPUS];
-static u64 capabilities[NR_CPUS];
-static u64 resources[NR_CPUS];
-
-#define show(name) 							\
-static ssize_t 								\
-show_##name(struct device *dev, struct device_attribute *attr,	\
-		char *buf)						\
-{									\
-	u32 cpu=dev->id;						\
-	return sprintf(buf, "%llx\n", name[cpu]);			\
-}
-
-#define store(name)							\
-static ssize_t 								\
-store_##name(struct device *dev, struct device_attribute *attr,	\
-					const char *buf, size_t size)	\
-{									\
-	unsigned int cpu=dev->id;					\
-	name[cpu] = simple_strtoull(buf, NULL, 16);			\
-	return size;							\
-}
-
-show(call_start)
-
-/* It's user's responsibility to call the PAL procedure on a specific
- * processor. The cpu number in driver is only used for storing data.
- */
-static ssize_t
-store_call_start(struct device *dev, struct device_attribute *attr,
-		const char *buf, size_t size)
-{
-	unsigned int cpu=dev->id;
-	unsigned long call_start = simple_strtoull(buf, NULL, 16);
-
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu);
-	printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]);
-	printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]);
-	printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n",
-			  err_data_buffer[cpu].data1,
-			  err_data_buffer[cpu].data2,
-			  err_data_buffer[cpu].data3);
-#endif
-	switch (call_start) {
-	    case 0: /* Do nothing. */
-		break;
-	    case 1: /* Call pal_mc_error_inject in physical mode. */
-		status[cpu]=ia64_pal_mc_error_inject_phys(err_type_info[cpu],
-					err_struct_info[cpu],
-					ia64_tpa(&err_data_buffer[cpu]),
-					&capabilities[cpu],
-			 		&resources[cpu]);
-		break;
-	    case 2: /* Call pal_mc_error_inject in virtual mode. */
-		status[cpu]=ia64_pal_mc_error_inject_virt(err_type_info[cpu],
-					err_struct_info[cpu],
-					ia64_tpa(&err_data_buffer[cpu]),
-					&capabilities[cpu],
-			 		&resources[cpu]);
-		break;
-	    default:
-		status[cpu] = -EINVAL;
-		break;
-	}
-
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]);
-	printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]);
-	printk(KERN_DEBUG "resources=%llx\n", resources[cpu]);
-#endif
-	return size;
-}
-
-show(err_type_info)
-store(err_type_info)
-
-static ssize_t
-show_virtual_to_phys(struct device *dev, struct device_attribute *attr,
-			char *buf)
-{
-	unsigned int cpu=dev->id;
-	return sprintf(buf, "%llx\n", phys_addr[cpu]);
-}
-
-static ssize_t
-store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
-			const char *buf, size_t size)
-{
-	unsigned int cpu=dev->id;
-	u64 virt_addr=simple_strtoull(buf, NULL, 16);
-	int ret;
-
-	ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL);
-	if (ret<=0) {
-#ifdef ERR_INJ_DEBUG
-		printk("Virtual address %llx is not existing.\n", virt_addr);
-#endif
-		return -EINVAL;
-	}
-
-	phys_addr[cpu] = ia64_tpa(virt_addr);
-	return size;
-}
-
-show(err_struct_info)
-store(err_struct_info)
-
-static ssize_t
-show_err_data_buffer(struct device *dev,
-			struct device_attribute *attr, char *buf)
-{
-	unsigned int cpu=dev->id;
-
-	return sprintf(buf, "%llx, %llx, %llx\n",
-			err_data_buffer[cpu].data1,
-			err_data_buffer[cpu].data2,
-			err_data_buffer[cpu].data3);
-}
-
-static ssize_t
-store_err_data_buffer(struct device *dev,
-			struct device_attribute *attr,
-			const char *buf, size_t size)
-{
-	unsigned int cpu=dev->id;
-	int ret;
-
-#ifdef ERR_INJ_DEBUG
-	printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n",
-		 err_data_buffer[cpu].data1,
-		 err_data_buffer[cpu].data2,
-		 err_data_buffer[cpu].data3,
-		 cpu);
-#endif
-	ret = sscanf(buf, "%llx, %llx, %llx",
-			&err_data_buffer[cpu].data1,
-			&err_data_buffer[cpu].data2,
-			&err_data_buffer[cpu].data3);
-	if (ret!=ERR_DATA_BUFFER_SIZE)
-		return -EINVAL;
-
-	return size;
-}
-
-show(status)
-show(capabilities)
-show(resources)
-
-define_one_rw(call_start);
-define_one_rw(err_type_info);
-define_one_rw(err_struct_info);
-define_one_rw(err_data_buffer);
-define_one_rw(virtual_to_phys);
-define_one_ro(status);
-define_one_ro(capabilities);
-define_one_ro(resources);
-
-static struct attribute *default_attrs[] = {
-	&dev_attr_call_start.attr,
-	&dev_attr_virtual_to_phys.attr,
-	&dev_attr_err_type_info.attr,
-	&dev_attr_err_struct_info.attr,
-	&dev_attr_err_data_buffer.attr,
-	&dev_attr_status.attr,
-	&dev_attr_capabilities.attr,
-	&dev_attr_resources.attr,
-	NULL
-};
-
-static struct attribute_group err_inject_attr_group = {
-	.attrs = default_attrs,
-	.name = "err_inject"
-};
-/* Add/Remove err_inject interface for CPU device */
-static int err_inject_add_dev(unsigned int cpu)
-{
-	struct device *sys_dev = get_cpu_device(cpu);
-
-	return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group);
-}
-
-static int err_inject_remove_dev(unsigned int cpu)
-{
-	struct device *sys_dev = get_cpu_device(cpu);
-
-	sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group);
-	return 0;
-}
-
-static enum cpuhp_state hp_online;
-
-static int __init err_inject_init(void)
-{
-	int ret;
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_INFO "Enter error injection driver.\n");
-#endif
-
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/err_inj:online",
-				err_inject_add_dev, err_inject_remove_dev);
-	if (ret >= 0) {
-		hp_online = ret;
-		ret = 0;
-	}
-	return ret;
-}
-
-static void __exit err_inject_exit(void)
-{
-#ifdef ERR_INJ_DEBUG
-	printk(KERN_INFO "Exit error injection driver.\n");
-#endif
-	cpuhp_remove_state(hp_online);
-}
-
-module_init(err_inject_init);
-module_exit(err_inject_exit);
-
-MODULE_AUTHOR("Fenghua Yu <fenghua.yu@intel.com>");
-MODULE_DESCRIPTION("MC error injection kernel sysfs interface");
-MODULE_LICENSE("GPL");
diff --git a/arch/ia64/kernel/esi.c b/arch/ia64/kernel/esi.c
deleted file mode 100644
index 4df57c93e0a8..000000000000
--- a/arch/ia64/kernel/esi.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Extensible SAL Interface (ESI) support routines.
- *
- * Copyright (C) 2006 Hewlett-Packard Co
- * 	Alex Williamson <alex.williamson@hp.com>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/esi.h>
-#include <asm/sal.h>
-
-MODULE_AUTHOR("Alex Williamson <alex.williamson@hp.com>");
-MODULE_DESCRIPTION("Extensible SAL Interface (ESI) support");
-MODULE_LICENSE("GPL");
-
-#define MODULE_NAME	"esi"
-
-enum esi_systab_entry_type {
-	ESI_DESC_ENTRY_POINT = 0
-};
-
-/*
- * Entry type:	Size:
- *	0	48
- */
-#define ESI_DESC_SIZE(type)	"\060"[(unsigned) (type)]
-
-typedef struct ia64_esi_desc_entry_point {
-	u8 type;
-	u8 reserved1[15];
-	u64 esi_proc;
-	u64 gp;
-	efi_guid_t guid;
-} ia64_esi_desc_entry_point_t;
-
-struct pdesc {
-	void *addr;
-	void *gp;
-};
-
-static struct ia64_sal_systab *esi_systab;
-
-extern unsigned long esi_phys;
-
-static int __init esi_init (void)
-{
-	struct ia64_sal_systab *systab;
-	char *p;
-	int i;
-
-	if (esi_phys == EFI_INVALID_TABLE_ADDR)
-		return -ENODEV;
-
-	systab = __va(esi_phys);
-
-	if (strncmp(systab->signature, "ESIT", 4) != 0) {
-		printk(KERN_ERR "bad signature in ESI system table!");
-		return -ENODEV;
-	}
-
-	p = (char *) (systab + 1);
-	for (i = 0; i < systab->entry_count; i++) {
-		/*
-		 * The first byte of each entry type contains the type
-		 * descriptor.
-		 */
-		switch (*p) {
-		      case ESI_DESC_ENTRY_POINT:
-			break;
-		      default:
-			printk(KERN_WARNING "Unknown table type %d found in "
-			       "ESI table, ignoring rest of table\n", *p);
-			return -ENODEV;
-		}
-
-		p += ESI_DESC_SIZE(*p);
-	}
-
-	esi_systab = systab;
-	return 0;
-}
-
-
-int ia64_esi_call (efi_guid_t guid, struct ia64_sal_retval *isrvp,
-		   enum esi_proc_type proc_type, u64 func,
-		   u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6,
-		   u64 arg7)
-{
-	struct ia64_fpreg fr[6];
-	unsigned long flags = 0;
-	int i;
-	char *p;
-
-	if (!esi_systab)
-		return -1;
-
-	p = (char *) (esi_systab + 1);
-	for (i = 0; i < esi_systab->entry_count; i++) {
-		if (*p == ESI_DESC_ENTRY_POINT) {
-			ia64_esi_desc_entry_point_t *esi = (void *)p;
-			if (!efi_guidcmp(guid, esi->guid)) {
-				ia64_sal_handler esi_proc;
-				struct pdesc pdesc;
-
-				pdesc.addr = __va(esi->esi_proc);
-				pdesc.gp = __va(esi->gp);
-
-				esi_proc = (ia64_sal_handler) &pdesc;
-
-				ia64_save_scratch_fpregs(fr);
-				if (proc_type == ESI_PROC_SERIALIZED)
-					spin_lock_irqsave(&sal_lock, flags);
-				else if (proc_type == ESI_PROC_MP_SAFE)
-					local_irq_save(flags);
-				else
-					preempt_disable();
-				*isrvp = (*esi_proc)(func, arg1, arg2, arg3,
-						     arg4, arg5, arg6, arg7);
-				if (proc_type == ESI_PROC_SERIALIZED)
-					spin_unlock_irqrestore(&sal_lock,
-							       flags);
-				else if (proc_type == ESI_PROC_MP_SAFE)
-					local_irq_restore(flags);
-				else
-					preempt_enable();
-				ia64_load_scratch_fpregs(fr);
-				return 0;
-			}
-		}
-		p += ESI_DESC_SIZE(*p);
-	}
-	return -1;
-}
-EXPORT_SYMBOL_GPL(ia64_esi_call);
-
-int ia64_esi_call_phys (efi_guid_t guid, struct ia64_sal_retval *isrvp,
-			u64 func, u64 arg1, u64 arg2, u64 arg3, u64 arg4,
-			u64 arg5, u64 arg6, u64 arg7)
-{
-	struct ia64_fpreg fr[6];
-	unsigned long flags;
-	u64 esi_params[8];
-	char *p;
-	int i;
-
-	if (!esi_systab)
-		return -1;
-
-	p = (char *) (esi_systab + 1);
-	for (i = 0; i < esi_systab->entry_count; i++) {
-		if (*p == ESI_DESC_ENTRY_POINT) {
-			ia64_esi_desc_entry_point_t *esi = (void *)p;
-			if (!efi_guidcmp(guid, esi->guid)) {
-				ia64_sal_handler esi_proc;
-				struct pdesc pdesc;
-
-				pdesc.addr = (void *)esi->esi_proc;
-				pdesc.gp = (void *)esi->gp;
-
-				esi_proc = (ia64_sal_handler) &pdesc;
-
-				esi_params[0] = func;
-				esi_params[1] = arg1;
-				esi_params[2] = arg2;
-				esi_params[3] = arg3;
-				esi_params[4] = arg4;
-				esi_params[5] = arg5;
-				esi_params[6] = arg6;
-				esi_params[7] = arg7;
-				ia64_save_scratch_fpregs(fr);
-				spin_lock_irqsave(&sal_lock, flags);
-				*isrvp = esi_call_phys(esi_proc, esi_params);
-				spin_unlock_irqrestore(&sal_lock, flags);
-				ia64_load_scratch_fpregs(fr);
-				return 0;
-			}
-		}
-		p += ESI_DESC_SIZE(*p);
-	}
-	return -1;
-}
-EXPORT_SYMBOL_GPL(ia64_esi_call_phys);
-
-static void __exit esi_exit (void)
-{
-}
-
-module_init(esi_init);
-module_exit(esi_exit);	/* makes module removable... */
diff --git a/arch/ia64/kernel/esi_stub.S b/arch/ia64/kernel/esi_stub.S
deleted file mode 100644
index 9928c5b2957c..000000000000
--- a/arch/ia64/kernel/esi_stub.S
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * ESI call stub.
- *
- * Copyright (C) 2005 Hewlett-Packard Co
- *	Alex Williamson <alex.williamson@hp.com>
- *
- * Based on EFI call stub by David Mosberger.  The stub is virtually
- * identical to the one for EFI phys-mode calls, except that ESI
- * calls may have up to 8 arguments, so they get passed to this routine
- * through memory.
- *
- * This stub allows us to make ESI calls in physical mode with interrupts
- * turned off.  ESI calls may not support calling from virtual mode.
- *
- * Google for "Extensible SAL specification" for a document describing the
- * ESI standard.
- */
-
-/*
- * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
- * Abstraction Layer Specification", revision 2.6e).  Note that
- * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
- * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
- * (the br.ia instruction fails unless psr.dfl and psr.dfh are
- * cleared).  Fortunately, SAL promises not to touch the floating
- * point regs, so at least we don't have to save f2-f127.
- */
-#define PSR_BITS_TO_CLEAR						\
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT |		\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	\
-	 IA64_PSR_DFL | IA64_PSR_DFH)
-
-#define PSR_BITS_TO_SET							\
-	(IA64_PSR_BN)
-
-#include <linux/export.h>
-#include <asm/processor.h>
-#include <asm/asmmacro.h>
-
-/*
- * Inputs:
- *	in0 = address of function descriptor of ESI routine to call
- *	in1 = address of array of ESI parameters
- *
- * Outputs:
- *	r8 = result returned by called function
- */
-GLOBAL_ENTRY(esi_call_phys)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
-	alloc loc1=ar.pfs,2,7,8,0
-	ld8 r2=[in0],8			// load ESI function's entry point
-	mov loc0=rp
-	.body
-	;;
-	ld8 out0=[in1],8		// ESI params loaded from array
-	;;				// passing all as inputs doesn't work
-	ld8 out1=[in1],8
-	;;
-	ld8 out2=[in1],8
-	;;
-	ld8 out3=[in1],8
-	;;
-	ld8 out4=[in1],8
-	;;
-	ld8 out5=[in1],8
-	;;
-	ld8 out6=[in1],8
-	;;
-	ld8 out7=[in1]
-	mov loc2=gp			// save global pointer
-	mov loc4=ar.rsc			// save RSE configuration
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	;;
-	ld8 gp=[in0]			// load ESI function's global pointer
-	movl r16=PSR_BITS_TO_CLEAR
-	mov loc3=psr			// save processor status word
-	movl r17=PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17
-	mov b6=r2
-	;;
-	andcm r16=loc3,r16	// get psr with IT, DT, and RT bits cleared
-	br.call.sptk.many rp=ia64_switch_mode_phys
-.ret0:	mov loc5=r19			// old ar.bsp
-	mov loc6=r20			// old sp
-	br.call.sptk.many rp=b6		// call the ESI function
-.ret1:	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3			// save virtual mode psr
-	mov r19=loc5			// save virtual mode bspstore
-	mov r20=loc6			// save virtual mode sp
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-.ret2:	mov ar.rsc=loc4			// restore RSE configuration
-	mov ar.pfs=loc1
-	mov rp=loc0
-	mov gp=loc2
-	br.ret.sptk.many rp
-END(esi_call_phys)
-EXPORT_SYMBOL_GPL(esi_call_phys)
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
deleted file mode 100644
index cc4733e9990a..000000000000
--- a/arch/ia64/kernel/fsys.S
+++ /dev/null
@@ -1,837 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file contains the light-weight system call handlers (fsyscall-handlers).
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- * 	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 25-Sep-03 davidm	Implement fsys_rt_sigprocmask().
- * 18-Feb-03 louisk	Implement fsys_gettimeofday().
- * 28-Feb-03 davidm	Fixed several bugs in fsys_gettimeofday().  Tuned it some more,
- *			probably broke it along the way... ;-)
- * 13-Jul-04 clameter   Implement fsys_clock_gettime and revise fsys_gettimeofday to make
- *                      it capable of using memory based clocks without falling back to C code.
- * 08-Feb-07 Fenghua Yu Implement fsys_getcpu.
- *
- */
-
-#include <asm/asmmacro.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/percpu.h>
-#include <asm/thread_info.h>
-#include <asm/sal.h>
-#include <asm/signal.h>
-#include <asm/unistd.h>
-
-#include "entry.h"
-#include <asm/native/inst.h>
-
-/*
- * See Documentation/arch/ia64/fsys.rst for details on fsyscalls.
- *
- * On entry to an fsyscall handler:
- *   r10	= 0 (i.e., defaults to "successful syscall return")
- *   r11	= saved ar.pfs (a user-level value)
- *   r15	= system call number
- *   r16	= "current" task pointer (in normal kernel-mode, this is in r13)
- *   r32-r39	= system call arguments
- *   b6		= return address (a user-level value)
- *   ar.pfs	= previous frame-state (a user-level value)
- *   PSR.be	= cleared to zero (i.e., little-endian byte order is in effect)
- *   all other registers may contain values passed in from user-mode
- *
- * On return from an fsyscall handler:
- *   r11	= saved ar.pfs (as passed into the fsyscall handler)
- *   r15	= system call number (as passed into the fsyscall handler)
- *   r32-r39	= system call arguments (as passed into the fsyscall handler)
- *   b6		= return address (as passed into the fsyscall handler)
- *   ar.pfs	= previous frame-state (as passed into the fsyscall handler)
- */
-
-ENTRY(fsys_ni_syscall)
-	.prologue
-	.altrp b6
-	.body
-	mov r8=ENOSYS
-	mov r10=-1
-	FSYS_RETURN
-END(fsys_ni_syscall)
-
-ENTRY(fsys_getpid)
-	.prologue
-	.altrp b6
-	.body
-	add r17=IA64_TASK_SIGNAL_OFFSET,r16
-	;;
-	ld8 r17=[r17]				// r17 = current->signal
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	;;
-	ld4 r9=[r9]
-	add r17=IA64_SIGNAL_PIDS_TGID_OFFSET,r17
-	;;
-	and r9=TIF_ALLWORK_MASK,r9
-	ld8 r17=[r17]				// r17 = current->signal->pids[PIDTYPE_TGID]
-	;;
-	add r8=IA64_PID_LEVEL_OFFSET,r17
-	;;
-	ld4 r8=[r8]				// r8 = pid->level
-	add r17=IA64_PID_UPID_OFFSET,r17	// r17 = &pid->numbers[0]
-	;;
-	shl r8=r8,IA64_UPID_SHIFT
-	;;
-	add r17=r17,r8				// r17 = &pid->numbers[pid->level]
-	;;
-	ld4 r8=[r17]				// r8 = pid->numbers[pid->level].nr
-	;;
-	mov r17=0
-	;;
-	cmp.ne p8,p0=0,r9
-(p8)	br.spnt.many fsys_fallback_syscall
-	FSYS_RETURN
-END(fsys_getpid)
-
-ENTRY(fsys_set_tid_address)
-	.prologue
-	.altrp b6
-	.body
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16
-	add r17=IA64_TASK_THREAD_PID_OFFSET,r16
-	;;
-	ld4 r9=[r9]
-	tnat.z p6,p7=r32		// check argument register for being NaT
-	ld8 r17=[r17]				// r17 = current->thread_pid
-	;;
-	and r9=TIF_ALLWORK_MASK,r9
-	add r8=IA64_PID_LEVEL_OFFSET,r17
-	add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16
-	;;
-	ld4 r8=[r8]				// r8 = pid->level
-	add r17=IA64_PID_UPID_OFFSET,r17	// r17 = &pid->numbers[0]
-	;;
-	shl r8=r8,IA64_UPID_SHIFT
-	;;
-	add r17=r17,r8				// r17 = &pid->numbers[pid->level]
-	;;
-	ld4 r8=[r17]				// r8 = pid->numbers[pid->level].nr
-	;;
-	cmp.ne p8,p0=0,r9
-	mov r17=-1
-	;;
-(p6)	st8 [r18]=r32
-(p7)	st8 [r18]=r17
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-	mov r17=0			// i must not leak kernel bits...
-	mov r18=0			// i must not leak kernel bits...
-	FSYS_RETURN
-END(fsys_set_tid_address)
-
-#if IA64_GTOD_SEQ_OFFSET !=0
-#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t
-#endif
-#if IA64_ITC_JITTER_OFFSET !=0
-#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t
-#endif
-#define CLOCK_REALTIME 0
-#define CLOCK_MONOTONIC 1
-#define CLOCK_DIVIDE_BY_1000 0x4000
-#define CLOCK_ADD_MONOTONIC 0x8000
-
-ENTRY(fsys_gettimeofday)
-	.prologue
-	.altrp b6
-	.body
-	mov r31 = r32
-	tnat.nz p6,p0 = r33		// guard against NaT argument
-(p6)    br.cond.spnt.few .fail_einval
-	mov r30 = CLOCK_DIVIDE_BY_1000
-	;;
-.gettime:
-	// Register map
-	// Incoming r31 = pointer to address where to place result
-	//          r30 = flags determining how time is processed
-	// r2,r3 = temp r4-r7 preserved
-	// r8 = result nanoseconds
-	// r9 = result seconds
-	// r10 = temporary storage for clock difference
-	// r11 = preserved: saved ar.pfs
-	// r12 = preserved: memory stack
-	// r13 = preserved: thread pointer
-	// r14 = address of mask / mask value
-	// r15 = preserved: system call number
-	// r16 = preserved: current task pointer
-	// r17 = (not used)
-	// r18 = (not used)
-	// r19 = address of itc_lastcycle
-	// r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence)
-	// r21 = address of mmio_ptr
-	// r22 = address of wall_time or monotonic_time
-	// r23 = address of shift / value
-	// r24 = address mult factor / cycle_last value
-	// r25 = itc_lastcycle value
-	// r26 = address clocksource cycle_last
-	// r27 = (not used)
-	// r28 = sequence number at the beginning of critical section
-	// r29 = address of itc_jitter
-	// r30 = time processing flags / memory address
-	// r31 = pointer to result
-	// Predicates
-	// p6,p7 short term use
-	// p8 = timesource ar.itc
-	// p9 = timesource mmio64
-	// p10 = timesource mmio32 - not used
-	// p11 = timesource not to be handled by asm code
-	// p12 = memory time source ( = p9 | p10) - not used
-	// p13 = do cmpxchg with itc_lastcycle
-	// p14 = Divide by 1000
-	// p15 = Add monotonic
-	//
-	// Note that instructions are optimized for McKinley. McKinley can
-	// process two bundles simultaneously and therefore we continuously
-	// try to feed the CPU two bundles and then a stop.
-
-	add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
-	tnat.nz p6,p0 = r31		// guard against Nat argument
-(p6)	br.cond.spnt.few .fail_einval
-	movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address
-	;;
-	ld4 r2 = [r2]			// process work pending flags
-	movl r29 = itc_jitter_data	// itc_jitter
-	add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20	// wall_time
-	add r21 = IA64_CLKSRC_MMIO_OFFSET,r20
-	mov pr = r30,0xc000	// Set predicates according to function
-	;;
-	and r2 = TIF_ALLWORK_MASK,r2
-	add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29
-(p15)	add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20	// monotonic_time
-	;;
-	add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20	// clksrc_cycle_last
-	cmp.ne p6, p0 = 0, r2	// Fallback if work is scheduled
-(p6)	br.cond.spnt.many fsys_fallback_syscall
-	;;
-	// Begin critical section
-.time_redo:
-	ld4.acq r28 = [r20]	// gtod_lock.sequence, Must take first
-	;;
-	and r28 = ~1,r28	// And make sequence even to force retry if odd
-	;;
-	ld8 r30 = [r21]		// clocksource->mmio_ptr
-	add r24 = IA64_CLKSRC_MULT_OFFSET,r20
-	ld4 r2 = [r29]		// itc_jitter value
-	add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20
-	add r14 = IA64_CLKSRC_MASK_OFFSET,r20
-	;;
-	ld4 r3 = [r24]		// clocksource mult value
-	ld8 r14 = [r14]         // clocksource mask value
-	cmp.eq p8,p9 = 0,r30	// use cpu timer if no mmio_ptr
-	;;
-	setf.sig f7 = r3	// Setup for mult scaling of counter
-(p8)	cmp.ne p13,p0 = r2,r0	// need itc_jitter compensation, set p13
-	ld4 r23 = [r23]		// clocksource shift value
-	ld8 r24 = [r26]		// get clksrc_cycle_last value
-(p9)	cmp.eq p13,p0 = 0,r30	// if mmio_ptr, clear p13 jitter control
-	;;
-	.pred.rel.mutex p8,p9
-	MOV_FROM_ITC(p8, p6, r2, r10)	// CPU_TIMER. 36 clocks latency!!!
-(p9)	ld8 r2 = [r30]		// MMIO_TIMER. Could also have latency issues..
-(p13)	ld8 r25 = [r19]		// get itc_lastcycle value
-	ld8 r9 = [r22],IA64_TIME_SN_SPEC_SNSEC_OFFSET	// sec
-	;;
-	ld8 r8 = [r22],-IA64_TIME_SN_SPEC_SNSEC_OFFSET	// snsec
-(p13)	sub r3 = r25,r2		// Diff needed before comparison (thanks davidm)
-	;;
-(p13)	cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
-	sub r10 = r2,r24	// current_cycle - last_cycle
-	;;
-(p6)	sub r10 = r25,r24	// time we got was less than last_cycle
-(p7)	mov ar.ccv = r25	// more than last_cycle. Prep for cmpxchg
-	;;
-(p7)	cmpxchg8.rel r3 = [r19],r2,ar.ccv
-	;;
-(p7)	cmp.ne p7,p0 = r25,r3	// if cmpxchg not successful
-	;;
-(p7)	sub r10 = r3,r24	// then use new last_cycle instead
-	;;
-	and r10 = r10,r14	// Apply mask
-	;;
-	setf.sig f8 = r10
-	nop.i 123
-	;;
-	// fault check takes 5 cycles and we have spare time
-EX(.fail_efault, probe.w.fault r31, 3)
-	xmpy.l f8 = f8,f7	// nsec_per_cyc*(counter-last_counter)
-	;;
-	getf.sig r2 = f8
-	mf
-	;;
-	ld4 r10 = [r20]		// gtod_lock.sequence
-	add r8 = r8,r2		// Add xtime.nsecs
-	;;
-	shr.u r8 = r8,r23	// shift by factor
-	cmp4.ne p7,p0 = r28,r10
-(p7)	br.cond.dpnt.few .time_redo	// sequence number changed, redo
-	// End critical section.
-	// Now r8=tv->tv_nsec and r9=tv->tv_sec
-	mov r10 = r0
-	movl r2 = 1000000000
-	add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31
-(p14)	movl r3 = 2361183241434822607	// Prep for / 1000 hack
-	;;
-.time_normalize:
-	mov r21 = r8
-	cmp.ge p6,p0 = r8,r2
-(p14)	shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time
-	;;
-(p14)	setf.sig f8 = r20
-(p6)	sub r8 = r8,r2
-(p6)	add r9 = 1,r9		// two nops before the branch.
-(p14)	setf.sig f7 = r3	// Chances for repeats are 1 in 10000 for gettod
-(p6)	br.cond.dpnt.few .time_normalize
-	;;
-	// Divided by 8 though shift. Now divide by 125
-	// The compiler was able to do that with a multiply
-	// and a shift and we do the same
-EX(.fail_efault, probe.w.fault r23, 3)	// This also costs 5 cycles
-(p14)	xmpy.hu f8 = f8, f7		// xmpy has 5 cycles latency so use it
-	;;
-(p14)	getf.sig r2 = f8
-	;;
-	mov r8 = r0
-(p14)	shr.u r21 = r2, 4
-	;;
-EX(.fail_efault, st8 [r31] = r9)
-EX(.fail_efault, st8 [r23] = r21)
-	FSYS_RETURN
-.fail_einval:
-	mov r8 = EINVAL
-	mov r10 = -1
-	FSYS_RETURN
-.fail_efault:
-	mov r8 = EFAULT
-	mov r10 = -1
-	FSYS_RETURN
-END(fsys_gettimeofday)
-
-ENTRY(fsys_clock_gettime)
-	.prologue
-	.altrp b6
-	.body
-	cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32
-	// Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC
-(p6)	br.spnt.few fsys_fallback_syscall
-	mov r31 = r33
-	shl r30 = r32,15
-	br.many .gettime
-END(fsys_clock_gettime)
-
-/*
- * fsys_getcpu doesn't use the third parameter in this implementation. It reads
- * current_thread_info()->cpu and corresponding node in cpu_to_node_map.
- */
-ENTRY(fsys_getcpu)
-	.prologue
-	.altrp b6
-	.body
-	;;
-	add r2=TI_FLAGS+IA64_TASK_SIZE,r16
-	tnat.nz p6,p0 = r32			// guard against NaT argument
-	add r3=TI_CPU+IA64_TASK_SIZE,r16
-	;;
-	ld4 r3=[r3]				// M r3 = thread_info->cpu
-	ld4 r2=[r2]				// M r2 = thread_info->flags
-(p6)    br.cond.spnt.few .fail_einval		// B
-	;;
-	tnat.nz p7,p0 = r33			// I guard against NaT argument
-(p7)    br.cond.spnt.few .fail_einval		// B
-	;;
-	cmp.ne p6,p0=r32,r0
-	cmp.ne p7,p0=r33,r0
-	;;
-#ifdef CONFIG_NUMA
-	movl r17=cpu_to_node_map
-	;;
-EX(.fail_efault, (p6) probe.w.fault r32, 3)		// M This takes 5 cycles
-EX(.fail_efault, (p7) probe.w.fault r33, 3)		// M This takes 5 cycles
-	shladd r18=r3,1,r17
-	;;
-	ld2 r20=[r18]				// r20 = cpu_to_node_map[cpu]
-	and r2 = TIF_ALLWORK_MASK,r2
-	;;
-	cmp.ne p8,p0=0,r2
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-	;;
-EX(.fail_efault, (p6) st4 [r32] = r3)
-EX(.fail_efault, (p7) st2 [r33] = r20)
-	mov r8=0
-	;;
-#else
-EX(.fail_efault, (p6) probe.w.fault r32, 3)		// M This takes 5 cycles
-EX(.fail_efault, (p7) probe.w.fault r33, 3)		// M This takes 5 cycles
-	and r2 = TIF_ALLWORK_MASK,r2
-	;;
-	cmp.ne p8,p0=0,r2
-(p8)	br.spnt.many fsys_fallback_syscall
-	;;
-EX(.fail_efault, (p6) st4 [r32] = r3)
-EX(.fail_efault, (p7) st2 [r33] = r0)
-	mov r8=0
-	;;
-#endif
-	FSYS_RETURN
-END(fsys_getcpu)
-
-ENTRY(fsys_fallback_syscall)
-	.prologue
-	.altrp b6
-	.body
-	/*
-	 * We only get here from light-weight syscall handlers.  Thus, we already
-	 * know that r15 contains a valid syscall number.  No need to re-check.
-	 */
-	adds r17=-1024,r15
-	movl r14=sys_call_table
-	;;
-	RSM_PSR_I(p0, r26, r27)
-	shladd r18=r17,3,r14
-	;;
-	ld8 r18=[r18]				// load normal (heavy-weight) syscall entry-point
-	MOV_FROM_PSR(p0, r29, r26)		// read psr (12 cyc load latency)
-	mov r27=ar.rsc
-	mov r21=ar.fpsr
-	mov r26=ar.pfs
-END(fsys_fallback_syscall)
-	/* FALL THROUGH */
-GLOBAL_ENTRY(fsys_bubble_down)
-	.prologue
-	.altrp b6
-	.body
-	/*
-	 * We get here for syscalls that don't have a lightweight
-	 * handler.  For those, we need to bubble down into the kernel
-	 * and that requires setting up a minimal pt_regs structure,
-	 * and initializing the CPU state more or less as if an
-	 * interruption had occurred.  To make syscall-restarts work,
-	 * we setup pt_regs such that cr_iip points to the second
-	 * instruction in syscall_via_break.  Decrementing the IP
-	 * hence will restart the syscall via break and not
-	 * decrementing IP will return us to the caller, as usual.
-	 * Note that we preserve the value of psr.pp rather than
-	 * initializing it from dcr.pp.  This makes it possible to
-	 * distinguish fsyscall execution from other privileged
-	 * execution.
-	 *
-	 * On entry:
-	 *	- normal fsyscall handler register usage, except
-	 *	  that we also have:
-	 *	- r18: address of syscall entry point
-	 *	- r21: ar.fpsr
-	 *	- r26: ar.pfs
-	 *	- r27: ar.rsc
-	 *	- r29: psr
-	 *
-	 * We used to clear some PSR bits here but that requires slow
-	 * serialization.  Fortunately, that isn't really necessary.
-	 * The rationale is as follows: we used to clear bits
-	 * ~PSR_PRESERVED_BITS in PSR.L.  Since
-	 * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we
-	 * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}.
-	 * However,
-	 *
-	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
-	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
-	 * PSR.I  : already turned off by the time fsys_bubble_down gets
-	 *	    invoked
-	 * PSR.DFL: always 0 (kernel never turns it on)
-	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
-	 *	    initiative
-	 * PSR.DI : always 0 (kernel never turns it on)
-	 * PSR.SI : always 0 (kernel never turns it on)
-	 * PSR.DB : don't care --- kernel never enables kernel-level
-	 *	    breakpoints
-	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
-	 *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
-	 *          will trigger a taken branch; the taken-trap-handler then
-	 *          converts the syscall into a break-based system-call.
-	 */
-	/*
-	 * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc.
-	 * The rest we have to synthesize.
-	 */
-#	define PSR_ONE_BITS		((3 << IA64_PSR_CPL0_BIT)	\
-					 | (0x1 << IA64_PSR_RI_BIT)	\
-					 | IA64_PSR_BN | IA64_PSR_I)
-
-	invala					// M0|1
-	movl r14=ia64_ret_from_syscall		// X
-
-	nop.m 0
-	movl r28=__kernel_syscall_via_break	// X	create cr.iip
-	;;
-
-	mov r2=r16				// A    get task addr to addl-addressable register
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A
-	mov r31=pr				// I0   save pr (2 cyc)
-	;;
-	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
-	addl r22=IA64_RBS_OFFSET,r2		// A    compute base of RBS
-	add r3=TI_FLAGS+IA64_TASK_SIZE,r2	// A
-	;;
-	ld4 r3=[r3]				// M0|1 r3 = current_thread_info()->flags
-	lfetch.fault.excl.nt1 [r22]		// M0|1 prefetch register backing-store
-	nop.i 0
-	;;
-	mov ar.rsc=0				// M2   set enforced lazy mode, pl 0, LE, loadrs=0
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	MOV_FROM_ITC(p0, p6, r30, r23)		// M    get cycle for accounting
-#else
-	nop.m 0
-#endif
-	nop.i 0
-	;;
-	mov r23=ar.bspstore			// M2 (12 cyc) save ar.bspstore
-	mov.m r24=ar.rnat			// M2 (5 cyc) read ar.rnat (dual-issues!)
-	nop.i 0
-	;;
-	mov ar.bspstore=r22			// M2 (6 cyc) switch to kernel RBS
-	movl r8=PSR_ONE_BITS			// X
-	;;
-	mov r25=ar.unat				// M2 (5 cyc) save ar.unat
-	mov r19=b6				// I0   save b6 (2 cyc)
-	mov r20=r1				// A    save caller's gp in r20
-	;;
-	or r29=r8,r29				// A    construct cr.ipsr value to save
-	mov b6=r18				// I0   copy syscall entry-point to b6 (7 cyc)
-	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack
-
-	mov r18=ar.bsp				// M2   save (kernel) ar.bsp (12 cyc)
-	cmp.ne pKStk,pUStk=r0,r0		// A    set pKStk <- 0, pUStk <- 1
-	br.call.sptk.many b7=ia64_syscall_setup	// B
-	;;
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	// mov.m r30=ar.itc is called in advance
-	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2
-	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2
-	;;
-	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// time at last check in kernel
-	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// time at leave kernel
-	;;
-	ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME	// cumulated stime
-	ld8 r21=[r17]				// cumulated utime
-	sub r22=r19,r18				// stime before leave kernel
-	;;
-	st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP	// update stamp
-	sub r18=r30,r19				// elapsed time in user mode
-	;;
-	add r20=r20,r22				// sum stime
-	add r21=r21,r18				// sum utime
-	;;
-	st8 [r16]=r20				// update stime
-	st8 [r17]=r21				// update utime
-	;;
-#endif
-	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
-	mov rp=r14				// I0   set the real return addr
-	and r3=_TIF_SYSCALL_TRACEAUDIT,r3	// A
-	;;
-	SSM_PSR_I(p0, p6, r22)			// M2   we're on kernel stacks now, reenable irqs
-	cmp.eq p8,p0=r3,r0			// A
-(p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
-
-	nop.m 0
-(p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
-	br.cond.spnt ia64_trace_syscall		// B
-END(fsys_bubble_down)
-
-	.rodata
-	.align 8
-	.globl fsyscall_table
-
-	data8 fsys_bubble_down
-fsyscall_table:
-	data8 fsys_ni_syscall
-	data8 0				// exit			// 1025
-	data8 0				// read
-	data8 0				// write
-	data8 0				// open
-	data8 0				// close
-	data8 0				// creat		// 1030
-	data8 0				// link
-	data8 0				// unlink
-	data8 0				// execve
-	data8 0				// chdir
-	data8 0				// fchdir		// 1035
-	data8 0				// utimes
-	data8 0				// mknod
-	data8 0				// chmod
-	data8 0				// chown
-	data8 0				// lseek		// 1040
-	data8 fsys_getpid		// getpid
-	data8 0				// getppid
-	data8 0				// mount
-	data8 0				// umount
-	data8 0				// setuid		// 1045
-	data8 0				// getuid
-	data8 0				// geteuid
-	data8 0				// ptrace
-	data8 0				// access
-	data8 0				// sync			// 1050
-	data8 0				// fsync
-	data8 0				// fdatasync
-	data8 0				// kill
-	data8 0				// rename
-	data8 0				// mkdir		// 1055
-	data8 0				// rmdir
-	data8 0				// dup
-	data8 0				// pipe
-	data8 0				// times
-	data8 0				// brk			// 1060
-	data8 0				// setgid
-	data8 0				// getgid
-	data8 0				// getegid
-	data8 0				// acct
-	data8 0				// ioctl		// 1065
-	data8 0				// fcntl
-	data8 0				// umask
-	data8 0				// chroot
-	data8 0				// ustat
-	data8 0				// dup2			// 1070
-	data8 0				// setreuid
-	data8 0				// setregid
-	data8 0				// getresuid
-	data8 0				// setresuid
-	data8 0				// getresgid		// 1075
-	data8 0				// setresgid
-	data8 0				// getgroups
-	data8 0				// setgroups
-	data8 0				// getpgid
-	data8 0				// setpgid		// 1080
-	data8 0				// setsid
-	data8 0				// getsid
-	data8 0				// sethostname
-	data8 0				// setrlimit
-	data8 0				// getrlimit		// 1085
-	data8 0				// getrusage
-	data8 fsys_gettimeofday		// gettimeofday
-	data8 0				// settimeofday
-	data8 0				// select
-	data8 0				// poll			// 1090
-	data8 0				// symlink
-	data8 0				// readlink
-	data8 0				// uselib
-	data8 0				// swapon
-	data8 0				// swapoff		// 1095
-	data8 0				// reboot
-	data8 0				// truncate
-	data8 0				// ftruncate
-	data8 0				// fchmod
-	data8 0				// fchown		// 1100
-	data8 0				// getpriority
-	data8 0				// setpriority
-	data8 0				// statfs
-	data8 0				// fstatfs
-	data8 0				// gettid		// 1105
-	data8 0				// semget
-	data8 0				// semop
-	data8 0				// semctl
-	data8 0				// msgget
-	data8 0				// msgsnd		// 1110
-	data8 0				// msgrcv
-	data8 0				// msgctl
-	data8 0				// shmget
-	data8 0				// shmat
-	data8 0				// shmdt		// 1115
-	data8 0				// shmctl
-	data8 0				// syslog
-	data8 0				// setitimer
-	data8 0				// getitimer
-	data8 0					 		// 1120
-	data8 0
-	data8 0
-	data8 0				// vhangup
-	data8 0				// lchown
-	data8 0				// remap_file_pages	// 1125
-	data8 0				// wait4
-	data8 0				// sysinfo
-	data8 0				// clone
-	data8 0				// setdomainname
-	data8 0				// newuname		// 1130
-	data8 0				// adjtimex
-	data8 0
-	data8 0				// init_module
-	data8 0				// delete_module
-	data8 0							// 1135
-	data8 0
-	data8 0				// quotactl
-	data8 0				// bdflush
-	data8 0				// sysfs
-	data8 0				// personality		// 1140
-	data8 0				// afs_syscall
-	data8 0				// setfsuid
-	data8 0				// setfsgid
-	data8 0				// getdents
-	data8 0				// flock		// 1145
-	data8 0				// readv
-	data8 0				// writev
-	data8 0				// pread64
-	data8 0				// pwrite64
-	data8 0				// sysctl		// 1150
-	data8 0				// mmap
-	data8 0				// munmap
-	data8 0				// mlock
-	data8 0				// mlockall
-	data8 0				// mprotect		// 1155
-	data8 0				// mremap
-	data8 0				// msync
-	data8 0				// munlock
-	data8 0				// munlockall
-	data8 0				// sched_getparam	// 1160
-	data8 0				// sched_setparam
-	data8 0				// sched_getscheduler
-	data8 0				// sched_setscheduler
-	data8 0				// sched_yield
-	data8 0				// sched_get_priority_max	// 1165
-	data8 0				// sched_get_priority_min
-	data8 0				// sched_rr_get_interval
-	data8 0				// nanosleep
-	data8 0				// nfsservctl
-	data8 0				// prctl		// 1170
-	data8 0				// getpagesize
-	data8 0				// mmap2
-	data8 0				// pciconfig_read
-	data8 0				// pciconfig_write
-	data8 0				// perfmonctl		// 1175
-	data8 0				// sigaltstack
-	data8 0				// rt_sigaction
-	data8 0				// rt_sigpending
-	data8 0				// rt_sigprocmask
-	data8 0				// rt_sigqueueinfo	// 1180
-	data8 0				// rt_sigreturn
-	data8 0				// rt_sigsuspend
-	data8 0				// rt_sigtimedwait
-	data8 0				// getcwd
-	data8 0				// capget		// 1185
-	data8 0				// capset
-	data8 0				// sendfile
-	data8 0
-	data8 0
-	data8 0				// socket		// 1190
-	data8 0				// bind
-	data8 0				// connect
-	data8 0				// listen
-	data8 0				// accept
-	data8 0				// getsockname		// 1195
-	data8 0				// getpeername
-	data8 0				// socketpair
-	data8 0				// send
-	data8 0				// sendto
-	data8 0				// recv			// 1200
-	data8 0				// recvfrom
-	data8 0				// shutdown
-	data8 0				// setsockopt
-	data8 0				// getsockopt
-	data8 0				// sendmsg		// 1205
-	data8 0				// recvmsg
-	data8 0				// pivot_root
-	data8 0				// mincore
-	data8 0				// madvise
-	data8 0				// newstat		// 1210
-	data8 0				// newlstat
-	data8 0				// newfstat
-	data8 0				// clone2
-	data8 0				// getdents64
-	data8 0				// getunwind		// 1215
-	data8 0				// readahead
-	data8 0				// setxattr
-	data8 0				// lsetxattr
-	data8 0				// fsetxattr
-	data8 0				// getxattr		// 1220
-	data8 0				// lgetxattr
-	data8 0				// fgetxattr
-	data8 0				// listxattr
-	data8 0				// llistxattr
-	data8 0				// flistxattr		// 1225
-	data8 0				// removexattr
-	data8 0				// lremovexattr
-	data8 0				// fremovexattr
-	data8 0				// tkill
-	data8 0				// futex		// 1230
-	data8 0				// sched_setaffinity
-	data8 0				// sched_getaffinity
-	data8 fsys_set_tid_address	// set_tid_address
-	data8 0				// fadvise64_64
-	data8 0				// tgkill		// 1235
-	data8 0				// exit_group
-	data8 0				// lookup_dcookie
-	data8 0				// io_setup
-	data8 0				// io_destroy
-	data8 0				// io_getevents		// 1240
-	data8 0				// io_submit
-	data8 0				// io_cancel
-	data8 0				// epoll_create
-	data8 0				// epoll_ctl
-	data8 0				// epoll_wait		// 1245
-	data8 0				// restart_syscall
-	data8 0				// semtimedop
-	data8 0				// timer_create
-	data8 0				// timer_settime
-	data8 0				// timer_gettime 	// 1250
-	data8 0				// timer_getoverrun
-	data8 0				// timer_delete
-	data8 0				// clock_settime
-	data8 fsys_clock_gettime	// clock_gettime
-	data8 0				// clock_getres		// 1255
-	data8 0				// clock_nanosleep
-	data8 0				// fstatfs64
-	data8 0				// statfs64
-	data8 0				// mbind
-	data8 0				// get_mempolicy	// 1260
-	data8 0				// set_mempolicy
-	data8 0				// mq_open
-	data8 0				// mq_unlink
-	data8 0				// mq_timedsend
-	data8 0				// mq_timedreceive	// 1265
-	data8 0				// mq_notify
-	data8 0				// mq_getsetattr
-	data8 0				// kexec_load
-	data8 0				// vserver
-	data8 0				// waitid		// 1270
-	data8 0				// add_key
-	data8 0				// request_key
-	data8 0				// keyctl
-	data8 0				// ioprio_set
-	data8 0				// ioprio_get		// 1275
-	data8 0				// move_pages
-	data8 0				// inotify_init
-	data8 0				// inotify_add_watch
-	data8 0				// inotify_rm_watch
-	data8 0				// migrate_pages	// 1280
-	data8 0				// openat
-	data8 0				// mkdirat
-	data8 0				// mknodat
-	data8 0				// fchownat
-	data8 0				// futimesat		// 1285
-	data8 0				// newfstatat
-	data8 0				// unlinkat
-	data8 0				// renameat
-	data8 0				// linkat
-	data8 0				// symlinkat		// 1290
-	data8 0				// readlinkat
-	data8 0				// fchmodat
-	data8 0				// faccessat
-	data8 0
-	data8 0							// 1295
-	data8 0				// unshare
-	data8 0				// splice
-	data8 0				// set_robust_list
-	data8 0				// get_robust_list
-	data8 0				// sync_file_range	// 1300
-	data8 0				// tee
-	data8 0				// vmsplice
-	data8 0
-	data8 fsys_getcpu		// getcpu		// 1304
-
-	// fill in zeros for the remaining entries
-	.zero:
-	.space fsyscall_table + 8*NR_syscalls - .zero, 0
diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h
deleted file mode 100644
index cc2861445965..000000000000
--- a/arch/ia64/kernel/fsyscall_gtod_data.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
- *        Contributed by Peter Keilty <peter.keilty@hp.com>
- *
- * fsyscall gettimeofday data
- */
-
-/* like timespec, but includes "shifted nanoseconds" */
-struct time_sn_spec {
-	u64	sec;
-	u64	snsec;
-};
-
-struct fsyscall_gtod_data_t {
-	seqcount_t	seq;
-	struct time_sn_spec wall_time;
-	struct time_sn_spec monotonic_time;
-	u64		clk_mask;
-	u32		clk_mult;
-	u32		clk_shift;
-	void		*clk_fsys_mmio;
-	u64		clk_cycle_last;
-} ____cacheline_aligned;
-
-struct itc_jitter_data_t {
-	int		itc_jitter;
-	u64		itc_lastcycle;
-} ____cacheline_aligned;
-
diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c
deleted file mode 100644
index d6360fd404ab..000000000000
--- a/arch/ia64/kernel/ftrace.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Dynamic function tracing support.
- *
- * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com>
- *
- * For licencing details, see COPYING.
- *
- * Defines low-level handling of mcount calls when the kernel
- * is compiled with the -pg flag. When using dynamic ftrace, the
- * mcount call-sites get patched lazily with NOP till they are
- * enabled. All code mutation routines here take effect atomically.
- */
-
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-
-#include <asm/cacheflush.h>
-#include <asm/patch.h>
-
-/* In IA64, each function will be added below two bundles with -pg option */
-static unsigned char __attribute__((aligned(8)))
-ftrace_orig_code[MCOUNT_INSN_SIZE] = {
-	0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */
-	0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */
-	0x05, 0x00, 0xc4, 0x00,             /* mov r42=b0 */
-	0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */
-	0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */
-	0x08, 0x00, 0x00, 0x50              /* br.call.sptk.many b0 = _mcount;; */
-};
-
-struct ftrace_orig_insn {
-	u64 dummy1, dummy2, dummy3;
-	u64 dummy4:64-41+13;
-	u64 imm20:20;
-	u64 dummy5:3;
-	u64 sign:1;
-	u64 dummy6:4;
-};
-
-/* mcount stub will be converted below for nop */
-static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = {
-	0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
-	0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
-	0x00, 0x00, 0x04, 0x00,             /* nop.i 0x0 */
-	0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
-	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */
-	0x00, 0x00, 0x04, 0x00
-};
-
-static unsigned char *ftrace_nop_replace(void)
-{
-	return ftrace_nop_code;
-}
-
-/*
- * mcount stub will be converted below for call
- * Note: Just the last instruction is changed against nop
- * */
-static unsigned char __attribute__((aligned(8)))
-ftrace_call_code[MCOUNT_INSN_SIZE] = {
-	0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */
-	0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */
-	0x00, 0x00, 0x04, 0x00,             /* nop.i 0x0 */
-	0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */
-	0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/
-	0xf8, 0xff, 0xff, 0xc8
-};
-
-struct ftrace_call_insn {
-	u64 dummy1, dummy2;
-	u64 dummy3:48;
-	u64 imm39_l:16;
-	u64 imm39_h:23;
-	u64 dummy4:13;
-	u64 imm20:20;
-	u64 dummy5:3;
-	u64 i:1;
-	u64 dummy6:4;
-};
-
-static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
-	struct ftrace_call_insn *code = (void *)ftrace_call_code;
-	unsigned long offset = addr - (ip + 0x10);
-
-	code->imm39_l = offset >> 24;
-	code->imm39_h = offset >> 40;
-	code->imm20 = offset >> 4;
-	code->i = offset >> 63;
-	return ftrace_call_code;
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-		   unsigned char *new_code, int do_check)
-{
-	unsigned char replaced[MCOUNT_INSN_SIZE];
-
-	/*
-	 * Note:
-	 * We are paranoid about modifying text, as if a bug was to happen, it
-	 * could cause us to read or write to someplace that could cause harm.
-	 * Carefully read and modify the code with probe_kernel_*(), and make
-	 * sure what we read is what we expected it to be before modifying it.
-	 */
-
-	if (!do_check)
-		goto skip_check;
-
-	/* read the text we want to modify */
-	if (copy_from_kernel_nofault(replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/* Make sure it is what we expect it to be */
-	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
-		return -EINVAL;
-
-skip_check:
-	/* replace the text with the new text */
-	if (copy_to_kernel_nofault(((void *)ip), new_code, MCOUNT_INSN_SIZE))
-		return -EPERM;
-	flush_icache_range(ip, ip + MCOUNT_INSN_SIZE);
-
-	return 0;
-}
-
-static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE];
-	unsigned long ip = rec->ip;
-
-	if (copy_from_kernel_nofault(replaced, (void *)ip, MCOUNT_INSN_SIZE))
-		return -EFAULT;
-	if (rec->flags & FTRACE_FL_CONVERTED) {
-		struct ftrace_call_insn *call_insn, *tmp_call;
-
-		call_insn = (void *)ftrace_call_code;
-		tmp_call = (void *)replaced;
-		call_insn->imm39_l = tmp_call->imm39_l;
-		call_insn->imm39_h = tmp_call->imm39_h;
-		call_insn->imm20 = tmp_call->imm20;
-		call_insn->i = tmp_call->i;
-		if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0)
-			return -EINVAL;
-		return 0;
-	} else {
-		struct ftrace_orig_insn *call_insn, *tmp_call;
-
-		call_insn = (void *)ftrace_orig_code;
-		tmp_call = (void *)replaced;
-		call_insn->sign = tmp_call->sign;
-		call_insn->imm20 = tmp_call->imm20;
-		if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0)
-			return -EINVAL;
-		return 0;
-	}
-}
-
-int ftrace_make_nop(struct module *mod,
-		    struct dyn_ftrace *rec, unsigned long addr)
-{
-	int ret;
-	char *new;
-
-	ret = ftrace_make_nop_check(rec, addr);
-	if (ret)
-		return ret;
-	new = ftrace_nop_replace();
-	return ftrace_modify_code(rec->ip, NULL, new, 0);
-}
-
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
-{
-	unsigned long ip = rec->ip;
-	unsigned char *old, *new;
-
-	old=  ftrace_nop_replace();
-	new = ftrace_call_replace(ip, addr);
-	return ftrace_modify_code(ip, old, new, 1);
-}
-
-/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */
-int ftrace_update_ftrace_func(ftrace_func_t func)
-{
-	unsigned long ip;
-	unsigned long addr = ((struct fnptr *)ftrace_call)->ip;
-
-	if (func == ftrace_stub)
-		return 0;
-	ip = ((struct fnptr *)func)->ip;
-
-	ia64_patch_imm64(addr + 2, ip);
-
-	flush_icache_range(addr, addr + 16);
-	return 0;
-}
diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S
deleted file mode 100644
index b3ef1c72e132..000000000000
--- a/arch/ia64/kernel/gate-data.S
+++ /dev/null
@@ -1,3 +0,0 @@
-	.section .data..gate, "aw"
-
-	.incbin "arch/ia64/kernel/gate.so"
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
deleted file mode 100644
index 9f235cd551ab..000000000000
--- a/arch/ia64/kernel/gate.S
+++ /dev/null
@@ -1,380 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file contains the code that gets mapped at the upper end of each task's text
- * region.  For now, it contains the signal trampoline code only.
- *
- * Copyright (C) 1999-2003 Hewlett-Packard Co
- * 	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-
-#include <asm/asmmacro.h>
-#include <asm/errno.h>
-#include <asm/asm-offsets.h>
-#include <asm/sigcontext.h>
-#include <asm/unistd.h>
-#include <asm/kregs.h>
-#include <asm/page.h>
-#include <asm/native/inst.h>
-
-/*
- * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
- * complications with the linker (which likes to create PLT stubs for branches
- * to targets outside the shared object) and to avoid multi-phase kernel builds, we
- * simply create minimalistic "patch lists" in special ELF sections.
- */
-	.section ".data..patch.fsyscall_table", "a"
-	.previous
-#define LOAD_FSYSCALL_TABLE(reg)			\
-[1:]	movl reg=0;					\
-	.xdata4 ".data..patch.fsyscall_table", 1b-.
-
-	.section ".data..patch.brl_fsys_bubble_down", "a"
-	.previous
-#define BRL_COND_FSYS_BUBBLE_DOWN(pr)			\
-[1:](pr)brl.cond.sptk 0;				\
-	;;						\
-	.xdata4 ".data..patch.brl_fsys_bubble_down", 1b-.
-
-GLOBAL_ENTRY(__kernel_syscall_via_break)
-	.prologue
-	.altrp b6
-	.body
-	/*
-	 * Note: for (fast) syscall restart to work, the break instruction must be
-	 *	 the first one in the bundle addressed by syscall_via_break.
-	 */
-{ .mib
-	break 0x100000
-	nop.i 0
-	br.ret.sptk.many b6
-}
-END(__kernel_syscall_via_break)
-
-#	define ARG0_OFF		(16 + IA64_SIGFRAME_ARG0_OFFSET)
-#	define ARG1_OFF		(16 + IA64_SIGFRAME_ARG1_OFFSET)
-#	define ARG2_OFF		(16 + IA64_SIGFRAME_ARG2_OFFSET)
-#	define SIGHANDLER_OFF	(16 + IA64_SIGFRAME_HANDLER_OFFSET)
-#	define SIGCONTEXT_OFF	(16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET)
-
-#	define FLAGS_OFF	IA64_SIGCONTEXT_FLAGS_OFFSET
-#	define CFM_OFF		IA64_SIGCONTEXT_CFM_OFFSET
-#	define FR6_OFF		IA64_SIGCONTEXT_FR6_OFFSET
-#	define BSP_OFF		IA64_SIGCONTEXT_AR_BSP_OFFSET
-#	define RNAT_OFF		IA64_SIGCONTEXT_AR_RNAT_OFFSET
-#	define UNAT_OFF		IA64_SIGCONTEXT_AR_UNAT_OFFSET
-#	define FPSR_OFF		IA64_SIGCONTEXT_AR_FPSR_OFFSET
-#	define PR_OFF		IA64_SIGCONTEXT_PR_OFFSET
-#	define RP_OFF		IA64_SIGCONTEXT_IP_OFFSET
-#	define SP_OFF		IA64_SIGCONTEXT_R12_OFFSET
-#	define RBS_BASE_OFF	IA64_SIGCONTEXT_RBS_BASE_OFFSET
-#	define LOADRS_OFF	IA64_SIGCONTEXT_LOADRS_OFFSET
-#	define base0		r2
-#	define base1		r3
-	/*
-	 * When we get here, the memory stack looks like this:
-	 *
-	 *   +===============================+
-       	 *   |				     |
-       	 *   //	    struct sigframe          //
-       	 *   |				     |
-	 *   +-------------------------------+ <-- sp+16
-	 *   |      16 byte of scratch       |
-	 *   |            space              |
-	 *   +-------------------------------+ <-- sp
-	 *
-	 * The register stack looks _exactly_ the way it looked at the time the signal
-	 * occurred.  In other words, we're treading on a potential mine-field: each
-	 * incoming general register may be a NaT value (including sp, in which case the
-	 * process ends up dying with a SIGSEGV).
-	 *
-	 * The first thing need to do is a cover to get the registers onto the backing
-	 * store.  Once that is done, we invoke the signal handler which may modify some
-	 * of the machine state.  After returning from the signal handler, we return
-	 * control to the previous context by executing a sigreturn system call.  A signal
-	 * handler may call the rt_sigreturn() function to directly return to a given
-	 * sigcontext.  However, the user-level sigreturn() needs to do much more than
-	 * calling the rt_sigreturn() system call as it needs to unwind the stack to
-	 * restore preserved registers that may have been saved on the signal handler's
-	 * call stack.
-	 */
-
-#define SIGTRAMP_SAVES										\
-	.unwabi 3, 's';		/* mark this as a sigtramp handler (saves scratch regs) */	\
-	.unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */	\
-	.savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF;						\
-	.savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF;						\
-	.savesp pr, PR_OFF+SIGCONTEXT_OFF;     							\
-	.savesp rp, RP_OFF+SIGCONTEXT_OFF;							\
-	.savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF;							\
-	.vframesp SP_OFF+SIGCONTEXT_OFF
-
-GLOBAL_ENTRY(__kernel_sigtramp)
-	// describe the state that is active when we get here:
-	.prologue
-	SIGTRAMP_SAVES
-	.body
-
-	.label_state 1
-
-	adds base0=SIGHANDLER_OFF,sp
-	adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp
-	br.call.sptk.many rp=1f
-1:
-	ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF)	// get pointer to signal handler's plabel
-	ld8 r15=[base1]					// get address of new RBS base (or NULL)
-	cover				// push args in interrupted frame onto backing store
-	;;
-	cmp.ne p1,p0=r15,r0		// do we need to switch rbs? (note: pr is saved by kernel)
-	mov.m r9=ar.bsp			// fetch ar.bsp
-	.spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
-(p1)	br.cond.spnt setup_rbs		// yup -> (clobbers p8, r14-r16, and r18-r20)
-back_from_setup_rbs:
-	alloc r8=ar.pfs,0,0,3,0
-	ld8 out0=[base0],16		// load arg0 (signum)
-	adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1
-	;;
-	ld8 out1=[base1]		// load arg1 (siginfop)
-	ld8 r10=[r17],8			// get signal handler entry point
-	;;
-	ld8 out2=[base0]		// load arg2 (sigcontextp)
-	ld8 gp=[r17]			// get signal handler's global pointer
-	adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
-	;;
-	.spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF
-	st8 [base0]=r9			// save sc_ar_bsp
-	adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
-	adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
-	;;
-	stf.spill [base0]=f6,32
-	stf.spill [base1]=f7,32
-	;;
-	stf.spill [base0]=f8,32
-	stf.spill [base1]=f9,32
-	mov b6=r10
-	;;
-	stf.spill [base0]=f10,32
-	stf.spill [base1]=f11,32
-	;;
-	stf.spill [base0]=f12,32
-	stf.spill [base1]=f13,32
-	;;
-	stf.spill [base0]=f14,32
-	stf.spill [base1]=f15,32
-	br.call.sptk.many rp=b6			// call the signal handler
-.ret0:	adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
-	;;
-	ld8 r15=[base0]				// fetch sc_ar_bsp
-	mov r14=ar.bsp
-	;;
-	cmp.ne p1,p0=r14,r15			// do we need to restore the rbs?
-(p1)	br.cond.spnt restore_rbs		// yup -> (clobbers r14-r18, f6 & f7)
-	;;
-back_from_restore_rbs:
-	adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
-	adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
-	;;
-	ldf.fill f6=[base0],32
-	ldf.fill f7=[base1],32
-	;;
-	ldf.fill f8=[base0],32
-	ldf.fill f9=[base1],32
-	;;
-	ldf.fill f10=[base0],32
-	ldf.fill f11=[base1],32
-	;;
-	ldf.fill f12=[base0],32
-	ldf.fill f13=[base1],32
-	;;
-	ldf.fill f14=[base0],32
-	ldf.fill f15=[base1],32
-	mov r15=__NR_rt_sigreturn
-	.restore sp				// pop .prologue
-	break __BREAK_SYSCALL
-
-	.prologue
-	SIGTRAMP_SAVES
-setup_rbs:
-	mov ar.rsc=0				// put RSE into enforced lazy mode
-	;;
-	.save ar.rnat, r19
-	mov r19=ar.rnat				// save RNaT before switching backing store area
-	adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp
-
-	mov r18=ar.bspstore
-	mov ar.bspstore=r15			// switch over to new register backing store area
-	;;
-
-	.spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
-	st8 [r14]=r19				// save sc_ar_rnat
-	.body
-	mov.m r16=ar.bsp			// sc_loadrs <- (new bsp - new bspstore) << 16
-	adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp
-	;;
-	invala
-	sub r15=r16,r15
-	extr.u r20=r18,3,6
-	;;
-	mov ar.rsc=0xf				// set RSE into eager mode, pl 3
-	cmp.eq p8,p0=63,r20
-	shl r15=r15,16
-	;;
-	st8 [r14]=r15				// save sc_loadrs
-(p8)	st8 [r18]=r19		// if bspstore points at RNaT slot, store RNaT there now
-	.restore sp				// pop .prologue
-	br.cond.sptk back_from_setup_rbs
-
-	.prologue
-	SIGTRAMP_SAVES
-	.spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
-	.body
-restore_rbs:
-	// On input:
-	//	r14 = bsp1 (bsp at the time of return from signal handler)
-	//	r15 = bsp0 (bsp at the time the signal occurred)
-	//
-	// Here, we need to calculate bspstore0, the value that ar.bspstore needs
-	// to be set to, based on bsp0 and the size of the dirty partition on
-	// the alternate stack (sc_loadrs >> 16).  This can be done with the
-	// following algorithm:
-	//
-	//  bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1));
-	//
-	// This is what the code below does.
-	//
-	alloc r2=ar.pfs,0,0,0,0			// alloc null frame
-	adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp
-	adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp
-	;;
-	ld8 r17=[r16]
-	ld8 r16=[r18]			// get new rnat
-	extr.u r18=r15,3,6	// r18 <- rse_slot_num(bsp0)
-	;;
-	mov ar.rsc=r17			// put RSE into enforced lazy mode
-	shr.u r17=r17,16
-	;;
-	sub r14=r14,r17		// r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16)
-	shr.u r17=r17,3		// r17 <- (sc_loadrs >> 19)
-	;;
-	loadrs			// restore dirty partition
-	extr.u r14=r14,3,6	// r14 <- rse_slot_num(bspstore1)
-	;;
-	add r14=r14,r17		// r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19)
-	;;
-	shr.u r14=r14,6		// r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40
-	;;
-	sub r14=r14,r17		// r14 <- -rse_num_regs(bspstore1, bsp1)
-	movl r17=0x8208208208208209
-	;;
-	add r18=r18,r14		// r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1)
-	setf.sig f7=r17
-	cmp.lt p7,p0=r14,r0	// p7 <- (r14 < 0)?
-	;;
-(p7)	adds r18=-62,r18	// delta -= 62
-	;;
-	setf.sig f6=r18
-	;;
-	xmpy.h f6=f6,f7
-	;;
-	getf.sig r17=f6
-	;;
-	add r17=r17,r18
-	shr r18=r18,63
-	;;
-	shr r17=r17,5
-	;;
-	sub r17=r17,r18		// r17 = delta/63
-	;;
-	add r17=r14,r17		// r17 <- delta/63 - rse_num_regs(bspstore1, bsp1)
-	;;
-	shladd r15=r17,3,r15	// r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1))
-	;;
-	mov ar.bspstore=r15			// switch back to old register backing store area
-	;;
-	mov ar.rnat=r16				// restore RNaT
-	mov ar.rsc=0xf				// (will be restored later on from sc_ar_rsc)
-	// invala not necessary as that will happen when returning to user-mode
-	br.cond.sptk back_from_restore_rbs
-END(__kernel_sigtramp)
-
-/*
- * On entry:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	b6  = return address
- * On exit:
- *	r11 = saved ar.pfs
- *	r15 = system call #
- *	b0  = saved return address
- *	all other "scratch" registers:	undefined
- *	all "preserved" registers:	same as on entry
- */
-
-GLOBAL_ENTRY(__kernel_syscall_via_epc)
-	.prologue
-	.altrp b6
-	.body
-{
-	/*
-	 * Note: the kernel cannot assume that the first two instructions in this
-	 * bundle get executed.  The remaining code must be safe even if
-	 * they do not get executed.
-	 */
-	adds r17=-1024,r15			// A
-	mov r10=0				// A    default to successful syscall execution
-	epc					// B	causes split-issue
-}
-	;;
-	RSM_PSR_BE_I(r20, r22)			// M2 (5 cyc to srlz.d)
-	LOAD_FSYSCALL_TABLE(r14)		// X
-	;;
-	mov r16=IA64_KR(CURRENT)		// M2 (12 cyc)
-	shladd r18=r17,3,r14			// A
-	mov r19=NR_syscalls-1			// A
-	;;
-	lfetch [r18]				// M0|1
-	MOV_FROM_PSR(p0, r29, r8)		// M2 (12 cyc)
-	// If r17 is a NaT, p6 will be zero
-	cmp.geu p6,p7=r19,r17			// A    (sysnr > 0 && sysnr < 1024+NR_syscalls)?
-	;;
-	mov r21=ar.fpsr				// M2 (12 cyc)
-	tnat.nz p10,p9=r15			// I0
-	mov.i r26=ar.pfs			// I0 (would stall anyhow due to srlz.d...)
-	;;
-	srlz.d					// M0 (forces split-issue) ensure PSR.BE==0
-(p6)	ld8 r18=[r18]				// M0|1
-	nop.i 0
-	;;
-	nop.m 0
-(p6)	tbit.z.unc p8,p0=r18,0			// I0 (dual-issues with "mov b7=r18"!)
-	nop.i 0
-	;;
-	SSM_PSR_I(p8, p14, r25)
-(p6)	mov b7=r18				// I0
-(p8)	br.dptk.many b7				// B
-
-	mov r27=ar.rsc				// M2 (12 cyc)
-/*
- * brl.cond doesn't work as intended because the linker would convert this branch
- * into a branch to a PLT.  Perhaps there will be a way to avoid this with some
- * future version of the linker.  In the meantime, we just use an indirect branch
- * instead.
- */
-#ifdef CONFIG_ITANIUM
-(p6)	add r14=-8,r14				// r14 <- addr of fsys_bubble_down entry
-	;;
-(p6)	ld8 r14=[r14]				// r14 <- fsys_bubble_down
-	;;
-(p6)	mov b7=r14
-(p6)	br.sptk.many b7
-#else
-	BRL_COND_FSYS_BUBBLE_DOWN(p6)
-#endif
-	SSM_PSR_I(p0, p14, r10)
-	mov r10=-1
-(p10)	mov r8=EINVAL
-(p9)	mov r8=ENOSYS
-	FSYS_RETURN
-
-END(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
deleted file mode 100644
index 461c7e69d465..000000000000
--- a/arch/ia64/kernel/gate.lds.S
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Linker script for gate DSO.  The gate pages are an ELF shared object
- * prelinked to its virtual address, with only one read-only segment and
- * one execute-only segment (both fit in one page).  This script controls
- * its layout.
- */
-
-#include <asm/page.h>
-
-SECTIONS
-{
-	. = GATE_ADDR + SIZEOF_HEADERS;
-
-	.hash			: { *(.hash) }		:readable
-	.gnu.hash		: { *(.gnu.hash) }
-	.dynsym			: { *(.dynsym) }
-	.dynstr			: { *(.dynstr) }
-	.gnu.version		: { *(.gnu.version) }
-	.gnu.version_d		: { *(.gnu.version_d) }
-	.gnu.version_r		: { *(.gnu.version_r) }
-
-	.note			: { *(.note*) }		:readable	:note
-
-	.dynamic		: { *(.dynamic) }	:readable	:dynamic
-
-	/*
-	 * This linker script is used both with -r and with -shared.  For
-	 * the layouts to match, we need to skip more than enough space for
-	 * the dynamic symbol table et al.  If this amount is insufficient,
-	 * ld -shared will barf.  Just increase it here.
-	 */
-	. = GATE_ADDR + 0x600;
-
-	.data..patch		: {
-		__start_gate_mckinley_e9_patchlist = .;
-		*(.data..patch.mckinley_e9)
-		__end_gate_mckinley_e9_patchlist = .;
-
-		__start_gate_vtop_patchlist = .;
-		*(.data..patch.vtop)
-		__end_gate_vtop_patchlist = .;
-
-		__start_gate_fsyscall_patchlist = .;
-		*(.data..patch.fsyscall_table)
-		__end_gate_fsyscall_patchlist = .;
-
-		__start_gate_brl_fsys_bubble_down_patchlist = .;
-		*(.data..patch.brl_fsys_bubble_down)
-		__end_gate_brl_fsys_bubble_down_patchlist = .;
-	}						:readable
-
-	.IA_64.unwind_info	: { *(.IA_64.unwind_info*) }
-	.IA_64.unwind		: { *(.IA_64.unwind*) }	:readable	:unwind
-#ifdef HAVE_BUGGY_SEGREL
-	.text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) }	:readable
-#else
-	. = ALIGN(PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1));
-	.text			: { *(.text) *(.text.*) }	:epc
-#endif
-
-	/DISCARD/		: {
-		*(.got.plt) *(.got)
-		*(.data .data.* .gnu.linkonce.d.*)
-		*(.dynbss)
-		*(.bss .bss.* .gnu.linkonce.b.*)
-		*(__ex_table)
-		*(__mca_table)
-	}
-}
-
-/*
- * ld does not recognize this name token; use the constant.
- */
-#define	PT_IA_64_UNWIND	0x70000001
-
-/*
- * We must supply the ELF program headers explicitly to get just one
- * PT_LOAD segment, and set the flags explicitly to make segments read-only.
- */
-PHDRS
-{
-	readable	PT_LOAD	FILEHDR	PHDRS	FLAGS(4);	/* PF_R */
-#ifndef HAVE_BUGGY_SEGREL
-	epc		PT_LOAD	FILEHDR PHDRS	FLAGS(1);	/* PF_X */
-#endif
-	dynamic		PT_DYNAMIC		FLAGS(4);	/* PF_R */
-	note		PT_NOTE			FLAGS(4);	/* PF_R */
-	unwind		PT_IA_64_UNWIND;
-}
-
-/*
- * This controls what symbols we export from the DSO.
- */
-VERSION
-{
-	LINUX_2.5 {
-	global:
-		__kernel_syscall_via_break;
-		__kernel_syscall_via_epc;
-		__kernel_sigtramp;
-
-	local: *;
-	};
-}
-
-/* The ELF entry point can be used to set the AT_SYSINFO value.  */
-ENTRY(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
deleted file mode 100644
index 85c8a57da402..000000000000
--- a/arch/ia64/kernel/head.S
+++ /dev/null
@@ -1,1167 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Here is where the ball gets rolling as far as the kernel is concerned.
- * When control is transferred to _start, the bootload has already
- * loaded us to the correct address.  All that's left to do here is
- * to set up the kernel's global pointer and jump to the kernel
- * entry point.
- *
- * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Intel Corp.
- * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com>
- * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com>
- *   -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2.
- * Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com>
- *   Support for CPU Hotplug
- */
-
-#include <linux/export.h>
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/fpu.h>
-#include <asm/kregs.h>
-#include <asm/mmu_context.h>
-#include <asm/asm-offsets.h>
-#include <asm/pal.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/mca_asm.h>
-#include <linux/init.h>
-#include <linux/linkage.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-#define SAL_PSR_BITS_TO_SET				\
-	(IA64_PSR_AC | IA64_PSR_BN | IA64_PSR_MFH | IA64_PSR_MFL)
-
-#define SAVE_FROM_REG(src, ptr, dest)	\
-	mov dest=src;;						\
-	st8 [ptr]=dest,0x08
-
-#define RESTORE_REG(reg, ptr, _tmp)		\
-	ld8 _tmp=[ptr],0x08;;				\
-	mov reg=_tmp
-
-#define SAVE_BREAK_REGS(ptr, _idx, _breg, _dest)\
-	mov ar.lc=IA64_NUM_DBG_REGS-1;; 			\
-	mov _idx=0;; 								\
-1: 												\
-	SAVE_FROM_REG(_breg[_idx], ptr, _dest);;	\
-	add _idx=1,_idx;;							\
-	br.cloop.sptk.many 1b
-
-#define RESTORE_BREAK_REGS(ptr, _idx, _breg, _tmp, _lbl)\
-	mov ar.lc=IA64_NUM_DBG_REGS-1;;			\
-	mov _idx=0;;							\
-_lbl:  RESTORE_REG(_breg[_idx], ptr, _tmp);;	\
-	add _idx=1, _idx;;						\
-	br.cloop.sptk.many _lbl
-
-#define SAVE_ONE_RR(num, _reg, _tmp) \
-	movl _tmp=(num<<61);;	\
-	mov _reg=rr[_tmp]
-
-#define SAVE_REGION_REGS(_tmp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) \
-	SAVE_ONE_RR(0,_r0, _tmp);; \
-	SAVE_ONE_RR(1,_r1, _tmp);; \
-	SAVE_ONE_RR(2,_r2, _tmp);; \
-	SAVE_ONE_RR(3,_r3, _tmp);; \
-	SAVE_ONE_RR(4,_r4, _tmp);; \
-	SAVE_ONE_RR(5,_r5, _tmp);; \
-	SAVE_ONE_RR(6,_r6, _tmp);; \
-	SAVE_ONE_RR(7,_r7, _tmp);;
-
-#define STORE_REGION_REGS(ptr, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) \
-	st8 [ptr]=_r0, 8;; \
-	st8 [ptr]=_r1, 8;; \
-	st8 [ptr]=_r2, 8;; \
-	st8 [ptr]=_r3, 8;; \
-	st8 [ptr]=_r4, 8;; \
-	st8 [ptr]=_r5, 8;; \
-	st8 [ptr]=_r6, 8;; \
-	st8 [ptr]=_r7, 8;;
-
-#define RESTORE_REGION_REGS(ptr, _idx1, _idx2, _tmp) \
-	mov		ar.lc=0x08-1;;						\
-	movl	_idx1=0x00;;						\
-RestRR:											\
-	dep.z	_idx2=_idx1,61,3;;					\
-	ld8		_tmp=[ptr],8;;						\
-	mov		rr[_idx2]=_tmp;;					\
-	srlz.d;;									\
-	add		_idx1=1,_idx1;;						\
-	br.cloop.sptk.few	RestRR
-
-#define SET_AREA_FOR_BOOTING_CPU(reg1, reg2) \
-	movl reg1=sal_state_for_booting_cpu;;	\
-	ld8 reg2=[reg1];;
-
-/*
- * Adjust region registers saved before starting to save
- * break regs and rest of the states that need to be preserved.
- */
-#define SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(_reg1,_reg2,_pred)  \
-	SAVE_FROM_REG(b0,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b1,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b2,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b3,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b4,_reg1,_reg2);;						\
-	SAVE_FROM_REG(b5,_reg1,_reg2);;						\
-	st8 [_reg1]=r1,0x08;;								\
-	st8 [_reg1]=r12,0x08;;								\
-	st8 [_reg1]=r13,0x08;;								\
-	SAVE_FROM_REG(ar.fpsr,_reg1,_reg2);;				\
-	SAVE_FROM_REG(ar.pfs,_reg1,_reg2);;					\
-	SAVE_FROM_REG(ar.rnat,_reg1,_reg2);;				\
-	SAVE_FROM_REG(ar.unat,_reg1,_reg2);;				\
-	SAVE_FROM_REG(ar.bspstore,_reg1,_reg2);;			\
-	SAVE_FROM_REG(cr.dcr,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.iva,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.pta,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.itv,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.pmv,_reg1,_reg2);;					\
-	SAVE_FROM_REG(cr.cmcv,_reg1,_reg2);;				\
-	SAVE_FROM_REG(cr.lrr0,_reg1,_reg2);;				\
-	SAVE_FROM_REG(cr.lrr1,_reg1,_reg2);;				\
-	st8 [_reg1]=r4,0x08;;								\
-	st8 [_reg1]=r5,0x08;;								\
-	st8 [_reg1]=r6,0x08;;								\
-	st8 [_reg1]=r7,0x08;;								\
-	st8 [_reg1]=_pred,0x08;;							\
-	SAVE_FROM_REG(ar.lc, _reg1, _reg2);;				\
-	stf.spill.nta [_reg1]=f2,16;;						\
-	stf.spill.nta [_reg1]=f3,16;;						\
-	stf.spill.nta [_reg1]=f4,16;;						\
-	stf.spill.nta [_reg1]=f5,16;;						\
-	stf.spill.nta [_reg1]=f16,16;;						\
-	stf.spill.nta [_reg1]=f17,16;;						\
-	stf.spill.nta [_reg1]=f18,16;;						\
-	stf.spill.nta [_reg1]=f19,16;;						\
-	stf.spill.nta [_reg1]=f20,16;;						\
-	stf.spill.nta [_reg1]=f21,16;;						\
-	stf.spill.nta [_reg1]=f22,16;;						\
-	stf.spill.nta [_reg1]=f23,16;;						\
-	stf.spill.nta [_reg1]=f24,16;;						\
-	stf.spill.nta [_reg1]=f25,16;;						\
-	stf.spill.nta [_reg1]=f26,16;;						\
-	stf.spill.nta [_reg1]=f27,16;;						\
-	stf.spill.nta [_reg1]=f28,16;;						\
-	stf.spill.nta [_reg1]=f29,16;;						\
-	stf.spill.nta [_reg1]=f30,16;;						\
-	stf.spill.nta [_reg1]=f31,16;;
-
-#else
-#define SET_AREA_FOR_BOOTING_CPU(a1, a2)
-#define SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(a1,a2, a3)
-#define SAVE_REGION_REGS(_tmp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7)
-#define STORE_REGION_REGS(ptr, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7)
-#endif
-
-#define SET_ONE_RR(num, pgsize, _tmp1, _tmp2, vhpt) \
-	movl _tmp1=(num << 61);;	\
-	mov _tmp2=((ia64_rid(IA64_REGION_ID_KERNEL, (num<<61)) << 8) | (pgsize << 2) | vhpt);; \
-	mov rr[_tmp1]=_tmp2
-
-	__PAGE_ALIGNED_DATA
-
-	.global empty_zero_page
-EXPORT_SYMBOL_GPL(empty_zero_page)
-empty_zero_page:
-	.skip PAGE_SIZE
-
-	.global swapper_pg_dir
-swapper_pg_dir:
-	.skip PAGE_SIZE
-
-	.rodata
-halt_msg:
-	stringz "Halting kernel\n"
-
-	__REF
-
-	.global start_ap
-
-	/*
-	 * Start the kernel.  When the bootloader passes control to _start(), r28
-	 * points to the address of the boot parameter area.  Execution reaches
-	 * here in physical mode.
-	 */
-GLOBAL_ENTRY(_start)
-start_ap:
-	.prologue
-	.save rp, r0		// terminate unwind chain with a NULL rp
-	.body
-
-	rsm psr.i | psr.ic
-	;;
-	srlz.i
-	;;
- {
-	flushrs				// must be first insn in group
-	srlz.i
- }
-	;;
-	/*
-	 * Save the region registers, predicate before they get clobbered
-	 */
-	SAVE_REGION_REGS(r2, r8,r9,r10,r11,r12,r13,r14,r15);
-	mov r25=pr;;
-
-	/*
-	 * Initialize kernel region registers:
-	 *	rr[0]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[1]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[2]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[3]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[4]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[5]: VHPT enabled, page size = PAGE_SHIFT
-	 *	rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT
-	 *	rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT
-	 * We initialize all of them to prevent inadvertently assuming
-	 * something about the state of address translation early in boot.
-	 */
-	SET_ONE_RR(0, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(1, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(2, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(3, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(4, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(5, PAGE_SHIFT, r2, r16, 1);;
-	SET_ONE_RR(6, IA64_GRANULE_SHIFT, r2, r16, 0);;
-	SET_ONE_RR(7, IA64_GRANULE_SHIFT, r2, r16, 0);;
-	/*
-	 * Now pin mappings into the TLB for kernel text and data
-	 */
-	mov r18=KERNEL_TR_PAGE_SHIFT<<2
-	movl r17=KERNEL_START
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r17
-	mov r16=IA64_TR_KERNEL
-	mov r3=ip
-	movl r18=PAGE_KERNEL
-	;;
-	dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT
-	;;
-	or r18=r2,r18
-	;;
-	srlz.i
-	;;
-	itr.i itr[r16]=r18
-	;;
-	itr.d dtr[r16]=r18
-	;;
-	srlz.i
-
-	/*
-	 * Switch into virtual mode:
-	 */
-	movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \
-		  |IA64_PSR_DI)
-	;;
-	mov cr.ipsr=r16
-	movl r17=1f
-	;;
-	mov cr.iip=r17
-	mov cr.ifs=r0
-	;;
-	rfi
-	;;
-1:	// now we are in virtual mode
-
-	SET_AREA_FOR_BOOTING_CPU(r2, r16);
-
-	STORE_REGION_REGS(r16, r8,r9,r10,r11,r12,r13,r14,r15);
-	SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(r16,r17,r25)
-	;;
-
-	// set IVT entry point---can't access I/O ports without it
-	movl r3=ia64_ivt
-	;;
-	mov cr.iva=r3
-	movl r2=FPSR_DEFAULT
-	;;
-	srlz.i
-	movl gp=__gp
-
-	mov ar.fpsr=r2
-	;;
-
-#define isAP	p2	// are we an Application Processor?
-#define isBP	p3	// are we the Bootstrap Processor?
-
-#ifdef CONFIG_SMP
-	/*
-	 * Find the init_task for the currently booting CPU.  At poweron, and in
-	 * UP mode, task_for_booting_cpu is NULL.
-	 */
-	movl r3=task_for_booting_cpu
- 	;;
-	ld8 r3=[r3]
-	movl r2=init_task
-	;;
-	cmp.eq isBP,isAP=r3,r0
-	;;
-(isAP)	mov r2=r3
-#else
-	movl r2=init_task
-	cmp.eq isBP,isAP=r0,r0
-#endif
-	;;
-	tpa r3=r2		// r3 == phys addr of task struct
-	mov r16=-1
-(isBP)	br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it
-
-	// load mapping for stack (virtaddr in r2, physaddr in r3)
-	rsm psr.ic
-	movl r17=PAGE_KERNEL
-	;;
-	srlz.d
-	dep r18=0,r3,0,12
-	;;
-	or r18=r17,r18
-	dep r2=-1,r3,61,3	// IMVA of task
-	;;
-	mov r17=rr[r2]
-	shr.u r16=r3,IA64_GRANULE_SHIFT
-	;;
-	dep r17=0,r17,8,24
-	;;
-	mov cr.itir=r17
-	mov cr.ifa=r2
-
-	mov r19=IA64_TR_CURRENT_STACK
-	;;
-	itr.d dtr[r19]=r18
-	;;
-	ssm psr.ic
-	srlz.d
-  	;;
-
-.load_current:
-	// load the "current" pointer (r13) and ar.k6 with the current task
-	mov IA64_KR(CURRENT)=r2		// virtual address
-	mov IA64_KR(CURRENT_STACK)=r16
-	mov r13=r2
-	/*
-	 * Reserve space at the top of the stack for "struct pt_regs".  Kernel
-	 * threads don't store interesting values in that structure, but the space
-	 * still needs to be there because time-critical stuff such as the context
-	 * switching can be implemented more efficiently (for example, __switch_to()
-	 * always sets the psr.dfh bit of the task it is switching to).
-	 */
-
-	addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2
-	addl r2=IA64_RBS_OFFSET,r2	// initialize the RSE
-	mov ar.rsc=0		// place RSE in enforced lazy mode
-	;;
-	loadrs			// clear the dirty partition
-	movl r19=__phys_per_cpu_start
-	mov r18=PERCPU_PAGE_SIZE
-	;;
-#ifndef CONFIG_SMP
-	add r19=r19,r18
-	;;
-#else
-(isAP)	br.few 2f
-	movl r20=__cpu0_per_cpu
-	;;
-	shr.u r18=r18,3
-1:
-	ld8 r21=[r19],8;;
-	st8[r20]=r21,8
-	adds r18=-1,r18;;
-	cmp4.lt p7,p6=0,r18
-(p7)	br.cond.dptk.few 1b
-	mov r19=r20
-	;;
-2:
-#endif
-	tpa r19=r19
-	;;
-	.pred.rel.mutex isBP,isAP
-(isBP)	mov IA64_KR(PER_CPU_DATA)=r19	// per-CPU base for cpu0
-(isAP)	mov IA64_KR(PER_CPU_DATA)=r0	// clear physical per-CPU base
-	;;
-	mov ar.bspstore=r2	// establish the new RSE stack
-	;;
-	mov ar.rsc=0x3		// place RSE in eager mode
-
-(isBP)	dep r28=-1,r28,61,3	// make address virtual
-(isBP)	movl r2=ia64_boot_param
-	;;
-(isBP)	st8 [r2]=r28		// save the address of the boot param area passed by the bootloader
-
-#ifdef CONFIG_SMP
-(isAP)	br.call.sptk.many rp=start_secondary
-.ret0:
-(isAP)	br.cond.sptk self
-#endif
-
-	// This is executed by the bootstrap processor (bsp) only:
-
-	br.call.sptk.many rp=start_kernel
-.ret2:	addl r3=@ltoff(halt_msg),gp
-	;;
-	alloc r2=ar.pfs,8,0,2,0
-	;;
-	ld8 out0=[r3]
-	br.call.sptk.many b0=console_print
-
-self:	hint @pause
-	br.sptk.many self		// endless loop
-END(_start)
-
-	.text
-
-GLOBAL_ENTRY(ia64_save_debug_regs)
-	alloc r16=ar.pfs,1,0,0,0
-	mov r20=ar.lc			// preserve ar.lc
-	mov ar.lc=IA64_NUM_DBG_REGS-1
-	mov r18=0
-	add r19=IA64_NUM_DBG_REGS*8,in0
-	;;
-1:	mov r16=dbr[r18]
-#ifdef CONFIG_ITANIUM
-	;;
-	srlz.d
-#endif
-	mov r17=ibr[r18]
-	add r18=1,r18
-	;;
-	st8.nta [in0]=r16,8
-	st8.nta [r19]=r17,8
-	br.cloop.sptk.many 1b
-	;;
-	mov ar.lc=r20			// restore ar.lc
-	br.ret.sptk.many rp
-END(ia64_save_debug_regs)
-
-GLOBAL_ENTRY(ia64_load_debug_regs)
-	alloc r16=ar.pfs,1,0,0,0
-	lfetch.nta [in0]
-	mov r20=ar.lc			// preserve ar.lc
-	add r19=IA64_NUM_DBG_REGS*8,in0
-	mov ar.lc=IA64_NUM_DBG_REGS-1
-	mov r18=-1
-	;;
-1:	ld8.nta r16=[in0],8
-	ld8.nta r17=[r19],8
-	add r18=1,r18
-	;;
-	mov dbr[r18]=r16
-#ifdef CONFIG_ITANIUM
-	;;
-	srlz.d				// Errata 132 (NoFix status)
-#endif
-	mov ibr[r18]=r17
-	br.cloop.sptk.many 1b
-	;;
-	mov ar.lc=r20			// restore ar.lc
-	br.ret.sptk.many rp
-END(ia64_load_debug_regs)
-
-GLOBAL_ENTRY(__ia64_save_fpu)
-	alloc r2=ar.pfs,1,4,0,0
-	adds loc0=96*16-16,in0
-	adds loc1=96*16-16-128,in0
-	;;
-	stf.spill.nta [loc0]=f127,-256
-	stf.spill.nta [loc1]=f119,-256
-	;;
-	stf.spill.nta [loc0]=f111,-256
-	stf.spill.nta [loc1]=f103,-256
-	;;
-	stf.spill.nta [loc0]=f95,-256
-	stf.spill.nta [loc1]=f87,-256
-	;;
-	stf.spill.nta [loc0]=f79,-256
-	stf.spill.nta [loc1]=f71,-256
-	;;
-	stf.spill.nta [loc0]=f63,-256
-	stf.spill.nta [loc1]=f55,-256
-	adds loc2=96*16-32,in0
-	;;
-	stf.spill.nta [loc0]=f47,-256
-	stf.spill.nta [loc1]=f39,-256
-	adds loc3=96*16-32-128,in0
-	;;
-	stf.spill.nta [loc2]=f126,-256
-	stf.spill.nta [loc3]=f118,-256
-	;;
-	stf.spill.nta [loc2]=f110,-256
-	stf.spill.nta [loc3]=f102,-256
-	;;
-	stf.spill.nta [loc2]=f94,-256
-	stf.spill.nta [loc3]=f86,-256
-	;;
-	stf.spill.nta [loc2]=f78,-256
-	stf.spill.nta [loc3]=f70,-256
-	;;
-	stf.spill.nta [loc2]=f62,-256
-	stf.spill.nta [loc3]=f54,-256
-	adds loc0=96*16-48,in0
-	;;
-	stf.spill.nta [loc2]=f46,-256
-	stf.spill.nta [loc3]=f38,-256
-	adds loc1=96*16-48-128,in0
-	;;
-	stf.spill.nta [loc0]=f125,-256
-	stf.spill.nta [loc1]=f117,-256
-	;;
-	stf.spill.nta [loc0]=f109,-256
-	stf.spill.nta [loc1]=f101,-256
-	;;
-	stf.spill.nta [loc0]=f93,-256
-	stf.spill.nta [loc1]=f85,-256
-	;;
-	stf.spill.nta [loc0]=f77,-256
-	stf.spill.nta [loc1]=f69,-256
-	;;
-	stf.spill.nta [loc0]=f61,-256
-	stf.spill.nta [loc1]=f53,-256
-	adds loc2=96*16-64,in0
-	;;
-	stf.spill.nta [loc0]=f45,-256
-	stf.spill.nta [loc1]=f37,-256
-	adds loc3=96*16-64-128,in0
-	;;
-	stf.spill.nta [loc2]=f124,-256
-	stf.spill.nta [loc3]=f116,-256
-	;;
-	stf.spill.nta [loc2]=f108,-256
-	stf.spill.nta [loc3]=f100,-256
-	;;
-	stf.spill.nta [loc2]=f92,-256
-	stf.spill.nta [loc3]=f84,-256
-	;;
-	stf.spill.nta [loc2]=f76,-256
-	stf.spill.nta [loc3]=f68,-256
-	;;
-	stf.spill.nta [loc2]=f60,-256
-	stf.spill.nta [loc3]=f52,-256
-	adds loc0=96*16-80,in0
-	;;
-	stf.spill.nta [loc2]=f44,-256
-	stf.spill.nta [loc3]=f36,-256
-	adds loc1=96*16-80-128,in0
-	;;
-	stf.spill.nta [loc0]=f123,-256
-	stf.spill.nta [loc1]=f115,-256
-	;;
-	stf.spill.nta [loc0]=f107,-256
-	stf.spill.nta [loc1]=f99,-256
-	;;
-	stf.spill.nta [loc0]=f91,-256
-	stf.spill.nta [loc1]=f83,-256
-	;;
-	stf.spill.nta [loc0]=f75,-256
-	stf.spill.nta [loc1]=f67,-256
-	;;
-	stf.spill.nta [loc0]=f59,-256
-	stf.spill.nta [loc1]=f51,-256
-	adds loc2=96*16-96,in0
-	;;
-	stf.spill.nta [loc0]=f43,-256
-	stf.spill.nta [loc1]=f35,-256
-	adds loc3=96*16-96-128,in0
-	;;
-	stf.spill.nta [loc2]=f122,-256
-	stf.spill.nta [loc3]=f114,-256
-	;;
-	stf.spill.nta [loc2]=f106,-256
-	stf.spill.nta [loc3]=f98,-256
-	;;
-	stf.spill.nta [loc2]=f90,-256
-	stf.spill.nta [loc3]=f82,-256
-	;;
-	stf.spill.nta [loc2]=f74,-256
-	stf.spill.nta [loc3]=f66,-256
-	;;
-	stf.spill.nta [loc2]=f58,-256
-	stf.spill.nta [loc3]=f50,-256
-	adds loc0=96*16-112,in0
-	;;
-	stf.spill.nta [loc2]=f42,-256
-	stf.spill.nta [loc3]=f34,-256
-	adds loc1=96*16-112-128,in0
-	;;
-	stf.spill.nta [loc0]=f121,-256
-	stf.spill.nta [loc1]=f113,-256
-	;;
-	stf.spill.nta [loc0]=f105,-256
-	stf.spill.nta [loc1]=f97,-256
-	;;
-	stf.spill.nta [loc0]=f89,-256
-	stf.spill.nta [loc1]=f81,-256
-	;;
-	stf.spill.nta [loc0]=f73,-256
-	stf.spill.nta [loc1]=f65,-256
-	;;
-	stf.spill.nta [loc0]=f57,-256
-	stf.spill.nta [loc1]=f49,-256
-	adds loc2=96*16-128,in0
-	;;
-	stf.spill.nta [loc0]=f41,-256
-	stf.spill.nta [loc1]=f33,-256
-	adds loc3=96*16-128-128,in0
-	;;
-	stf.spill.nta [loc2]=f120,-256
-	stf.spill.nta [loc3]=f112,-256
-	;;
-	stf.spill.nta [loc2]=f104,-256
-	stf.spill.nta [loc3]=f96,-256
-	;;
-	stf.spill.nta [loc2]=f88,-256
-	stf.spill.nta [loc3]=f80,-256
-	;;
-	stf.spill.nta [loc2]=f72,-256
-	stf.spill.nta [loc3]=f64,-256
-	;;
-	stf.spill.nta [loc2]=f56,-256
-	stf.spill.nta [loc3]=f48,-256
-	;;
-	stf.spill.nta [loc2]=f40
-	stf.spill.nta [loc3]=f32
-	br.ret.sptk.many rp
-END(__ia64_save_fpu)
-
-GLOBAL_ENTRY(__ia64_load_fpu)
-	alloc r2=ar.pfs,1,2,0,0
-	adds r3=128,in0
-	adds r14=256,in0
-	adds r15=384,in0
-	mov loc0=512
-	mov loc1=-1024+16
-	;;
-	ldf.fill.nta f32=[in0],loc0
-	ldf.fill.nta f40=[ r3],loc0
-	ldf.fill.nta f48=[r14],loc0
-	ldf.fill.nta f56=[r15],loc0
-	;;
-	ldf.fill.nta f64=[in0],loc0
-	ldf.fill.nta f72=[ r3],loc0
-	ldf.fill.nta f80=[r14],loc0
-	ldf.fill.nta f88=[r15],loc0
-	;;
-	ldf.fill.nta f96=[in0],loc1
-	ldf.fill.nta f104=[ r3],loc1
-	ldf.fill.nta f112=[r14],loc1
-	ldf.fill.nta f120=[r15],loc1
-	;;
-	ldf.fill.nta f33=[in0],loc0
-	ldf.fill.nta f41=[ r3],loc0
-	ldf.fill.nta f49=[r14],loc0
-	ldf.fill.nta f57=[r15],loc0
-	;;
-	ldf.fill.nta f65=[in0],loc0
-	ldf.fill.nta f73=[ r3],loc0
-	ldf.fill.nta f81=[r14],loc0
-	ldf.fill.nta f89=[r15],loc0
-	;;
-	ldf.fill.nta f97=[in0],loc1
-	ldf.fill.nta f105=[ r3],loc1
-	ldf.fill.nta f113=[r14],loc1
-	ldf.fill.nta f121=[r15],loc1
-	;;
-	ldf.fill.nta f34=[in0],loc0
-	ldf.fill.nta f42=[ r3],loc0
-	ldf.fill.nta f50=[r14],loc0
-	ldf.fill.nta f58=[r15],loc0
-	;;
-	ldf.fill.nta f66=[in0],loc0
-	ldf.fill.nta f74=[ r3],loc0
-	ldf.fill.nta f82=[r14],loc0
-	ldf.fill.nta f90=[r15],loc0
-	;;
-	ldf.fill.nta f98=[in0],loc1
-	ldf.fill.nta f106=[ r3],loc1
-	ldf.fill.nta f114=[r14],loc1
-	ldf.fill.nta f122=[r15],loc1
-	;;
-	ldf.fill.nta f35=[in0],loc0
-	ldf.fill.nta f43=[ r3],loc0
-	ldf.fill.nta f51=[r14],loc0
-	ldf.fill.nta f59=[r15],loc0
-	;;
-	ldf.fill.nta f67=[in0],loc0
-	ldf.fill.nta f75=[ r3],loc0
-	ldf.fill.nta f83=[r14],loc0
-	ldf.fill.nta f91=[r15],loc0
-	;;
-	ldf.fill.nta f99=[in0],loc1
-	ldf.fill.nta f107=[ r3],loc1
-	ldf.fill.nta f115=[r14],loc1
-	ldf.fill.nta f123=[r15],loc1
-	;;
-	ldf.fill.nta f36=[in0],loc0
-	ldf.fill.nta f44=[ r3],loc0
-	ldf.fill.nta f52=[r14],loc0
-	ldf.fill.nta f60=[r15],loc0
-	;;
-	ldf.fill.nta f68=[in0],loc0
-	ldf.fill.nta f76=[ r3],loc0
-	ldf.fill.nta f84=[r14],loc0
-	ldf.fill.nta f92=[r15],loc0
-	;;
-	ldf.fill.nta f100=[in0],loc1
-	ldf.fill.nta f108=[ r3],loc1
-	ldf.fill.nta f116=[r14],loc1
-	ldf.fill.nta f124=[r15],loc1
-	;;
-	ldf.fill.nta f37=[in0],loc0
-	ldf.fill.nta f45=[ r3],loc0
-	ldf.fill.nta f53=[r14],loc0
-	ldf.fill.nta f61=[r15],loc0
-	;;
-	ldf.fill.nta f69=[in0],loc0
-	ldf.fill.nta f77=[ r3],loc0
-	ldf.fill.nta f85=[r14],loc0
-	ldf.fill.nta f93=[r15],loc0
-	;;
-	ldf.fill.nta f101=[in0],loc1
-	ldf.fill.nta f109=[ r3],loc1
-	ldf.fill.nta f117=[r14],loc1
-	ldf.fill.nta f125=[r15],loc1
-	;;
-	ldf.fill.nta f38 =[in0],loc0
-	ldf.fill.nta f46 =[ r3],loc0
-	ldf.fill.nta f54 =[r14],loc0
-	ldf.fill.nta f62 =[r15],loc0
-	;;
-	ldf.fill.nta f70 =[in0],loc0
-	ldf.fill.nta f78 =[ r3],loc0
-	ldf.fill.nta f86 =[r14],loc0
-	ldf.fill.nta f94 =[r15],loc0
-	;;
-	ldf.fill.nta f102=[in0],loc1
-	ldf.fill.nta f110=[ r3],loc1
-	ldf.fill.nta f118=[r14],loc1
-	ldf.fill.nta f126=[r15],loc1
-	;;
-	ldf.fill.nta f39 =[in0],loc0
-	ldf.fill.nta f47 =[ r3],loc0
-	ldf.fill.nta f55 =[r14],loc0
-	ldf.fill.nta f63 =[r15],loc0
-	;;
-	ldf.fill.nta f71 =[in0],loc0
-	ldf.fill.nta f79 =[ r3],loc0
-	ldf.fill.nta f87 =[r14],loc0
-	ldf.fill.nta f95 =[r15],loc0
-	;;
-	ldf.fill.nta f103=[in0]
-	ldf.fill.nta f111=[ r3]
-	ldf.fill.nta f119=[r14]
-	ldf.fill.nta f127=[r15]
-	br.ret.sptk.many rp
-END(__ia64_load_fpu)
-
-GLOBAL_ENTRY(__ia64_init_fpu)
-	stf.spill [sp]=f0		// M3
-	mov	 f32=f0			// F
-	nop.b	 0
-
-	ldfps	 f33,f34=[sp]		// M0
-	ldfps	 f35,f36=[sp]		// M1
-	mov      f37=f0			// F
-	;;
-
-	setf.s	 f38=r0			// M2
-	setf.s	 f39=r0			// M3
-	mov      f40=f0			// F
-
-	ldfps	 f41,f42=[sp]		// M0
-	ldfps	 f43,f44=[sp]		// M1
-	mov      f45=f0			// F
-
-	setf.s	 f46=r0			// M2
-	setf.s	 f47=r0			// M3
-	mov      f48=f0			// F
-
-	ldfps	 f49,f50=[sp]		// M0
-	ldfps	 f51,f52=[sp]		// M1
-	mov      f53=f0			// F
-
-	setf.s	 f54=r0			// M2
-	setf.s	 f55=r0			// M3
-	mov      f56=f0			// F
-
-	ldfps	 f57,f58=[sp]		// M0
-	ldfps	 f59,f60=[sp]		// M1
-	mov      f61=f0			// F
-
-	setf.s	 f62=r0			// M2
-	setf.s	 f63=r0			// M3
-	mov      f64=f0			// F
-
-	ldfps	 f65,f66=[sp]		// M0
-	ldfps	 f67,f68=[sp]		// M1
-	mov      f69=f0			// F
-
-	setf.s	 f70=r0			// M2
-	setf.s	 f71=r0			// M3
-	mov      f72=f0			// F
-
-	ldfps	 f73,f74=[sp]		// M0
-	ldfps	 f75,f76=[sp]		// M1
-	mov      f77=f0			// F
-
-	setf.s	 f78=r0			// M2
-	setf.s	 f79=r0			// M3
-	mov      f80=f0			// F
-
-	ldfps	 f81,f82=[sp]		// M0
-	ldfps	 f83,f84=[sp]		// M1
-	mov      f85=f0			// F
-
-	setf.s	 f86=r0			// M2
-	setf.s	 f87=r0			// M3
-	mov      f88=f0			// F
-
-	/*
-	 * When the instructions are cached, it would be faster to initialize
-	 * the remaining registers with simply mov instructions (F-unit).
-	 * This gets the time down to ~29 cycles.  However, this would use up
-	 * 33 bundles, whereas continuing with the above pattern yields
-	 * 10 bundles and ~30 cycles.
-	 */
-
-	ldfps	 f89,f90=[sp]		// M0
-	ldfps	 f91,f92=[sp]		// M1
-	mov      f93=f0			// F
-
-	setf.s	 f94=r0			// M2
-	setf.s	 f95=r0			// M3
-	mov      f96=f0			// F
-
-	ldfps	 f97,f98=[sp]		// M0
-	ldfps	 f99,f100=[sp]		// M1
-	mov      f101=f0		// F
-
-	setf.s	 f102=r0		// M2
-	setf.s	 f103=r0		// M3
-	mov      f104=f0		// F
-
-	ldfps	 f105,f106=[sp]		// M0
-	ldfps	 f107,f108=[sp]		// M1
-	mov      f109=f0		// F
-
-	setf.s	 f110=r0		// M2
-	setf.s	 f111=r0		// M3
-	mov      f112=f0		// F
-
-	ldfps	 f113,f114=[sp]		// M0
-	ldfps	 f115,f116=[sp]		// M1
-	mov      f117=f0		// F
-
-	setf.s	 f118=r0		// M2
-	setf.s	 f119=r0		// M3
-	mov      f120=f0		// F
-
-	ldfps	 f121,f122=[sp]		// M0
-	ldfps	 f123,f124=[sp]		// M1
-	mov      f125=f0		// F
-
-	setf.s	 f126=r0		// M2
-	setf.s	 f127=r0		// M3
-	br.ret.sptk.many rp		// F
-END(__ia64_init_fpu)
-
-/*
- * Switch execution mode from virtual to physical
- *
- * Inputs:
- *	r16 = new psr to establish
- * Output:
- *	r19 = old virtual address of ar.bsp
- *	r20 = old virtual address of sp
- *
- * Note: RSE must already be in enforced lazy mode
- */
-GLOBAL_ENTRY(ia64_switch_mode_phys)
- {
-	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
-	mov r15=ip
- }
-	;;
- {
-	flushrs				// must be first insn in group
-	srlz.i
- }
-	;;
-	mov cr.ipsr=r16			// set new PSR
-	add r3=1f-ia64_switch_mode_phys,r15
-
-	mov r19=ar.bsp
-	mov r20=sp
-	mov r14=rp			// get return address into a general register
-	;;
-
-	// going to physical mode, use tpa to translate virt->phys
-	tpa r17=r19
-	tpa r3=r3
-	tpa sp=sp
-	tpa r14=r14
-	;;
-
-	mov r18=ar.rnat			// save ar.rnat
-	mov ar.bspstore=r17		// this steps on ar.rnat
-	mov cr.iip=r3
-	mov cr.ifs=r0
-	;;
-	mov ar.rnat=r18			// restore ar.rnat
-	rfi				// must be last insn in group
-	;;
-1:	mov rp=r14
-	br.ret.sptk.many rp
-END(ia64_switch_mode_phys)
-
-/*
- * Switch execution mode from physical to virtual
- *
- * Inputs:
- *	r16 = new psr to establish
- *	r19 = new bspstore to establish
- *	r20 = new sp to establish
- *
- * Note: RSE must already be in enforced lazy mode
- */
-GLOBAL_ENTRY(ia64_switch_mode_virt)
- {
-	rsm psr.i | psr.ic		// disable interrupts and interrupt collection
-	mov r15=ip
- }
-	;;
- {
-	flushrs				// must be first insn in group
-	srlz.i
- }
-	;;
-	mov cr.ipsr=r16			// set new PSR
-	add r3=1f-ia64_switch_mode_virt,r15
-
-	mov r14=rp			// get return address into a general register
-	;;
-
-	// going to virtual
-	//   - for code addresses, set upper bits of addr to KERNEL_START
-	//   - for stack addresses, copy from input argument
-	movl r18=KERNEL_START
-	dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
-	dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT
-	mov sp=r20
-	;;
-	or r3=r3,r18
-	or r14=r14,r18
-	;;
-
-	mov r18=ar.rnat			// save ar.rnat
-	mov ar.bspstore=r19		// this steps on ar.rnat
-	mov cr.iip=r3
-	mov cr.ifs=r0
-	;;
-	mov ar.rnat=r18			// restore ar.rnat
-	rfi				// must be last insn in group
-	;;
-1:	mov rp=r14
-	br.ret.sptk.many rp
-END(ia64_switch_mode_virt)
-
-GLOBAL_ENTRY(ia64_delay_loop)
-	.prologue
-{	nop 0			// work around GAS unwind info generation bug...
-	.save ar.lc,r2
-	mov r2=ar.lc
-	.body
-	;;
-	mov ar.lc=r32
-}
-	;;
-	// force loop to be 32-byte aligned (GAS bug means we cannot use .align
-	// inside function body without corrupting unwind info).
-{	nop 0 }
-1:	br.cloop.sptk.few 1b
-	;;
-	mov ar.lc=r2
-	br.ret.sptk.many rp
-END(ia64_delay_loop)
-
-/*
- * Return a CPU-local timestamp in nano-seconds.  This timestamp is
- * NOT synchronized across CPUs its return value must never be
- * compared against the values returned on another CPU.  The usage in
- * kernel/sched/core.c ensures that.
- *
- * The return-value of sched_clock() is NOT supposed to wrap-around.
- * If it did, it would cause some scheduling hiccups (at the worst).
- * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even
- * that would happen only once every 5+ years.
- *
- * The code below basically calculates:
- *
- *   (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT
- *
- * except that the multiplication and the shift are done with 128-bit
- * intermediate precision so that we can produce a full 64-bit result.
- */
-GLOBAL_ENTRY(ia64_native_sched_clock)
-	addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
-	mov.m r9=ar.itc		// fetch cycle-counter				(35 cyc)
-	;;
-	ldf8 f8=[r8]
-	;;
-	setf.sig f9=r9		// certain to stall, so issue it _after_ ldf8...
-	;;
-	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
-	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
-	;;
-	getf.sig r8=f10		//						(5 cyc)
-	getf.sig r9=f11
-	;;
-	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
-	br.ret.sptk.many rp
-END(ia64_native_sched_clock)
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-GLOBAL_ENTRY(cycle_to_nsec)
-	alloc r16=ar.pfs,1,0,0,0
-	addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
-	;;
-	ldf8 f8=[r8]
-	;;
-	setf.sig f9=r32
-	;;
-	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
-	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
-	;;
-	getf.sig r8=f10		//						(5 cyc)
-	getf.sig r9=f11
-	;;
-	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
-	br.ret.sptk.many rp
-END(cycle_to_nsec)
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-#ifdef CONFIG_IA64_BRL_EMU
-
-/*
- *  Assembly routines used by brl_emu.c to set preserved register state.
- */
-
-#define SET_REG(reg)				\
- GLOBAL_ENTRY(ia64_set_##reg);			\
-	alloc r16=ar.pfs,1,0,0,0;		\
-	mov reg=r32;				\
-	;;					\
-	br.ret.sptk.many rp;			\
- END(ia64_set_##reg)
-
-SET_REG(b1);
-SET_REG(b2);
-SET_REG(b3);
-SET_REG(b4);
-SET_REG(b5);
-
-#endif /* CONFIG_IA64_BRL_EMU */
-
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_HOTPLUG_CPU
-GLOBAL_ENTRY(ia64_jump_to_sal)
-	alloc r16=ar.pfs,1,0,0,0;;
-	rsm psr.i  | psr.ic
-{
-	flushrs
-	srlz.i
-}
-	tpa r25=in0
-	movl r18=tlb_purge_done;;
-	DATA_VA_TO_PA(r18);;
-	mov b1=r18 	// Return location
-	movl r18=ia64_do_tlb_purge;;
-	DATA_VA_TO_PA(r18);;
-	mov b2=r18 	// doing tlb_flush work
-	mov ar.rsc=0  // Put RSE  in enforced lazy, LE mode
-	movl r17=1f;;
-	DATA_VA_TO_PA(r17);;
-	mov cr.iip=r17
-	movl r16=SAL_PSR_BITS_TO_SET;;
-	mov cr.ipsr=r16
-	mov cr.ifs=r0;;
-	rfi;;			// note: this unmask MCA/INIT (psr.mc)
-1:
-	/*
-	 * Invalidate all TLB data/inst
-	 */
-	br.sptk.many b2;; // jump to tlb purge code
-
-tlb_purge_done:
-	RESTORE_REGION_REGS(r25, r17,r18,r19);;
-	RESTORE_REG(b0, r25, r17);;
-	RESTORE_REG(b1, r25, r17);;
-	RESTORE_REG(b2, r25, r17);;
-	RESTORE_REG(b3, r25, r17);;
-	RESTORE_REG(b4, r25, r17);;
-	RESTORE_REG(b5, r25, r17);;
-	ld8 r1=[r25],0x08;;
-	ld8 r12=[r25],0x08;;
-	ld8 r13=[r25],0x08;;
-	RESTORE_REG(ar.fpsr, r25, r17);;
-	RESTORE_REG(ar.pfs, r25, r17);;
-	RESTORE_REG(ar.rnat, r25, r17);;
-	RESTORE_REG(ar.unat, r25, r17);;
-	RESTORE_REG(ar.bspstore, r25, r17);;
-	RESTORE_REG(cr.dcr, r25, r17);;
-	RESTORE_REG(cr.iva, r25, r17);;
-	RESTORE_REG(cr.pta, r25, r17);;
-	srlz.d;;	// required not to violate RAW dependency
-	RESTORE_REG(cr.itv, r25, r17);;
-	RESTORE_REG(cr.pmv, r25, r17);;
-	RESTORE_REG(cr.cmcv, r25, r17);;
-	RESTORE_REG(cr.lrr0, r25, r17);;
-	RESTORE_REG(cr.lrr1, r25, r17);;
-	ld8 r4=[r25],0x08;;
-	ld8 r5=[r25],0x08;;
-	ld8 r6=[r25],0x08;;
-	ld8 r7=[r25],0x08;;
-	ld8 r17=[r25],0x08;;
-	mov pr=r17,-1;;
-	RESTORE_REG(ar.lc, r25, r17);;
-	/*
-	 * Now Restore floating point regs
-	 */
-	ldf.fill.nta f2=[r25],16;;
-	ldf.fill.nta f3=[r25],16;;
-	ldf.fill.nta f4=[r25],16;;
-	ldf.fill.nta f5=[r25],16;;
-	ldf.fill.nta f16=[r25],16;;
-	ldf.fill.nta f17=[r25],16;;
-	ldf.fill.nta f18=[r25],16;;
-	ldf.fill.nta f19=[r25],16;;
-	ldf.fill.nta f20=[r25],16;;
-	ldf.fill.nta f21=[r25],16;;
-	ldf.fill.nta f22=[r25],16;;
-	ldf.fill.nta f23=[r25],16;;
-	ldf.fill.nta f24=[r25],16;;
-	ldf.fill.nta f25=[r25],16;;
-	ldf.fill.nta f26=[r25],16;;
-	ldf.fill.nta f27=[r25],16;;
-	ldf.fill.nta f28=[r25],16;;
-	ldf.fill.nta f29=[r25],16;;
-	ldf.fill.nta f30=[r25],16;;
-	ldf.fill.nta f31=[r25],16;;
-
-	/*
-	 * Now that we have done all the register restores
-	 * we are now ready for the big DIVE to SAL Land
-	 */
-	ssm psr.ic;;
-	srlz.d;;
-	br.ret.sptk.many b0;;
-END(ia64_jump_to_sal)
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#endif /* CONFIG_SMP */
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
deleted file mode 100644
index 99300850abc1..000000000000
--- a/arch/ia64/kernel/iosapic.c
+++ /dev/null
@@ -1,1137 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * I/O SAPIC support.
- *
- * Copyright (C) 1999 Intel Corp.
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 2000-2002 J.I. Lee <jung-ik.lee@intel.com>
- * Copyright (C) 1999-2000, 2002-2003 Hewlett-Packard Co.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com>
- *
- * 00/04/19	D. Mosberger	Rewritten to mirror more closely the x86 I/O
- *				APIC code.  In particular, we now have separate
- *				handlers for edge and level triggered
- *				interrupts.
- * 00/10/27	Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector
- *				allocation PCI to vector mapping, shared PCI
- *				interrupts.
- * 00/10/27	D. Mosberger	Document things a bit more to make them more
- *				understandable.  Clean up much of the old
- *				IOSAPIC cruft.
- * 01/07/27	J.I. Lee	PCI irq routing, Platform/Legacy interrupts
- *				and fixes for ACPI S5(SoftOff) support.
- * 02/01/23	J.I. Lee	iosapic pgm fixes for PCI irq routing from _PRT
- * 02/01/07     E. Focht        <efocht@ess.nec.de> Redirectable interrupt
- *				vectors in iosapic_set_affinity(),
- *				initializations for /proc/irq/#/smp_affinity
- * 02/04/02	P. Diefenbaugh	Cleaned up ACPI PCI IRQ routing.
- * 02/04/18	J.I. Lee	bug fix in iosapic_init_pci_irq
- * 02/04/30	J.I. Lee	bug fix in find_iosapic to fix ACPI PCI IRQ to
- *				IOSAPIC mapping error
- * 02/07/29	T. Kochi	Allocate interrupt vectors dynamically
- * 02/08/04	T. Kochi	Cleaned up terminology (irq, global system
- *				interrupt, vector, etc.)
- * 02/09/20	D. Mosberger	Simplified by taking advantage of ACPI's
- *				pci_irq code.
- * 03/02/19	B. Helgaas	Make pcat_compat system-wide, not per-IOSAPIC.
- *				Remove iosapic_address & gsi_base from
- *				external interfaces.  Rationalize
- *				__init/__devinit attributes.
- * 04/12/04 Ashok Raj	<ashok.raj@intel.com> Intel Corporation 2004
- *				Updated to work with irq migration necessary
- *				for CPU Hotplug
- */
-/*
- * Here is what the interrupt logic between a PCI device and the kernel looks
- * like:
- *
- * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC,
- *     INTD).  The device is uniquely identified by its bus-, and slot-number
- *     (the function number does not matter here because all functions share
- *     the same interrupt lines).
- *
- * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC
- *     controller.  Multiple interrupt lines may have to share the same
- *     IOSAPIC pin (if they're level triggered and use the same polarity).
- *     Each interrupt line has a unique Global System Interrupt (GSI) number
- *     which can be calculated as the sum of the controller's base GSI number
- *     and the IOSAPIC pin number to which the line connects.
- *
- * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the
- * IOSAPIC pin into the IA-64 interrupt vector.  This interrupt vector is then
- * sent to the CPU.
- *
- * (4) The kernel recognizes an interrupt as an IRQ.  The IRQ interface is
- *     used as architecture-independent interrupt handling mechanism in Linux.
- *     As an IRQ is a number, we have to have
- *     IA-64 interrupt vector number <-> IRQ number mapping.  On smaller
- *     systems, we use one-to-one mapping between IA-64 vector and IRQ.
- *
- * To sum up, there are three levels of mappings involved:
- *
- *	PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ
- *
- * Note: The term "IRQ" is loosely used everywhere in Linux kernel to
- * describe interrupts.  Now we use "IRQ" only for Linux IRQ's.  ISA IRQ
- * (isa_irq) is the only exception in this source code.
- */
-
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/memblock.h>
-
-#include <asm/delay.h>
-#include <asm/hw_irq.h>
-#include <asm/io.h>
-#include <asm/iosapic.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/xtp.h>
-
-#undef DEBUG_INTERRUPT_ROUTING
-
-#ifdef DEBUG_INTERRUPT_ROUTING
-#define DBG(fmt...)	printk(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
-static DEFINE_SPINLOCK(iosapic_lock);
-
-/*
- * These tables map IA-64 vectors to the IOSAPIC pin that generates this
- * vector.
- */
-
-#define NO_REF_RTE	0
-
-static struct iosapic {
-	char __iomem	*addr;		/* base address of IOSAPIC */
-	unsigned int	gsi_base;	/* GSI base */
-	unsigned short	num_rte;	/* # of RTEs on this IOSAPIC */
-	int		rtes_inuse;	/* # of RTEs in use on this IOSAPIC */
-#ifdef CONFIG_NUMA
-	unsigned short	node;		/* numa node association via pxm */
-#endif
-	spinlock_t	lock;		/* lock for indirect reg access */
-} iosapic_lists[NR_IOSAPICS];
-
-struct iosapic_rte_info {
-	struct list_head rte_list;	/* RTEs sharing the same vector */
-	char		rte_index;	/* IOSAPIC RTE index */
-	int		refcnt;		/* reference counter */
-	struct iosapic	*iosapic;
-} ____cacheline_aligned;
-
-static struct iosapic_intr_info {
-	struct list_head rtes;		/* RTEs using this vector (empty =>
-					 * not an IOSAPIC interrupt) */
-	int		count;		/* # of registered RTEs */
-	u32		low32;		/* current value of low word of
-					 * Redirection table entry */
-	unsigned int	dest;		/* destination CPU physical ID */
-	unsigned char	dmode	: 3;	/* delivery mode (see iosapic.h) */
-	unsigned char 	polarity: 1;	/* interrupt polarity
-					 * (see iosapic.h) */
-	unsigned char	trigger	: 1;	/* trigger mode (see iosapic.h) */
-} iosapic_intr_info[NR_IRQS];
-
-static unsigned char pcat_compat;	/* 8259 compatibility flag */
-
-static inline void
-iosapic_write(struct iosapic *iosapic, unsigned int reg, u32 val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&iosapic->lock, flags);
-	__iosapic_write(iosapic->addr, reg, val);
-	spin_unlock_irqrestore(&iosapic->lock, flags);
-}
-
-/*
- * Find an IOSAPIC associated with a GSI
- */
-static inline int
-find_iosapic (unsigned int gsi)
-{
-	int i;
-
-	for (i = 0; i < NR_IOSAPICS; i++) {
-		if ((unsigned) (gsi - iosapic_lists[i].gsi_base) <
-		    iosapic_lists[i].num_rte)
-			return i;
-	}
-
-	return -1;
-}
-
-static inline int __gsi_to_irq(unsigned int gsi)
-{
-	int irq;
-	struct iosapic_intr_info *info;
-	struct iosapic_rte_info *rte;
-
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		info = &iosapic_intr_info[irq];
-		list_for_each_entry(rte, &info->rtes, rte_list)
-			if (rte->iosapic->gsi_base + rte->rte_index == gsi)
-				return irq;
-	}
-	return -1;
-}
-
-int
-gsi_to_irq (unsigned int gsi)
-{
-	unsigned long flags;
-	int irq;
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	irq = __gsi_to_irq(gsi);
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-	return irq;
-}
-
-static struct iosapic_rte_info *find_rte(unsigned int irq, unsigned int gsi)
-{
-	struct iosapic_rte_info *rte;
-
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list)
-		if (rte->iosapic->gsi_base + rte->rte_index == gsi)
-			return rte;
-	return NULL;
-}
-
-static void
-set_rte (unsigned int gsi, unsigned int irq, unsigned int dest, int mask)
-{
-	unsigned long pol, trigger, dmode;
-	u32 low32, high32;
-	int rte_index;
-	char redir;
-	struct iosapic_rte_info *rte;
-	ia64_vector vector = irq_to_vector(irq);
-
-	DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest);
-
-	rte = find_rte(irq, gsi);
-	if (!rte)
-		return;		/* not an IOSAPIC interrupt */
-
-	rte_index = rte->rte_index;
-	pol     = iosapic_intr_info[irq].polarity;
-	trigger = iosapic_intr_info[irq].trigger;
-	dmode   = iosapic_intr_info[irq].dmode;
-
-	redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0;
-
-#ifdef CONFIG_SMP
-	set_irq_affinity_info(irq, (int)(dest & 0xffff), redir);
-#endif
-
-	low32 = ((pol << IOSAPIC_POLARITY_SHIFT) |
-		 (trigger << IOSAPIC_TRIGGER_SHIFT) |
-		 (dmode << IOSAPIC_DELIVERY_SHIFT) |
-		 ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) |
-		 vector);
-
-	/* dest contains both id and eid */
-	high32 = (dest << IOSAPIC_DEST_SHIFT);
-
-	iosapic_write(rte->iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
-	iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	iosapic_intr_info[irq].low32 = low32;
-	iosapic_intr_info[irq].dest = dest;
-}
-
-static void
-iosapic_nop (struct irq_data *data)
-{
-	/* do nothing... */
-}
-
-
-#ifdef CONFIG_KEXEC
-void
-kexec_disable_iosapic(void)
-{
-	struct iosapic_intr_info *info;
-	struct iosapic_rte_info *rte;
-	ia64_vector vec;
-	int irq;
-
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		info = &iosapic_intr_info[irq];
-		vec = irq_to_vector(irq);
-		list_for_each_entry(rte, &info->rtes,
-				rte_list) {
-			iosapic_write(rte->iosapic,
-					IOSAPIC_RTE_LOW(rte->rte_index),
-					IOSAPIC_MASK|vec);
-			iosapic_eoi(rte->iosapic->addr, vec);
-		}
-	}
-}
-#endif
-
-static void
-mask_irq (struct irq_data *data)
-{
-	unsigned int irq = data->irq;
-	u32 low32;
-	int rte_index;
-	struct iosapic_rte_info *rte;
-
-	if (!iosapic_intr_info[irq].count)
-		return;			/* not an IOSAPIC interrupt! */
-
-	/* set only the mask bit */
-	low32 = iosapic_intr_info[irq].low32 |= IOSAPIC_MASK;
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
-		rte_index = rte->rte_index;
-		iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	}
-}
-
-static void
-unmask_irq (struct irq_data *data)
-{
-	unsigned int irq = data->irq;
-	u32 low32;
-	int rte_index;
-	struct iosapic_rte_info *rte;
-
-	if (!iosapic_intr_info[irq].count)
-		return;			/* not an IOSAPIC interrupt! */
-
-	low32 = iosapic_intr_info[irq].low32 &= ~IOSAPIC_MASK;
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
-		rte_index = rte->rte_index;
-		iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	}
-}
-
-
-static int
-iosapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		     bool force)
-{
-#ifdef CONFIG_SMP
-	unsigned int irq = data->irq;
-	u32 high32, low32;
-	int cpu, dest, rte_index;
-	int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
-	struct iosapic_rte_info *rte;
-	struct iosapic *iosapic;
-
-	irq &= (~IA64_IRQ_REDIRECTED);
-
-	cpu = cpumask_first_and(cpu_online_mask, mask);
-	if (cpu >= nr_cpu_ids)
-		return -1;
-
-	if (irq_prepare_move(irq, cpu))
-		return -1;
-
-	dest = cpu_physical_id(cpu);
-
-	if (!iosapic_intr_info[irq].count)
-		return -1;			/* not an IOSAPIC interrupt */
-
-	set_irq_affinity_info(irq, dest, redir);
-
-	/* dest contains both id and eid */
-	high32 = dest << IOSAPIC_DEST_SHIFT;
-
-	low32 = iosapic_intr_info[irq].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT);
-	if (redir)
-		/* change delivery mode to lowest priority */
-		low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
-	else
-		/* change delivery mode to fixed */
-		low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT);
-	low32 &= IOSAPIC_VECTOR_MASK;
-	low32 |= irq_to_vector(irq);
-
-	iosapic_intr_info[irq].low32 = low32;
-	iosapic_intr_info[irq].dest = dest;
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) {
-		iosapic = rte->iosapic;
-		rte_index = rte->rte_index;
-		iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
-		iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
-	}
-
-#endif
-	return 0;
-}
-
-/*
- * Handlers for level-triggered interrupts.
- */
-
-static unsigned int
-iosapic_startup_level_irq (struct irq_data *data)
-{
-	unmask_irq(data);
-	return 0;
-}
-
-static void
-iosapic_unmask_level_irq (struct irq_data *data)
-{
-	unsigned int irq = data->irq;
-	ia64_vector vec = irq_to_vector(irq);
-	struct iosapic_rte_info *rte;
-	int do_unmask_irq = 0;
-
-	irq_complete_move(irq);
-	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		do_unmask_irq = 1;
-		mask_irq(data);
-	} else
-		unmask_irq(data);
-
-	list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list)
-		iosapic_eoi(rte->iosapic->addr, vec);
-
-	if (unlikely(do_unmask_irq)) {
-		irq_move_masked_irq(data);
-		unmask_irq(data);
-	}
-}
-
-#define iosapic_shutdown_level_irq	mask_irq
-#define iosapic_enable_level_irq	unmask_irq
-#define iosapic_disable_level_irq	mask_irq
-#define iosapic_ack_level_irq		iosapic_nop
-
-static struct irq_chip irq_type_iosapic_level = {
-	.name =			"IO-SAPIC-level",
-	.irq_startup =		iosapic_startup_level_irq,
-	.irq_shutdown =		iosapic_shutdown_level_irq,
-	.irq_enable =		iosapic_enable_level_irq,
-	.irq_disable =		iosapic_disable_level_irq,
-	.irq_ack =		iosapic_ack_level_irq,
-	.irq_mask =		mask_irq,
-	.irq_unmask =		iosapic_unmask_level_irq,
-	.irq_set_affinity =	iosapic_set_affinity
-};
-
-/*
- * Handlers for edge-triggered interrupts.
- */
-
-static unsigned int
-iosapic_startup_edge_irq (struct irq_data *data)
-{
-	unmask_irq(data);
-	/*
-	 * IOSAPIC simply drops interrupts pended while the
-	 * corresponding pin was masked, so we can't know if an
-	 * interrupt is pending already.  Let's hope not...
-	 */
-	return 0;
-}
-
-static void
-iosapic_ack_edge_irq (struct irq_data *data)
-{
-	irq_complete_move(data->irq);
-	irq_move_irq(data);
-}
-
-#define iosapic_enable_edge_irq		unmask_irq
-#define iosapic_disable_edge_irq	iosapic_nop
-
-static struct irq_chip irq_type_iosapic_edge = {
-	.name =			"IO-SAPIC-edge",
-	.irq_startup =		iosapic_startup_edge_irq,
-	.irq_shutdown =		iosapic_disable_edge_irq,
-	.irq_enable =		iosapic_enable_edge_irq,
-	.irq_disable =		iosapic_disable_edge_irq,
-	.irq_ack =		iosapic_ack_edge_irq,
-	.irq_mask =		mask_irq,
-	.irq_unmask =		unmask_irq,
-	.irq_set_affinity =	iosapic_set_affinity
-};
-
-static unsigned int
-iosapic_version (char __iomem *addr)
-{
-	/*
-	 * IOSAPIC Version Register return 32 bit structure like:
-	 * {
-	 *	unsigned int version   : 8;
-	 *	unsigned int reserved1 : 8;
-	 *	unsigned int max_redir : 8;
-	 *	unsigned int reserved2 : 8;
-	 * }
-	 */
-	return __iosapic_read(addr, IOSAPIC_VERSION);
-}
-
-static int iosapic_find_sharable_irq(unsigned long trigger, unsigned long pol)
-{
-	int i, irq = -ENOSPC, min_count = -1;
-	struct iosapic_intr_info *info;
-
-	/*
-	 * shared vectors for edge-triggered interrupts are not
-	 * supported yet
-	 */
-	if (trigger == IOSAPIC_EDGE)
-		return -EINVAL;
-
-	for (i = 0; i < NR_IRQS; i++) {
-		info = &iosapic_intr_info[i];
-		if (info->trigger == trigger && info->polarity == pol &&
-		    (info->dmode == IOSAPIC_FIXED ||
-		     info->dmode == IOSAPIC_LOWEST_PRIORITY) &&
-		    can_request_irq(i, IRQF_SHARED)) {
-			if (min_count == -1 || info->count < min_count) {
-				irq = i;
-				min_count = info->count;
-			}
-		}
-	}
-	return irq;
-}
-
-/*
- * if the given vector is already owned by other,
- *  assign a new vector for the other and make the vector available
- */
-static void __init
-iosapic_reassign_vector (int irq)
-{
-	int new_irq;
-
-	if (iosapic_intr_info[irq].count) {
-		new_irq = create_irq();
-		if (new_irq < 0)
-			panic("%s: out of interrupt vectors!\n", __func__);
-		printk(KERN_INFO "Reassigning vector %d to %d\n",
-		       irq_to_vector(irq), irq_to_vector(new_irq));
-		memcpy(&iosapic_intr_info[new_irq], &iosapic_intr_info[irq],
-		       sizeof(struct iosapic_intr_info));
-		INIT_LIST_HEAD(&iosapic_intr_info[new_irq].rtes);
-		list_move(iosapic_intr_info[irq].rtes.next,
-			  &iosapic_intr_info[new_irq].rtes);
-		memset(&iosapic_intr_info[irq], 0,
-		       sizeof(struct iosapic_intr_info));
-		iosapic_intr_info[irq].low32 = IOSAPIC_MASK;
-		INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes);
-	}
-}
-
-static inline int irq_is_shared (int irq)
-{
-	return (iosapic_intr_info[irq].count > 1);
-}
-
-struct irq_chip*
-ia64_native_iosapic_get_irq_chip(unsigned long trigger)
-{
-	if (trigger == IOSAPIC_EDGE)
-		return &irq_type_iosapic_edge;
-	else
-		return &irq_type_iosapic_level;
-}
-
-static int
-register_intr (unsigned int gsi, int irq, unsigned char delivery,
-	       unsigned long polarity, unsigned long trigger)
-{
-	struct irq_chip *chip, *irq_type;
-	int index;
-	struct iosapic_rte_info *rte;
-
-	index = find_iosapic(gsi);
-	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
-		       __func__, gsi);
-		return -ENODEV;
-	}
-
-	rte = find_rte(irq, gsi);
-	if (!rte) {
-		rte = kzalloc(sizeof (*rte), GFP_ATOMIC);
-		if (!rte) {
-			printk(KERN_WARNING "%s: cannot allocate memory\n",
-			       __func__);
-			return -ENOMEM;
-		}
-
-		rte->iosapic	= &iosapic_lists[index];
-		rte->rte_index	= gsi - rte->iosapic->gsi_base;
-		rte->refcnt++;
-		list_add_tail(&rte->rte_list, &iosapic_intr_info[irq].rtes);
-		iosapic_intr_info[irq].count++;
-		iosapic_lists[index].rtes_inuse++;
-	}
-	else if (rte->refcnt == NO_REF_RTE) {
-		struct iosapic_intr_info *info = &iosapic_intr_info[irq];
-		if (info->count > 0 &&
-		    (info->trigger != trigger || info->polarity != polarity)){
-			printk (KERN_WARNING
-				"%s: cannot override the interrupt\n",
-				__func__);
-			return -EINVAL;
-		}
-		rte->refcnt++;
-		iosapic_intr_info[irq].count++;
-		iosapic_lists[index].rtes_inuse++;
-	}
-
-	iosapic_intr_info[irq].polarity = polarity;
-	iosapic_intr_info[irq].dmode    = delivery;
-	iosapic_intr_info[irq].trigger  = trigger;
-
-	irq_type = iosapic_get_irq_chip(trigger);
-
-	chip = irq_get_chip(irq);
-	if (irq_type != NULL && chip != irq_type) {
-		if (chip != &no_irq_chip)
-			printk(KERN_WARNING
-			       "%s: changing vector %d from %s to %s\n",
-			       __func__, irq_to_vector(irq),
-			       chip->name, irq_type->name);
-		chip = irq_type;
-	}
-	irq_set_chip_handler_name_locked(irq_get_irq_data(irq), chip,
-		trigger == IOSAPIC_EDGE ? handle_edge_irq : handle_level_irq,
-		NULL);
-	return 0;
-}
-
-static unsigned int
-get_target_cpu (unsigned int gsi, int irq)
-{
-#ifdef CONFIG_SMP
-	static int cpu = -1;
-	extern int cpe_vector;
-	cpumask_t domain = irq_to_domain(irq);
-
-	/*
-	 * In case of vector shared by multiple RTEs, all RTEs that
-	 * share the vector need to use the same destination CPU.
-	 */
-	if (iosapic_intr_info[irq].count)
-		return iosapic_intr_info[irq].dest;
-
-	/*
-	 * If the platform supports redirection via XTP, let it
-	 * distribute interrupts.
-	 */
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		return cpu_physical_id(smp_processor_id());
-
-	/*
-	 * Some interrupts (ACPI SCI, for instance) are registered
-	 * before the BSP is marked as online.
-	 */
-	if (!cpu_online(smp_processor_id()))
-		return cpu_physical_id(smp_processor_id());
-
-	if (cpe_vector > 0 && irq_to_vector(irq) == IA64_CPEP_VECTOR)
-		return get_cpei_target_cpu();
-
-#ifdef CONFIG_NUMA
-	{
-		int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
-		const struct cpumask *cpu_mask;
-
-		iosapic_index = find_iosapic(gsi);
-		if (iosapic_index < 0 ||
-		    iosapic_lists[iosapic_index].node == MAX_NUMNODES)
-			goto skip_numa_setup;
-
-		cpu_mask = cpumask_of_node(iosapic_lists[iosapic_index].node);
-		num_cpus = 0;
-		for_each_cpu_and(numa_cpu, cpu_mask, &domain) {
-			if (cpu_online(numa_cpu))
-				num_cpus++;
-		}
-
-		if (!num_cpus)
-			goto skip_numa_setup;
-
-		/* Use irq assignment to distribute across cpus in node */
-		cpu_index = irq % num_cpus;
-
-		for_each_cpu_and(numa_cpu, cpu_mask, &domain)
-			if (cpu_online(numa_cpu) && i++ >= cpu_index)
-				break;
-
-		if (numa_cpu < nr_cpu_ids)
-			return cpu_physical_id(numa_cpu);
-	}
-skip_numa_setup:
-#endif
-	/*
-	 * Otherwise, round-robin interrupt vectors across all the
-	 * processors.  (It'd be nice if we could be smarter in the
-	 * case of NUMA.)
-	 */
-	do {
-		if (++cpu >= nr_cpu_ids)
-			cpu = 0;
-	} while (!cpu_online(cpu) || !cpumask_test_cpu(cpu, &domain));
-
-	return cpu_physical_id(cpu);
-#else  /* CONFIG_SMP */
-	return cpu_physical_id(smp_processor_id());
-#endif
-}
-
-static inline unsigned char choose_dmode(void)
-{
-#ifdef CONFIG_SMP
-	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
-		return IOSAPIC_LOWEST_PRIORITY;
-#endif
-	return IOSAPIC_FIXED;
-}
-
-/*
- * ACPI can describe IOSAPIC interrupts via static tables and namespace
- * methods.  This provides an interface to register those interrupts and
- * program the IOSAPIC RTE.
- */
-int
-iosapic_register_intr (unsigned int gsi,
-		       unsigned long polarity, unsigned long trigger)
-{
-	int irq, mask = 1, err;
-	unsigned int dest;
-	unsigned long flags;
-	struct iosapic_rte_info *rte;
-	u32 low32;
-	unsigned char dmode;
-	struct irq_desc *desc;
-
-	/*
-	 * If this GSI has already been registered (i.e., it's a
-	 * shared interrupt, or we lost a race to register it),
-	 * don't touch the RTE.
-	 */
-	spin_lock_irqsave(&iosapic_lock, flags);
-	irq = __gsi_to_irq(gsi);
-	if (irq > 0) {
-		rte = find_rte(irq, gsi);
-		if(iosapic_intr_info[irq].count == 0) {
-			assign_irq_vector(irq);
-			irq_init_desc(irq);
-		} else if (rte->refcnt != NO_REF_RTE) {
-			rte->refcnt++;
-			goto unlock_iosapic_lock;
-		}
-	} else
-		irq = create_irq();
-
-	/* If vector is running out, we try to find a sharable vector */
-	if (irq < 0) {
-		irq = iosapic_find_sharable_irq(trigger, polarity);
-		if (irq < 0)
-			goto unlock_iosapic_lock;
-	}
-
-	desc = irq_to_desc(irq);
-	raw_spin_lock(&desc->lock);
-	dest = get_target_cpu(gsi, irq);
-	dmode = choose_dmode();
-	err = register_intr(gsi, irq, dmode, polarity, trigger);
-	if (err < 0) {
-		raw_spin_unlock(&desc->lock);
-		irq = err;
-		goto unlock_iosapic_lock;
-	}
-
-	/*
-	 * If the vector is shared and already unmasked for other
-	 * interrupt sources, don't mask it.
-	 */
-	low32 = iosapic_intr_info[irq].low32;
-	if (irq_is_shared(irq) && !(low32 & IOSAPIC_MASK))
-		mask = 0;
-	set_rte(gsi, irq, dest, mask);
-
-	printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
-	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, irq_to_vector(irq));
-
-	raw_spin_unlock(&desc->lock);
- unlock_iosapic_lock:
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-	return irq;
-}
-
-void
-iosapic_unregister_intr (unsigned int gsi)
-{
-	unsigned long flags;
-	int irq, index;
-	u32 low32;
-	unsigned long trigger, polarity;
-	unsigned int dest;
-	struct iosapic_rte_info *rte;
-
-	/*
-	 * If the irq associated with the gsi is not found,
-	 * iosapic_unregister_intr() is unbalanced. We need to check
-	 * this again after getting locks.
-	 */
-	irq = gsi_to_irq(gsi);
-	if (irq < 0) {
-		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n",
-		       gsi);
-		WARN_ON(1);
-		return;
-	}
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	if ((rte = find_rte(irq, gsi)) == NULL) {
-		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n",
-		       gsi);
-		WARN_ON(1);
-		goto out;
-	}
-
-	if (--rte->refcnt > 0)
-		goto out;
-
-	rte->refcnt = NO_REF_RTE;
-
-	/* Mask the interrupt */
-	low32 = iosapic_intr_info[irq].low32 | IOSAPIC_MASK;
-	iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte->rte_index), low32);
-
-	iosapic_intr_info[irq].count--;
-	index = find_iosapic(gsi);
-	iosapic_lists[index].rtes_inuse--;
-	WARN_ON(iosapic_lists[index].rtes_inuse < 0);
-
-	trigger  = iosapic_intr_info[irq].trigger;
-	polarity = iosapic_intr_info[irq].polarity;
-	dest     = iosapic_intr_info[irq].dest;
-	printk(KERN_INFO
-	       "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n",
-	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, irq_to_vector(irq));
-
-	if (iosapic_intr_info[irq].count == 0) {
-#ifdef CONFIG_SMP
-		/* Clear affinity */
-		irq_data_update_affinity(irq_get_irq_data(irq), cpu_all_mask);
-#endif
-		/* Clear the interrupt information */
-		iosapic_intr_info[irq].dest = 0;
-		iosapic_intr_info[irq].dmode = 0;
-		iosapic_intr_info[irq].polarity = 0;
-		iosapic_intr_info[irq].trigger = 0;
-		iosapic_intr_info[irq].low32 |= IOSAPIC_MASK;
-
-		/* Destroy and reserve IRQ */
-		destroy_and_reserve_irq(irq);
-	}
- out:
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-}
-
-/*
- * ACPI calls this when it finds an entry for a platform interrupt.
- */
-int __init
-iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
-				int iosapic_vector, u16 eid, u16 id,
-				unsigned long polarity, unsigned long trigger)
-{
-	static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"};
-	unsigned char delivery;
-	int irq, vector, mask = 0;
-	unsigned int dest = ((id << 8) | eid) & 0xffff;
-
-	switch (int_type) {
-	      case ACPI_INTERRUPT_PMI:
-		irq = vector = iosapic_vector;
-		bind_irq_vector(irq, vector, CPU_MASK_ALL);
-		/*
-		 * since PMI vector is alloc'd by FW(ACPI) not by kernel,
-		 * we need to make sure the vector is available
-		 */
-		iosapic_reassign_vector(irq);
-		delivery = IOSAPIC_PMI;
-		break;
-	      case ACPI_INTERRUPT_INIT:
-		irq = create_irq();
-		if (irq < 0)
-			panic("%s: out of interrupt vectors!\n", __func__);
-		vector = irq_to_vector(irq);
-		delivery = IOSAPIC_INIT;
-		break;
-	      case ACPI_INTERRUPT_CPEI:
-		irq = vector = IA64_CPE_VECTOR;
-		BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL));
-		delivery = IOSAPIC_FIXED;
-		mask = 1;
-		break;
-	      default:
-		printk(KERN_ERR "%s: invalid int type 0x%x\n", __func__,
-		       int_type);
-		return -1;
-	}
-
-	register_intr(gsi, irq, delivery, polarity, trigger);
-
-	printk(KERN_INFO
-	       "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x)"
-	       " vector %d\n",
-	       int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown",
-	       int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
-	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
-	       cpu_logical_id(dest), dest, vector);
-
-	set_rte(gsi, irq, dest, mask);
-	return vector;
-}
-
-/*
- * ACPI calls this when it finds an entry for a legacy ISA IRQ override.
- */
-void iosapic_override_isa_irq(unsigned int isa_irq, unsigned int gsi,
-			      unsigned long polarity, unsigned long trigger)
-{
-	int vector, irq;
-	unsigned int dest = cpu_physical_id(smp_processor_id());
-	unsigned char dmode;
-
-	irq = vector = isa_irq_to_vector(isa_irq);
-	BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL));
-	dmode = choose_dmode();
-	register_intr(gsi, irq, dmode, polarity, trigger);
-
-	DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n",
-	    isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level",
-	    polarity == IOSAPIC_POL_HIGH ? "high" : "low",
-	    cpu_logical_id(dest), dest, vector);
-
-	set_rte(gsi, irq, dest, 1);
-}
-
-void __init
-ia64_native_iosapic_pcat_compat_init(void)
-{
-	if (pcat_compat) {
-		/*
-		 * Disable the compatibility mode interrupts (8259 style),
-		 * needs IN/OUT support enabled.
-		 */
-		printk(KERN_INFO
-		       "%s: Disabling PC-AT compatible 8259 interrupts\n",
-		       __func__);
-		outb(0xff, 0xA1);
-		outb(0xff, 0x21);
-	}
-}
-
-void __init
-iosapic_system_init (int system_pcat_compat)
-{
-	int irq;
-
-	for (irq = 0; irq < NR_IRQS; ++irq) {
-		iosapic_intr_info[irq].low32 = IOSAPIC_MASK;
-		/* mark as unused */
-		INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes);
-
-		iosapic_intr_info[irq].count = 0;
-	}
-
-	pcat_compat = system_pcat_compat;
-	if (pcat_compat)
-		iosapic_pcat_compat_init();
-}
-
-static inline int
-iosapic_alloc (void)
-{
-	int index;
-
-	for (index = 0; index < NR_IOSAPICS; index++)
-		if (!iosapic_lists[index].addr)
-			return index;
-
-	printk(KERN_WARNING "%s: failed to allocate iosapic\n", __func__);
-	return -1;
-}
-
-static inline void
-iosapic_free (int index)
-{
-	memset(&iosapic_lists[index], 0, sizeof(iosapic_lists[0]));
-}
-
-static inline int
-iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver)
-{
-	int index;
-	unsigned int gsi_end, base, end;
-
-	/* check gsi range */
-	gsi_end = gsi_base + ((ver >> 16) & 0xff);
-	for (index = 0; index < NR_IOSAPICS; index++) {
-		if (!iosapic_lists[index].addr)
-			continue;
-
-		base = iosapic_lists[index].gsi_base;
-		end  = base + iosapic_lists[index].num_rte - 1;
-
-		if (gsi_end < base || end < gsi_base)
-			continue; /* OK */
-
-		return -EBUSY;
-	}
-	return 0;
-}
-
-static int
-iosapic_delete_rte(unsigned int irq, unsigned int gsi)
-{
-	struct iosapic_rte_info *rte, *temp;
-
-	list_for_each_entry_safe(rte, temp, &iosapic_intr_info[irq].rtes,
-								rte_list) {
-		if (rte->iosapic->gsi_base + rte->rte_index == gsi) {
-			if (rte->refcnt)
-				return -EBUSY;
-
-			list_del(&rte->rte_list);
-			kfree(rte);
-			return 0;
-		}
-	}
-
-	return -EINVAL;
-}
-
-int iosapic_init(unsigned long phys_addr, unsigned int gsi_base)
-{
-	int num_rte, err, index;
-	unsigned int isa_irq, ver;
-	char __iomem *addr;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	index = find_iosapic(gsi_base);
-	if (index >= 0) {
-		spin_unlock_irqrestore(&iosapic_lock, flags);
-		return -EBUSY;
-	}
-
-	addr = ioremap(phys_addr, 0);
-	if (addr == NULL) {
-		spin_unlock_irqrestore(&iosapic_lock, flags);
-		return -ENOMEM;
-	}
-	ver = iosapic_version(addr);
-	if ((err = iosapic_check_gsi_range(gsi_base, ver))) {
-		iounmap(addr);
-		spin_unlock_irqrestore(&iosapic_lock, flags);
-		return err;
-	}
-
-	/*
-	 * The MAX_REDIR register holds the highest input pin number
-	 * (starting from 0).  We add 1 so that we can use it for
-	 * number of pins (= RTEs)
-	 */
-	num_rte = ((ver >> 16) & 0xff) + 1;
-
-	index = iosapic_alloc();
-	iosapic_lists[index].addr = addr;
-	iosapic_lists[index].gsi_base = gsi_base;
-	iosapic_lists[index].num_rte = num_rte;
-#ifdef CONFIG_NUMA
-	iosapic_lists[index].node = MAX_NUMNODES;
-#endif
-	spin_lock_init(&iosapic_lists[index].lock);
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-
-	if ((gsi_base == 0) && pcat_compat) {
-		/*
-		 * Map the legacy ISA devices into the IOSAPIC data.  Some of
-		 * these may get reprogrammed later on with data from the ACPI
-		 * Interrupt Source Override table.
-		 */
-		for (isa_irq = 0; isa_irq < 16; ++isa_irq)
-			iosapic_override_isa_irq(isa_irq, isa_irq,
-						 IOSAPIC_POL_HIGH,
-						 IOSAPIC_EDGE);
-	}
-	return 0;
-}
-
-int iosapic_remove(unsigned int gsi_base)
-{
-	int i, irq, index, err = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iosapic_lock, flags);
-	index = find_iosapic(gsi_base);
-	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n",
-		       __func__, gsi_base);
-		goto out;
-	}
-
-	if (iosapic_lists[index].rtes_inuse) {
-		err = -EBUSY;
-		printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n",
-		       __func__, gsi_base);
-		goto out;
-	}
-
-	for (i = gsi_base; i < gsi_base + iosapic_lists[index].num_rte; i++) {
-		irq = __gsi_to_irq(i);
-		if (irq < 0)
-			continue;
-
-		err = iosapic_delete_rte(irq, i);
-		if (err)
-			goto out;
-	}
-
-	iounmap(iosapic_lists[index].addr);
-	iosapic_free(index);
- out:
-	spin_unlock_irqrestore(&iosapic_lock, flags);
-	return err;
-}
-
-#ifdef CONFIG_NUMA
-void map_iosapic_to_node(unsigned int gsi_base, int node)
-{
-	int index;
-
-	index = find_iosapic(gsi_base);
-	if (index < 0) {
-		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
-		       __func__, gsi_base);
-		return;
-	}
-	iosapic_lists[index].node = node;
-	return;
-}
-#endif
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
deleted file mode 100644
index 275b9ea58c64..000000000000
--- a/arch/ia64/kernel/irq.c
+++ /dev/null
@@ -1,181 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *	linux/arch/ia64/kernel/irq.c
- *
- *	Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
- *
- * This file contains the code used by various IRQ handling routines:
- * asking for different IRQs should be done through these routines
- * instead of just grabbing them. Thus setups with different IRQ numbers
- * shouldn't result in any weird surprises, and installing new handlers
- * should be easier.
- *
- * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004
- *
- * 4/14/2004: Added code to handle cpu migration and do safe irq
- *			migration without losing interrupts for iosapic
- *			architecture.
- */
-
-#include <asm/delay.h>
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-
-#include <asm/mca.h>
-#include <asm/xtp.h>
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
-	printk(KERN_ERR "Unexpected irq vector 0x%x on CPU %u!\n", irq, smp_processor_id());
-}
-
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-int arch_show_interrupts(struct seq_file *p, int prec)
-{
-	seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-	return 0;
-}
-
-#ifdef CONFIG_SMP
-static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
-
-void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
-{
-	if (irq < NR_IRQS) {
-		irq_data_update_affinity(irq_get_irq_data(irq),
-					 cpumask_of(cpu_logical_id(hwid)));
-		irq_redir[irq] = (char) (redir & 0xff);
-	}
-}
-#endif /* CONFIG_SMP */
-
-int __init arch_early_irq_init(void)
-{
-	ia64_mca_irq_init();
-	return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-unsigned int vectors_in_migration[NR_IRQS];
-
-/*
- * Since cpu_online_mask is already updated, we just need to check for
- * affinity that has zeros
- */
-static void migrate_irqs(void)
-{
-	int 		irq, new_cpu;
-
-	for (irq=0; irq < NR_IRQS; irq++) {
-		struct irq_desc *desc = irq_to_desc(irq);
-		struct irq_data *data = irq_desc_get_irq_data(desc);
-		struct irq_chip *chip = irq_data_get_irq_chip(data);
-
-		if (irqd_irq_disabled(data))
-			continue;
-
-		/*
-		 * No handling for now.
-		 * TBD: Implement a disable function so we can now
-		 * tell CPU not to respond to these local intr sources.
-		 * such as ITV,CPEI,MCA etc.
-		 */
-		if (irqd_is_per_cpu(data))
-			continue;
-
-		if (cpumask_any_and(irq_data_get_affinity_mask(data),
-				    cpu_online_mask) >= nr_cpu_ids) {
-			/*
-			 * Save it for phase 2 processing
-			 */
-			vectors_in_migration[irq] = irq;
-
-			new_cpu = cpumask_any(cpu_online_mask);
-
-			/*
-			 * Al three are essential, currently WARN_ON.. maybe panic?
-			 */
-			if (chip && chip->irq_disable &&
-				chip->irq_enable && chip->irq_set_affinity) {
-				chip->irq_disable(data);
-				chip->irq_set_affinity(data,
-						       cpumask_of(new_cpu), false);
-				chip->irq_enable(data);
-			} else {
-				WARN_ON((!chip || !chip->irq_disable ||
-					 !chip->irq_enable ||
-					 !chip->irq_set_affinity));
-			}
-		}
-	}
-}
-
-void fixup_irqs(void)
-{
-	unsigned int irq;
-	extern void ia64_process_pending_intr(void);
-	extern volatile int time_keeper_id;
-
-	/* Mask ITV to disable timer */
-	ia64_set_itv(1 << 16);
-
-	/*
-	 * Find a new timesync master
-	 */
-	if (smp_processor_id() == time_keeper_id) {
-		time_keeper_id = cpumask_first(cpu_online_mask);
-		printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id);
-	}
-
-	/*
-	 * Phase 1: Locate IRQs bound to this cpu and
-	 * relocate them for cpu removal.
-	 */
-	migrate_irqs();
-
-	/*
-	 * Phase 2: Perform interrupt processing for all entries reported in
-	 * local APIC.
-	 */
-	ia64_process_pending_intr();
-
-	/*
-	 * Phase 3: Now handle any interrupts not captured in local APIC.
-	 * This is to account for cases that device interrupted during the time the
-	 * rte was being disabled and re-programmed.
-	 */
-	for (irq=0; irq < NR_IRQS; irq++) {
-		if (vectors_in_migration[irq]) {
-			struct pt_regs *old_regs = set_irq_regs(NULL);
-
-			vectors_in_migration[irq]=0;
-			generic_handle_irq(irq);
-			set_irq_regs(old_regs);
-		}
-	}
-
-	/*
-	 * Now let processor die. We do irq disable and max_xtp() to
-	 * ensure there is no more interrupts routed to this processor.
-	 * But the local timer interrupt can have 1 pending which we
-	 * take care in timer_interrupt().
-	 */
-	max_xtp();
-	local_irq_disable();
-}
-#endif
diff --git a/arch/ia64/kernel/irq.h b/arch/ia64/kernel/irq.h
deleted file mode 100644
index 4d16f3cbeb1d..000000000000
--- a/arch/ia64/kernel/irq.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-extern void register_percpu_irq(ia64_vector vec, irq_handler_t handler,
-				unsigned long flags, const char *name);
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
deleted file mode 100644
index 46e33c5cb53d..000000000000
--- a/arch/ia64/kernel/irq_ia64.c
+++ /dev/null
@@ -1,645 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/arch/ia64/kernel/irq_ia64.c
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- *  6/10/99: Updated to bring in sync with x86 version to facilitate
- *	     support for SMP and different interrupt controllers.
- *
- * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector
- *                      PCI to vector allocation routine.
- * 04/14/2004 Ashok Raj <ashok.raj@intel.com>
- *						Added CPU Hotplug handling for IPF.
- */
-
-#include <linux/module.h>
-#include <linux/pgtable.h>
-
-#include <linux/jiffies.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/ioport.h>
-#include <linux/kernel_stat.h>
-#include <linux/ptrace.h>
-#include <linux/signal.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/bitops.h>
-#include <linux/irq.h>
-#include <linux/ratelimit.h>
-#include <linux/acpi.h>
-#include <linux/sched.h>
-
-#include <asm/delay.h>
-#include <asm/intrinsics.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/tlbflush.h>
-
-#define IRQ_DEBUG	0
-
-#define IRQ_VECTOR_UNASSIGNED	(0)
-
-#define IRQ_UNUSED		(0)
-#define IRQ_USED		(1)
-#define IRQ_RSVD		(2)
-
-int ia64_first_device_vector = IA64_DEF_FIRST_DEVICE_VECTOR;
-int ia64_last_device_vector = IA64_DEF_LAST_DEVICE_VECTOR;
-
-/* default base addr of IPI table */
-void __iomem *ipi_base_addr = ((void __iomem *)
-			       (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR));
-
-static cpumask_t vector_allocation_domain(int cpu);
-
-/*
- * Legacy IRQ to IA-64 vector translation table.
- */
-__u8 isa_irq_to_vector_map[16] = {
-	/* 8259 IRQ translation, first 16 entries */
-	0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29,
-	0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21
-};
-EXPORT_SYMBOL(isa_irq_to_vector_map);
-
-DEFINE_SPINLOCK(vector_lock);
-
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
-	[0 ... NR_IRQS - 1] = {
-		.vector = IRQ_VECTOR_UNASSIGNED,
-		.domain = CPU_MASK_NONE
-	}
-};
-
-DEFINE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq) = {
-	[0 ... IA64_NUM_VECTORS - 1] = -1
-};
-
-static cpumask_t vector_table[IA64_NUM_VECTORS] = {
-	[0 ... IA64_NUM_VECTORS - 1] = CPU_MASK_NONE
-};
-
-static int irq_status[NR_IRQS] = {
-	[0 ... NR_IRQS -1] = IRQ_UNUSED
-};
-
-static inline int find_unassigned_irq(void)
-{
-	int irq;
-
-	for (irq = IA64_FIRST_DEVICE_VECTOR; irq < NR_IRQS; irq++)
-		if (irq_status[irq] == IRQ_UNUSED)
-			return irq;
-	return -ENOSPC;
-}
-
-static inline int find_unassigned_vector(cpumask_t domain)
-{
-	cpumask_t mask;
-	int pos, vector;
-
-	cpumask_and(&mask, &domain, cpu_online_mask);
-	if (cpumask_empty(&mask))
-		return -EINVAL;
-
-	for (pos = 0; pos < IA64_NUM_DEVICE_VECTORS; pos++) {
-		vector = IA64_FIRST_DEVICE_VECTOR + pos;
-		cpumask_and(&mask, &domain, &vector_table[vector]);
-		if (!cpumask_empty(&mask))
-			continue;
-		return vector;
-	}
-	return -ENOSPC;
-}
-
-static int __bind_irq_vector(int irq, int vector, cpumask_t domain)
-{
-	cpumask_t mask;
-	int cpu;
-	struct irq_cfg *cfg = &irq_cfg[irq];
-
-	BUG_ON((unsigned)irq >= NR_IRQS);
-	BUG_ON((unsigned)vector >= IA64_NUM_VECTORS);
-
-	cpumask_and(&mask, &domain, cpu_online_mask);
-	if (cpumask_empty(&mask))
-		return -EINVAL;
-	if ((cfg->vector == vector) && cpumask_equal(&cfg->domain, &domain))
-		return 0;
-	if (cfg->vector != IRQ_VECTOR_UNASSIGNED)
-		return -EBUSY;
-	for_each_cpu(cpu, &mask)
-		per_cpu(vector_irq, cpu)[vector] = irq;
-	cfg->vector = vector;
-	cfg->domain = domain;
-	irq_status[irq] = IRQ_USED;
-	cpumask_or(&vector_table[vector], &vector_table[vector], &domain);
-	return 0;
-}
-
-int bind_irq_vector(int irq, int vector, cpumask_t domain)
-{
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	ret = __bind_irq_vector(irq, vector, domain);
-	spin_unlock_irqrestore(&vector_lock, flags);
-	return ret;
-}
-
-static void __clear_irq_vector(int irq)
-{
-	int vector, cpu;
-	cpumask_t domain;
-	struct irq_cfg *cfg = &irq_cfg[irq];
-
-	BUG_ON((unsigned)irq >= NR_IRQS);
-	BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED);
-	vector = cfg->vector;
-	domain = cfg->domain;
-	for_each_cpu_and(cpu, &cfg->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
-	cfg->vector = IRQ_VECTOR_UNASSIGNED;
-	cfg->domain = CPU_MASK_NONE;
-	irq_status[irq] = IRQ_UNUSED;
-	cpumask_andnot(&vector_table[vector], &vector_table[vector], &domain);
-}
-
-static void clear_irq_vector(int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
-	spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-int
-ia64_native_assign_irq_vector (int irq)
-{
-	unsigned long flags;
-	int vector, cpu;
-	cpumask_t domain = CPU_MASK_NONE;
-
-	vector = -ENOSPC;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	for_each_online_cpu(cpu) {
-		domain = vector_allocation_domain(cpu);
-		vector = find_unassigned_vector(domain);
-		if (vector >= 0)
-			break;
-	}
-	if (vector < 0)
-		goto out;
-	if (irq == AUTO_ASSIGN)
-		irq = vector;
-	BUG_ON(__bind_irq_vector(irq, vector, domain));
- out:
-	spin_unlock_irqrestore(&vector_lock, flags);
-	return vector;
-}
-
-void
-ia64_native_free_irq_vector (int vector)
-{
-	if (vector < IA64_FIRST_DEVICE_VECTOR ||
-	    vector > IA64_LAST_DEVICE_VECTOR)
-		return;
-	clear_irq_vector(vector);
-}
-
-int
-reserve_irq_vector (int vector)
-{
-	if (vector < IA64_FIRST_DEVICE_VECTOR ||
-	    vector > IA64_LAST_DEVICE_VECTOR)
-		return -EINVAL;
-	return !!bind_irq_vector(vector, vector, CPU_MASK_ALL);
-}
-
-/*
- * Initialize vector_irq on a new cpu. This function must be called
- * with vector_lock held.
- */
-void __setup_vector_irq(int cpu)
-{
-	int irq, vector;
-
-	/* Clear vector_irq */
-	for (vector = 0; vector < IA64_NUM_VECTORS; ++vector)
-		per_cpu(vector_irq, cpu)[vector] = -1;
-	/* Mark the inuse vectors */
-	for (irq = 0; irq < NR_IRQS; ++irq) {
-		if (!cpumask_test_cpu(cpu, &irq_cfg[irq].domain))
-			continue;
-		vector = irq_to_vector(irq);
-		per_cpu(vector_irq, cpu)[vector] = irq;
-	}
-}
-
-#ifdef CONFIG_SMP
-
-static enum vector_domain_type {
-	VECTOR_DOMAIN_NONE,
-	VECTOR_DOMAIN_PERCPU
-} vector_domain_type = VECTOR_DOMAIN_NONE;
-
-static cpumask_t vector_allocation_domain(int cpu)
-{
-	if (vector_domain_type == VECTOR_DOMAIN_PERCPU)
-		return *cpumask_of(cpu);
-	return CPU_MASK_ALL;
-}
-
-static int __irq_prepare_move(int irq, int cpu)
-{
-	struct irq_cfg *cfg = &irq_cfg[irq];
-	int vector;
-	cpumask_t domain;
-
-	if (cfg->move_in_progress || cfg->move_cleanup_count)
-		return -EBUSY;
-	if (cfg->vector == IRQ_VECTOR_UNASSIGNED || !cpu_online(cpu))
-		return -EINVAL;
-	if (cpumask_test_cpu(cpu, &cfg->domain))
-		return 0;
-	domain = vector_allocation_domain(cpu);
-	vector = find_unassigned_vector(domain);
-	if (vector < 0)
-		return -ENOSPC;
-	cfg->move_in_progress = 1;
-	cfg->old_domain = cfg->domain;
-	cfg->vector = IRQ_VECTOR_UNASSIGNED;
-	cfg->domain = CPU_MASK_NONE;
-	BUG_ON(__bind_irq_vector(irq, vector, domain));
-	return 0;
-}
-
-int irq_prepare_move(int irq, int cpu)
-{
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	ret = __irq_prepare_move(irq, cpu);
-	spin_unlock_irqrestore(&vector_lock, flags);
-	return ret;
-}
-
-void irq_complete_move(unsigned irq)
-{
-	struct irq_cfg *cfg = &irq_cfg[irq];
-	cpumask_t cleanup_mask;
-	int i;
-
-	if (likely(!cfg->move_in_progress))
-		return;
-
-	if (unlikely(cpumask_test_cpu(smp_processor_id(), &cfg->old_domain)))
-		return;
-
-	cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask);
-	cfg->move_cleanup_count = cpumask_weight(&cleanup_mask);
-	for_each_cpu(i, &cleanup_mask)
-		ia64_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0);
-	cfg->move_in_progress = 0;
-}
-
-static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id)
-{
-	int me = smp_processor_id();
-	ia64_vector vector;
-	unsigned long flags;
-
-	for (vector = IA64_FIRST_DEVICE_VECTOR;
-	     vector < IA64_LAST_DEVICE_VECTOR; vector++) {
-		int irq;
-		struct irq_desc *desc;
-		struct irq_cfg *cfg;
-		irq = __this_cpu_read(vector_irq[vector]);
-		if (irq < 0)
-			continue;
-
-		desc = irq_to_desc(irq);
-		cfg = irq_cfg + irq;
-		raw_spin_lock(&desc->lock);
-		if (!cfg->move_cleanup_count)
-			goto unlock;
-
-		if (!cpumask_test_cpu(me, &cfg->old_domain))
-			goto unlock;
-
-		spin_lock_irqsave(&vector_lock, flags);
-		__this_cpu_write(vector_irq[vector], -1);
-		cpumask_clear_cpu(me, &vector_table[vector]);
-		spin_unlock_irqrestore(&vector_lock, flags);
-		cfg->move_cleanup_count--;
-	unlock:
-		raw_spin_unlock(&desc->lock);
-	}
-	return IRQ_HANDLED;
-}
-
-static int __init parse_vector_domain(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-	if (!strcmp(arg, "percpu")) {
-		vector_domain_type = VECTOR_DOMAIN_PERCPU;
-		no_int_routing = 1;
-	}
-	return 0;
-}
-early_param("vector", parse_vector_domain);
-#else
-static cpumask_t vector_allocation_domain(int cpu)
-{
-	return CPU_MASK_ALL;
-}
-#endif
-
-
-void destroy_and_reserve_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	irq_init_desc(irq);
-	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
-	irq_status[irq] = IRQ_RSVD;
-	spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * Dynamic irq allocate and deallocation for MSI
- */
-int create_irq(void)
-{
-	unsigned long flags;
-	int irq, vector, cpu;
-	cpumask_t domain = CPU_MASK_NONE;
-
-	irq = vector = -ENOSPC;
-	spin_lock_irqsave(&vector_lock, flags);
-	for_each_online_cpu(cpu) {
-		domain = vector_allocation_domain(cpu);
-		vector = find_unassigned_vector(domain);
-		if (vector >= 0)
-			break;
-	}
-	if (vector < 0)
-		goto out;
-	irq = find_unassigned_irq();
-	if (irq < 0)
-		goto out;
-	BUG_ON(__bind_irq_vector(irq, vector, domain));
- out:
-	spin_unlock_irqrestore(&vector_lock, flags);
-	if (irq >= 0)
-		irq_init_desc(irq);
-	return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
-	irq_init_desc(irq);
-	clear_irq_vector(irq);
-}
-
-#ifdef CONFIG_SMP
-#	define IS_RESCHEDULE(vec)	(vec == IA64_IPI_RESCHEDULE)
-#	define IS_LOCAL_TLB_FLUSH(vec)	(vec == IA64_IPI_LOCAL_TLB_FLUSH)
-#else
-#	define IS_RESCHEDULE(vec)	(0)
-#	define IS_LOCAL_TLB_FLUSH(vec)	(0)
-#endif
-/*
- * That's where the IVT branches when we get an external
- * interrupt. This branches to the correct hardware IRQ handler via
- * function ptr.
- */
-void
-ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	unsigned long saved_tpr;
-
-#if IRQ_DEBUG
-	{
-		unsigned long bsp, sp;
-
-		/*
-		 * Note: if the interrupt happened while executing in
-		 * the context switch routine (ia64_switch_to), we may
-		 * get a spurious stack overflow here.  This is
-		 * because the register and the memory stack are not
-		 * switched atomically.
-		 */
-		bsp = ia64_getreg(_IA64_REG_AR_BSP);
-		sp = ia64_getreg(_IA64_REG_SP);
-
-		if ((sp - bsp) < 1024) {
-			static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
-
-			if (__ratelimit(&ratelimit)) {
-				printk("ia64_handle_irq: DANGER: less than "
-				       "1KB of free stack space!!\n"
-				       "(bsp=0x%lx, sp=%lx)\n", bsp, sp);
-			}
-		}
-	}
-#endif /* IRQ_DEBUG */
-
-	/*
-	 * Always set TPR to limit maximum interrupt nesting depth to
-	 * 16 (without this, it would be ~240, which could easily lead
-	 * to kernel stack overflows).
-	 */
-	irq_enter();
-	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
-	ia64_srlz_d();
-	while (vector != IA64_SPURIOUS_INT_VECTOR) {
-		int irq = local_vector_to_irq(vector);
-
-		if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
-			smp_local_flush_tlb();
-			kstat_incr_irq_this_cpu(irq);
-		} else if (unlikely(IS_RESCHEDULE(vector))) {
-			scheduler_ipi();
-			kstat_incr_irq_this_cpu(irq);
-		} else {
-			ia64_setreg(_IA64_REG_CR_TPR, vector);
-			ia64_srlz_d();
-
-			if (unlikely(irq < 0)) {
-				printk(KERN_ERR "%s: Unexpected interrupt "
-				       "vector %d on CPU %d is not mapped "
-				       "to any IRQ!\n", __func__, vector,
-				       smp_processor_id());
-			} else
-				generic_handle_irq(irq);
-
-			/*
-			 * Disable interrupts and send EOI:
-			 */
-			local_irq_disable();
-			ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
-		}
-		ia64_eoi();
-		vector = ia64_get_ivr();
-	}
-	/*
-	 * This must be done *after* the ia64_eoi().  For example, the keyboard softirq
-	 * handler needs to be able to wait for further keyboard interrupts, which can't
-	 * come through until ia64_eoi() has been done.
-	 */
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * This function emulates a interrupt processing when a cpu is about to be
- * brought down.
- */
-void ia64_process_pending_intr(void)
-{
-	ia64_vector vector;
-	unsigned long saved_tpr;
-	extern unsigned int vectors_in_migration[NR_IRQS];
-
-	vector = ia64_get_ivr();
-
-	irq_enter();
-	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
-	ia64_srlz_d();
-
-	 /*
-	  * Perform normal interrupt style processing
-	  */
-	while (vector != IA64_SPURIOUS_INT_VECTOR) {
-		int irq = local_vector_to_irq(vector);
-
-		if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
-			smp_local_flush_tlb();
-			kstat_incr_irq_this_cpu(irq);
-		} else if (unlikely(IS_RESCHEDULE(vector))) {
-			kstat_incr_irq_this_cpu(irq);
-		} else {
-			struct pt_regs *old_regs = set_irq_regs(NULL);
-
-			ia64_setreg(_IA64_REG_CR_TPR, vector);
-			ia64_srlz_d();
-
-			/*
-			 * Now try calling normal ia64_handle_irq as it would have got called
-			 * from a real intr handler. Try passing null for pt_regs, hopefully
-			 * it will work. I hope it works!.
-			 * Probably could shared code.
-			 */
-			if (unlikely(irq < 0)) {
-				printk(KERN_ERR "%s: Unexpected interrupt "
-				       "vector %d on CPU %d not being mapped "
-				       "to any IRQ!!\n", __func__, vector,
-				       smp_processor_id());
-			} else {
-				vectors_in_migration[irq]=0;
-				generic_handle_irq(irq);
-			}
-			set_irq_regs(old_regs);
-
-			/*
-			 * Disable interrupts and send EOI
-			 */
-			local_irq_disable();
-			ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
-		}
-		ia64_eoi();
-		vector = ia64_get_ivr();
-	}
-	irq_exit();
-}
-#endif
-
-
-#ifdef CONFIG_SMP
-
-static irqreturn_t dummy_handler (int irq, void *dev_id)
-{
-	BUG();
-	return IRQ_NONE;
-}
-
-/*
- * KVM uses this interrupt to force a cpu out of guest mode
- */
-
-#endif
-
-void
-register_percpu_irq(ia64_vector vec, irq_handler_t handler, unsigned long flags,
-		    const char *name)
-{
-	unsigned int irq;
-
-	irq = vec;
-	BUG_ON(bind_irq_vector(irq, vec, CPU_MASK_ALL));
-	irq_set_status_flags(irq, IRQ_PER_CPU);
-	irq_set_chip(irq, &irq_type_ia64_lsapic);
-	if (handler)
-		if (request_irq(irq, handler, flags, name, NULL))
-			pr_err("Failed to request irq %u (%s)\n", irq, name);
-	irq_set_handler(irq, handle_percpu_irq);
-}
-
-void __init
-ia64_native_register_ipi(void)
-{
-#ifdef CONFIG_SMP
-	register_percpu_irq(IA64_IPI_VECTOR, handle_IPI, 0, "IPI");
-	register_percpu_irq(IA64_IPI_RESCHEDULE, dummy_handler, 0, "resched");
-	register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, dummy_handler, 0,
-			    "tlb_flush");
-#endif
-}
-
-void __init
-init_IRQ (void)
-{
-	acpi_boot_init();
-	ia64_register_ipi();
-	register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL, 0, NULL);
-#ifdef CONFIG_SMP
-	if (vector_domain_type != VECTOR_DOMAIN_NONE) {
-		register_percpu_irq(IA64_IRQ_MOVE_VECTOR,
-				    smp_irq_move_cleanup_interrupt, 0,
-				    "irq_move");
-	}
-#endif
-}
-
-void
-ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect)
-{
-	void __iomem *ipi_addr;
-	unsigned long ipi_data;
-	unsigned long phys_cpu_id;
-
-	phys_cpu_id = cpu_physical_id(cpu);
-
-	/*
-	 * cpu number is in 8bit ID and 8bit EID
-	 */
-
-	ipi_data = (delivery_mode << 8) | (vector & 0xff);
-	ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3));
-
-	writeq(ipi_data, ipi_addr);
-}
diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c
deleted file mode 100644
index 23bf4499a75d..000000000000
--- a/arch/ia64/kernel/irq_lsapic.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * LSAPIC Interrupt Controller
- *
- * This takes care of interrupts that are generated by the CPU's
- * internal Streamlined Advanced Programmable Interrupt Controller
- * (LSAPIC), such as the ITC and IPI interrupts.
-    *
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/sched.h>
-#include <linux/irq.h>
-
-static unsigned int
-lsapic_noop_startup (struct irq_data *data)
-{
-	return 0;
-}
-
-static void
-lsapic_noop (struct irq_data *data)
-{
-	/* nothing to do... */
-}
-
-static int lsapic_retrigger(struct irq_data *data)
-{
-	ia64_resend_irq(data->irq);
-
-	return 1;
-}
-
-struct irq_chip irq_type_ia64_lsapic = {
-	.name =			"LSAPIC",
-	.irq_startup =		lsapic_noop_startup,
-	.irq_shutdown =		lsapic_noop,
-	.irq_enable =		lsapic_noop,
-	.irq_disable =		lsapic_noop,
-	.irq_ack =		lsapic_noop,
-	.irq_retrigger =	lsapic_retrigger,
-};
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
deleted file mode 100644
index da90c49df628..000000000000
--- a/arch/ia64/kernel/ivt.S
+++ /dev/null
@@ -1,1688 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * arch/ia64/kernel/ivt.S
- *
- * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- * Copyright (C) 2000, 2002-2003 Intel Co
- *	Asit Mallick <asit.k.mallick@intel.com>
- *      Suresh Siddha <suresh.b.siddha@intel.com>
- *      Kenneth Chen <kenneth.w.chen@intel.com>
- *      Fenghua Yu <fenghua.yu@intel.com>
- *
- * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP
- * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT.
- *
- * Copyright (C) 2005 Hewlett-Packard Co
- *	Dan Magenheimer <dan.magenheimer@hp.com>
- *      Xen paravirtualization
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *                    pv_ops.
- *      Yaozu (Eddie) Dong <eddie.dong@intel.com>
- */
-/*
- * This file defines the interruption vector table used by the CPU.
- * It does not include one entry per possible cause of interruption.
- *
- * The first 20 entries of the table contain 64 bundles each while the
- * remaining 48 entries contain only 16 bundles each.
- *
- * The 64 bundles are used to allow inlining the whole handler for critical
- * interruptions like TLB misses.
- *
- *  For each entry, the comment is as follows:
- *
- *		// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
- *  entry offset ----/     /         /                  /          /
- *  entry number ---------/         /                  /          /
- *  size of the entry -------------/                  /          /
- *  vector name -------------------------------------/          /
- *  interruptions triggering this vector ----------------------/
- *
- * The table is 32KB in size and must be aligned on 32KB boundary.
- * (The CPU ignores the 15 lower bits of the address)
- *
- * Table is based upon EAS2.6 (Oct 1999)
- */
-
-#include <linux/export.h>
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/break.h>
-#include <asm/kregs.h>
-#include <asm/asm-offsets.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-#include <asm/unistd.h>
-#include <asm/errno.h>
-
-#if 0
-# define PSR_DEFAULT_BITS	psr.ac
-#else
-# define PSR_DEFAULT_BITS	0
-#endif
-
-#if 0
-  /*
-   * This lets you track the last eight faults that occurred on the CPU.  Make sure ar.k2 isn't
-   * needed for something else before enabling this...
-   */
-# define DBG_FAULT(i)	mov r16=ar.k2;;	shl r16=r16,8;;	add r16=(i),r16;;mov ar.k2=r16
-#else
-# define DBG_FAULT(i)
-#endif
-
-#include "minstate.h"
-
-#define FAULT(n)									\
-	mov r31=pr;									\
-	mov r19=n;;			/* prepare to save predicates */		\
-	br.sptk.many dispatch_to_fault_handler
-
-	.section .text..ivt,"ax"
-
-	.align 32768	// align on 32KB boundary
-	.global ia64_ivt
-	EXPORT_SYMBOL(ia64_ivt)
-ia64_ivt:
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
-ENTRY(vhpt_miss)
-	DBG_FAULT(0)
-	/*
-	 * The VHPT vector is invoked when the TLB entry for the virtual page table
-	 * is missing.  This happens only as a result of a previous
-	 * (the "original") TLB miss, which may either be caused by an instruction
-	 * fetch or a data access (or non-access).
-	 *
-	 * What we do here is normal TLB miss handing for the _original_ miss,
-	 * followed by inserting the TLB entry for the virtual page table page
-	 * that the VHPT walker was attempting to access.  The latter gets
-	 * inserted as long as page table entry above pte level have valid
-	 * mappings for the faulting address.  The TLB entry for the original
-	 * miss gets inserted only if the pte entry indicates that the page is
-	 * present.
-	 *
-	 * do_page_fault gets invoked in the following cases:
-	 *	- the faulting virtual address uses unimplemented address bits
-	 *	- the faulting virtual address has no valid page table mapping
-	 */
-	MOV_FROM_IFA(r16)			// get address that caused the TLB miss
-#ifdef CONFIG_HUGETLB_PAGE
-	movl r18=PAGE_SHIFT
-	MOV_FROM_ITIR(r25)
-#endif
-	;;
-	RSM_PSR_DT				// use physical addressing for data
-	mov r31=pr				// save the predicate registers
-	mov r19=IA64_KR(PT_BASE)		// get page table base address
-	shl r21=r16,3				// shift bit 60 into sign bit
-	shr.u r17=r16,61			// get the region number into r17
-	;;
-	shr.u r22=r21,3
-#ifdef CONFIG_HUGETLB_PAGE
-	extr.u r26=r25,2,6
-	;;
-	cmp.ne p8,p0=r18,r26
-	sub r27=r26,r18
-	;;
-(p8)	dep r25=r18,r25,2,6
-(p8)	shr r22=r22,r27
-#endif
-	;;
-	cmp.eq p6,p7=5,r17			// is IFA pointing into to region 5?
-	shr.u r18=r22,PGDIR_SHIFT		// get bottom portion of pgd index bit
-	;;
-(p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
-
-	srlz.d
-	LOAD_PHYSICAL(p6, r19, swapper_pg_dir)	// region 5 is rooted at swapper_pg_dir
-
-	.pred.rel "mutex", p6, p7
-(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
-	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=pgd_offset for region 5
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=pgd_offset for region[0-4]
-	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-#if CONFIG_PGTABLE_LEVELS == 4
-	shr.u r28=r22,PUD_SHIFT			// shift pud index into position
-#else
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-#endif
-	;;
-	ld8 r17=[r17]				// get *pgd (may be 0)
-	;;
-(p7)	cmp.eq p6,p7=r17,r0			// was pgd_present(*pgd) == NULL?
-#if CONFIG_PGTABLE_LEVELS == 4
-	dep r28=r28,r17,3,(PAGE_SHIFT-3)	// r28=pud_offset(pgd,addr)
-	;;
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-(p7)	ld8 r29=[r28]				// get *pud (may be 0)
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r29,r0		// was pud_present(*pud) == NULL?
-	dep r17=r18,r29,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pud,addr)
-#else
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pgd,addr)
-#endif
-	;;
-(p7)	ld8 r20=[r17]				// get *pmd (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift pte index into position
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was pmd_present(*pmd) == NULL?
-	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// r21=pte_offset(pmd,addr)
-	;;
-(p7)	ld8 r18=[r21]				// read *pte
-	MOV_FROM_ISR(r19)			// cr.isr bit 32 tells us if this is an insn miss
-	;;
-(p7)	tbit.z p6,p7=r18,_PAGE_P_BIT		// page present bit cleared?
-	MOV_FROM_IHA(r22)			// get the VHPT address that caused the TLB miss
-	;;					// avoid RAW on p7
-(p7)	tbit.nz.unc p10,p11=r19,32		// is it an instruction TLB miss?
-	dep r23=0,r20,0,PAGE_SHIFT		// clear low bits to get page address
-	;;
-	ITC_I_AND_D(p10, p11, r18, r24)		// insert the instruction TLB entry and
-						// insert the data TLB entry
-(p6)	br.cond.spnt.many page_fault		// handle bad address/page not present (page fault)
-	MOV_TO_IFA(r22, r24)
-
-#ifdef CONFIG_HUGETLB_PAGE
-	MOV_TO_ITIR(p8, r25, r24)		// change to default page-size for VHPT
-#endif
-
-	/*
-	 * Now compute and insert the TLB entry for the virtual page table.  We never
-	 * execute in a page table page so there is no need to set the exception deferral
-	 * bit.
-	 */
-	adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23
-	;;
-	ITC_D(p7, r24, r25)
-	;;
-#ifdef CONFIG_SMP
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	/*
-	 * Re-check pagetable entry.  If they changed, we may have received a ptc.g
-	 * between reading the pagetable and the "itc".  If so, flush the entry we
-	 * inserted and retry.  At this point, we have:
-	 *
-	 * r28 = equivalent of pud_offset(pgd, ifa)
-	 * r17 = equivalent of pmd_offset(pud, ifa)
-	 * r21 = equivalent of pte_offset(pmd, ifa)
-	 *
-	 * r29 = *pud
-	 * r20 = *pmd
-	 * r18 = *pte
-	 */
-	ld8 r25=[r21]				// read *pte again
-	ld8 r26=[r17]				// read *pmd again
-#if CONFIG_PGTABLE_LEVELS == 4
-	ld8 r19=[r28]				// read *pud again
-#endif
-	cmp.ne p6,p7=r0,r0
-	;;
-	cmp.ne.or.andcm p6,p7=r26,r20		// did *pmd change
-#if CONFIG_PGTABLE_LEVELS == 4
-	cmp.ne.or.andcm p6,p7=r19,r29		// did *pud change
-#endif
-	mov r27=PAGE_SHIFT<<2
-	;;
-(p6)	ptc.l r22,r27				// purge PTE page translation
-(p7)	cmp.ne.or.andcm p6,p7=r25,r18		// did *pte change
-	;;
-(p6)	ptc.l r16,r27				// purge translation
-#endif
-
-	mov pr=r31,-1				// restore predicate registers
-	RFI
-END(vhpt_miss)
-
-	.org ia64_ivt+0x400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0400 Entry 1 (size 64 bundles) ITLB (21)
-ENTRY(itlb_miss)
-	DBG_FAULT(1)
-	/*
-	 * The ITLB handler accesses the PTE via the virtually mapped linear
-	 * page table.  If a nested TLB miss occurs, we switch into physical
-	 * mode, walk the page table, and then re-execute the PTE read and
-	 * go on normally after that.
-	 */
-	MOV_FROM_IFA(r16)			// get virtual address
-	mov r29=b0				// save b0
-	mov r31=pr				// save predicates
-.itlb_fault:
-	MOV_FROM_IHA(r17)			// get virtual address of PTE
-	movl r30=1f				// load nested fault continuation point
-	;;
-1:	ld8 r18=[r17]				// read *pte
-	;;
-	mov b0=r29
-	tbit.z p6,p0=r18,_PAGE_P_BIT		// page present bit cleared?
-(p6)	br.cond.spnt page_fault
-	;;
-	ITC_I(p0, r18, r19)
-	;;
-#ifdef CONFIG_SMP
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r19=[r17]				// read *pte again and see if same
-	mov r20=PAGE_SHIFT<<2			// setup page size for purge
-	;;
-	cmp.ne p7,p0=r18,r19
-	;;
-(p7)	ptc.l r16,r20
-#endif
-	mov pr=r31,-1
-	RFI
-END(itlb_miss)
-
-	.org ia64_ivt+0x0800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
-ENTRY(dtlb_miss)
-	DBG_FAULT(2)
-	/*
-	 * The DTLB handler accesses the PTE via the virtually mapped linear
-	 * page table.  If a nested TLB miss occurs, we switch into physical
-	 * mode, walk the page table, and then re-execute the PTE read and
-	 * go on normally after that.
-	 */
-	MOV_FROM_IFA(r16)			// get virtual address
-	mov r29=b0				// save b0
-	mov r31=pr				// save predicates
-dtlb_fault:
-	MOV_FROM_IHA(r17)			// get virtual address of PTE
-	movl r30=1f				// load nested fault continuation point
-	;;
-1:	ld8 r18=[r17]				// read *pte
-	;;
-	mov b0=r29
-	tbit.z p6,p0=r18,_PAGE_P_BIT		// page present bit cleared?
-(p6)	br.cond.spnt page_fault
-	;;
-	ITC_D(p0, r18, r19)
-	;;
-#ifdef CONFIG_SMP
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r19=[r17]				// read *pte again and see if same
-	mov r20=PAGE_SHIFT<<2			// setup page size for purge
-	;;
-	cmp.ne p7,p0=r18,r19
-	;;
-(p7)	ptc.l r16,r20
-#endif
-	mov pr=r31,-1
-	RFI
-END(dtlb_miss)
-
-	.org ia64_ivt+0x0c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
-ENTRY(alt_itlb_miss)
-	DBG_FAULT(3)
-	MOV_FROM_IFA(r16)	// get address that caused the TLB miss
-	movl r17=PAGE_KERNEL
-	MOV_FROM_IPSR(p0, r21)
-	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-	mov r31=pr
-	;;
-#ifdef CONFIG_DISABLE_VHPT
-	shr.u r22=r16,61			// get the region number into r21
-	;;
-	cmp.gt p8,p0=6,r22			// user mode
-	;;
-	THASH(p8, r17, r16, r23)
-	;;
-	MOV_TO_IHA(p8, r17, r23)
-(p8)	mov r29=b0				// save b0
-(p8)	br.cond.dptk .itlb_fault
-#endif
-	extr.u r23=r21,IA64_PSR_CPL0_BIT,2	// extract psr.cpl
-	and r19=r19,r16		// clear ed, reserved bits, and PTE control bits
-	shr.u r18=r16,57	// move address bit 61 to bit 4
-	;;
-	andcm r18=0x10,r18	// bit 4=~address-bit(61)
-	cmp.ne p8,p0=r0,r23	// psr.cpl != 0?
-	or r19=r17,r19		// insert PTE control bits into r19
-	;;
-	or r19=r19,r18		// set bit 4 (uncached) if the access was to region 6
-(p8)	br.cond.spnt page_fault
-	;;
-	ITC_I(p0, r19, r18)	// insert the TLB entry
-	mov pr=r31,-1
-	RFI
-END(alt_itlb_miss)
-
-	.org ia64_ivt+0x1000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
-ENTRY(alt_dtlb_miss)
-	DBG_FAULT(4)
-	MOV_FROM_IFA(r16)	// get address that caused the TLB miss
-	movl r17=PAGE_KERNEL
-	MOV_FROM_ISR(r20)
-	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-	MOV_FROM_IPSR(p0, r21)
-	mov r31=pr
-	mov r24=PERCPU_ADDR
-	;;
-#ifdef CONFIG_DISABLE_VHPT
-	shr.u r22=r16,61			// get the region number into r21
-	;;
-	cmp.gt p8,p0=6,r22			// access to region 0-5
-	;;
-	THASH(p8, r17, r16, r25)
-	;;
-	MOV_TO_IHA(p8, r17, r25)
-(p8)	mov r29=b0				// save b0
-(p8)	br.cond.dptk dtlb_fault
-#endif
-	cmp.ge p10,p11=r16,r24			// access to per_cpu_data?
-	tbit.z p12,p0=r16,61			// access to region 6?
-	mov r25=PERCPU_PAGE_SHIFT << 2
-	mov r26=PERCPU_PAGE_SIZE
-	nop.m 0
-	nop.b 0
-	;;
-(p10)	mov r19=IA64_KR(PER_CPU_DATA)
-(p11)	and r19=r19,r16				// clear non-ppn fields
-	extr.u r23=r21,IA64_PSR_CPL0_BIT,2	// extract psr.cpl
-	and r22=IA64_ISR_CODE_MASK,r20		// get the isr.code field
-	tbit.nz p6,p7=r20,IA64_ISR_SP_BIT	// is speculation bit on?
-	tbit.nz p9,p0=r20,IA64_ISR_NA_BIT	// is non-access bit on?
-	;;
-(p10)	sub r19=r19,r26
-	MOV_TO_ITIR(p10, r25, r24)
-	cmp.ne p8,p0=r0,r23
-(p9)	cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22	// check isr.code field
-(p12)	dep r17=-1,r17,4,1			// set ma=UC for region 6 addr
-(p8)	br.cond.spnt page_fault
-
-	dep r21=-1,r21,IA64_PSR_ED_BIT,1
-	;;
-	or r19=r19,r17		// insert PTE control bits into r19
-	MOV_TO_IPSR(p6, r21, r24)
-	;;
-	ITC_D(p7, r19, r18)	// insert the TLB entry
-	mov pr=r31,-1
-	RFI
-END(alt_dtlb_miss)
-
-	.org ia64_ivt+0x1400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
-ENTRY(nested_dtlb_miss)
-	/*
-	 * In the absence of kernel bugs, we get here when the virtually mapped linear
-	 * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction
-	 * Access-bit, or Data Access-bit faults).  If the DTLB entry for the virtual page
-	 * table is missing, a nested TLB miss fault is triggered and control is
-	 * transferred to this point.  When this happens, we lookup the pte for the
-	 * faulting address by walking the page table in physical mode and return to the
-	 * continuation point passed in register r30 (or call page_fault if the address is
-	 * not mapped).
-	 *
-	 * Input:	r16:	faulting address
-	 *		r29:	saved b0
-	 *		r30:	continuation address
-	 *		r31:	saved pr
-	 *
-	 * Output:	r17:	physical address of PTE of faulting address
-	 *		r29:	saved b0
-	 *		r30:	continuation address
-	 *		r31:	saved pr
-	 *
-	 * Clobbered:	b0, r18, r19, r21, r22, psr.dt (cleared)
-	 */
-	RSM_PSR_DT				// switch to using physical data addressing
-	mov r19=IA64_KR(PT_BASE)		// get the page table base address
-	shl r21=r16,3				// shift bit 60 into sign bit
-	MOV_FROM_ITIR(r18)
-	;;
-	shr.u r17=r16,61			// get the region number into r17
-	extr.u r18=r18,2,6			// get the faulting page size
-	;;
-	cmp.eq p6,p7=5,r17			// is faulting address in region 5?
-	add r22=-PAGE_SHIFT,r18			// adjustment for hugetlb address
-	add r18=PGDIR_SHIFT-PAGE_SHIFT,r18
-	;;
-	shr.u r22=r16,r22
-	shr.u r18=r16,r18
-(p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
-
-	srlz.d
-	LOAD_PHYSICAL(p6, r19, swapper_pg_dir)	// region 5 is rooted at swapper_pg_dir
-
-	.pred.rel "mutex", p6, p7
-(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
-	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=pgd_offset for region 5
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=pgd_offset for region[0-4]
-	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-#if CONFIG_PGTABLE_LEVELS == 4
-	shr.u r18=r22,PUD_SHIFT			// shift pud index into position
-#else
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-#endif
-	;;
-	ld8 r17=[r17]				// get *pgd (may be 0)
-	;;
-(p7)	cmp.eq p6,p7=r17,r0			// was pgd_present(*pgd) == NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=p[u|m]d_offset(pgd,addr)
-	;;
-#if CONFIG_PGTABLE_LEVELS == 4
-(p7)	ld8 r17=[r17]				// get *pud (may be 0)
-	shr.u r18=r22,PMD_SHIFT			// shift pmd index into position
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was pud_present(*pud) == NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// r17=pmd_offset(pud,addr)
-	;;
-#endif
-(p7)	ld8 r17=[r17]				// get *pmd (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift pte index into position
-	;;
-(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was pmd_present(*pmd) == NULL?
-	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// r17=pte_offset(pmd,addr);
-(p6)	br.cond.spnt page_fault
-	mov b0=r30
-	br.sptk.many b0				// return to continuation point
-END(nested_dtlb_miss)
-
-	.org ia64_ivt+0x1800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
-ENTRY(ikey_miss)
-	DBG_FAULT(6)
-	FAULT(6)
-END(ikey_miss)
-
-	.org ia64_ivt+0x1c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
-ENTRY(dkey_miss)
-	DBG_FAULT(7)
-	FAULT(7)
-END(dkey_miss)
-
-	.org ia64_ivt+0x2000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
-ENTRY(dirty_bit)
-	DBG_FAULT(8)
-	/*
-	 * What we do here is to simply turn on the dirty bit in the PTE.  We need to
-	 * update both the page-table and the TLB entry.  To efficiently access the PTE,
-	 * we address it through the virtual page table.  Most likely, the TLB entry for
-	 * the relevant virtual page table page is still present in the TLB so we can
-	 * normally do this without additional TLB misses.  In case the necessary virtual
-	 * page table TLB entry isn't present, we take a nested TLB miss hit where we look
-	 * up the physical address of the L3 PTE and then continue at label 1 below.
-	 */
-	MOV_FROM_IFA(r16)			// get the address that caused the fault
-	movl r30=1f				// load continuation point in case of nested fault
-	;;
-	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
-	mov r29=b0				// save b0 in case of nested fault
-	mov r31=pr				// save pr
-#ifdef CONFIG_SMP
-	mov r28=ar.ccv				// save ar.ccv
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	mov ar.ccv=r18				// set compare value for cmpxchg
-	or r25=_PAGE_D|_PAGE_A,r18		// set the dirty and accessed bits
-	tbit.z p7,p6 = r18,_PAGE_P_BIT		// Check present bit
-	;;
-(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only update if page is present
-	mov r24=PAGE_SHIFT<<2
-	;;
-(p6)	cmp.eq p6,p7=r26,r18			// Only compare if page is present
-	;;
-	ITC_D(p6, r25, r18)			// install updated PTE
-	;;
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r18=[r17]				// read PTE again
-	;;
-	cmp.eq p6,p7=r18,r25			// is it same as the newly installed
-	;;
-(p7)	ptc.l r16,r24
-	mov b0=r29				// restore b0
-	mov ar.ccv=r28
-#else
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	or r18=_PAGE_D|_PAGE_A,r18		// set the dirty and accessed bits
-	mov b0=r29				// restore b0
-	;;
-	st8 [r17]=r18				// store back updated PTE
-	ITC_D(p0, r18, r16)			// install updated PTE
-#endif
-	mov pr=r31,-1				// restore pr
-	RFI
-END(dirty_bit)
-
-	.org ia64_ivt+0x2400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
-ENTRY(iaccess_bit)
-	DBG_FAULT(9)
-	// Like Entry 8, except for instruction access
-	MOV_FROM_IFA(r16)			// get the address that caused the fault
-	movl r30=1f				// load continuation point in case of nested fault
-	mov r31=pr				// save predicates
-#ifdef CONFIG_ITANIUM
-	/*
-	 * Erratum 10 (IFA may contain incorrect address) has "NoFix" status.
-	 */
-	MOV_FROM_IPSR(p0, r17)
-	;;
-	MOV_FROM_IIP(r18)
-	tbit.z p6,p0=r17,IA64_PSR_IS_BIT	// IA64 instruction set?
-	;;
-(p6)	mov r16=r18				// if so, use cr.iip instead of cr.ifa
-#endif /* CONFIG_ITANIUM */
-	;;
-	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
-	mov r29=b0				// save b0 in case of nested fault)
-#ifdef CONFIG_SMP
-	mov r28=ar.ccv				// save ar.ccv
-	;;
-1:	ld8 r18=[r17]
-	;;
-	mov ar.ccv=r18				// set compare value for cmpxchg
-	or r25=_PAGE_A,r18			// set the accessed bit
-	tbit.z p7,p6 = r18,_PAGE_P_BIT	 	// Check present bit
-	;;
-(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only if page present
-	mov r24=PAGE_SHIFT<<2
-	;;
-(p6)	cmp.eq p6,p7=r26,r18			// Only if page present
-	;;
-	ITC_I(p6, r25, r26)			// install updated PTE
-	;;
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-
-	ld8 r18=[r17]				// read PTE again
-	;;
-	cmp.eq p6,p7=r18,r25			// is it same as the newly installed
-	;;
-(p7)	ptc.l r16,r24
-	mov b0=r29				// restore b0
-	mov ar.ccv=r28
-#else /* !CONFIG_SMP */
-	;;
-1:	ld8 r18=[r17]
-	;;
-	or r18=_PAGE_A,r18			// set the accessed bit
-	mov b0=r29				// restore b0
-	;;
-	st8 [r17]=r18				// store back updated PTE
-	ITC_I(p0, r18, r16)			// install updated PTE
-#endif /* !CONFIG_SMP */
-	mov pr=r31,-1
-	RFI
-END(iaccess_bit)
-
-	.org ia64_ivt+0x2800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
-ENTRY(daccess_bit)
-	DBG_FAULT(10)
-	// Like Entry 8, except for data access
-	MOV_FROM_IFA(r16)			// get the address that caused the fault
-	movl r30=1f				// load continuation point in case of nested fault
-	;;
-	THASH(p0, r17, r16, r18)		// compute virtual address of L3 PTE
-	mov r31=pr
-	mov r29=b0				// save b0 in case of nested fault)
-#ifdef CONFIG_SMP
-	mov r28=ar.ccv				// save ar.ccv
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	mov ar.ccv=r18				// set compare value for cmpxchg
-	or r25=_PAGE_A,r18			// set the dirty bit
-	tbit.z p7,p6 = r18,_PAGE_P_BIT		// Check present bit
-	;;
-(p6)	cmpxchg8.acq r26=[r17],r25,ar.ccv	// Only if page is present
-	mov r24=PAGE_SHIFT<<2
-	;;
-(p6)	cmp.eq p6,p7=r26,r18			// Only if page is present
-	;;
-	ITC_D(p6, r25, r26)			// install updated PTE
-	/*
-	 * Tell the assemblers dependency-violation checker that the above "itc" instructions
-	 * cannot possibly affect the following loads:
-	 */
-	dv_serialize_data
-	;;
-	ld8 r18=[r17]				// read PTE again
-	;;
-	cmp.eq p6,p7=r18,r25			// is it same as the newly installed
-	;;
-(p7)	ptc.l r16,r24
-	mov ar.ccv=r28
-#else
-	;;
-1:	ld8 r18=[r17]
-	;;					// avoid RAW on r18
-	or r18=_PAGE_A,r18			// set the accessed bit
-	;;
-	st8 [r17]=r18				// store back updated PTE
-	ITC_D(p0, r18, r16)			// install updated PTE
-#endif
-	mov b0=r29				// restore b0
-	mov pr=r31,-1
-	RFI
-END(daccess_bit)
-
-	.org ia64_ivt+0x2c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
-ENTRY(break_fault)
-	/*
-	 * The streamlined system call entry/exit paths only save/restore the initial part
-	 * of pt_regs.  This implies that the callers of system-calls must adhere to the
-	 * normal procedure calling conventions.
-	 *
-	 *   Registers to be saved & restored:
-	 *	CR registers: cr.ipsr, cr.iip, cr.ifs
-	 *	AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
-	 * 	others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
-	 *   Registers to be restored only:
-	 * 	r8-r11: output value from the system call.
-	 *
-	 * During system call exit, scratch registers (including r15) are modified/cleared
-	 * to prevent leaking bits from kernel to user level.
-	 */
-	DBG_FAULT(11)
-	mov.m r16=IA64_KR(CURRENT)		// M2 r16 <- current task (12 cyc)
-	MOV_FROM_IPSR(p0, r29)			// M2 (12 cyc)
-	mov r31=pr				// I0 (2 cyc)
-
-	MOV_FROM_IIM(r17)			// M2 (2 cyc)
-	mov.m r27=ar.rsc			// M2 (12 cyc)
-	mov r18=__IA64_BREAK_SYSCALL		// A
-
-	mov.m ar.rsc=0				// M2
-	mov.m r21=ar.fpsr			// M2 (12 cyc)
-	mov r19=b6				// I0 (2 cyc)
-	;;
-	mov.m r23=ar.bspstore			// M2 (12 cyc)
-	mov.m r24=ar.rnat			// M2 (5 cyc)
-	mov.i r26=ar.pfs			// I0 (2 cyc)
-
-	invala					// M0|1
-	nop.m 0					// M
-	mov r20=r1				// A			save r1
-
-	nop.m 0
-	movl r30=sys_call_table			// X
-
-	MOV_FROM_IIP(r28)			// M2 (2 cyc)
-	cmp.eq p0,p7=r18,r17			// I0 is this a system call?
-(p7)	br.cond.spnt non_syscall		// B  no ->
-	//
-	// From this point on, we are definitely on the syscall-path
-	// and we can use (non-banked) scratch registers.
-	//
-///////////////////////////////////////////////////////////////////////
-	mov r1=r16				// A    move task-pointer to "addl"-addressable reg
-	mov r2=r16				// A    setup r2 for ia64_syscall_setup
-	add r9=TI_FLAGS+IA64_TASK_SIZE,r16	// A	r9 = &current_thread_info()->flags
-
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16
-	adds r15=-1024,r15			// A    subtract 1024 from syscall number
-	mov r3=NR_syscalls - 1
-	;;
-	ld1.bias r17=[r16]			// M0|1 r17 = current->thread.on_ustack flag
-	ld4 r9=[r9]				// M0|1 r9 = current_thread_info()->flags
-	extr.u r8=r29,41,2			// I0   extract ei field from cr.ipsr
-
-	shladd r30=r15,3,r30			// A    r30 = sys_call_table + 8*(syscall-1024)
-	addl r22=IA64_RBS_OFFSET,r1		// A    compute base of RBS
-	cmp.leu p6,p7=r15,r3			// A    syscall number in range?
-	;;
-
-	lfetch.fault.excl.nt1 [r22]		// M0|1 prefetch RBS
-(p6)	ld8 r30=[r30]				// M0|1 load address of syscall entry point
-	tnat.nz.or p7,p0=r15			// I0	is syscall nr a NaT?
-
-	mov.m ar.bspstore=r22			// M2   switch to kernel RBS
-	cmp.eq p8,p9=2,r8			// A    isr.ei==2?
-	;;
-
-(p8)	mov r8=0				// A    clear ei to 0
-(p7)	movl r30=sys_ni_syscall			// X
-
-(p8)	adds r28=16,r28				// A    switch cr.iip to next bundle
-(p9)	adds r8=1,r8				// A    increment ei to next slot
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	;;
-	mov b6=r30				// I0   setup syscall handler branch reg early
-#else
-	nop.i 0
-	;;
-#endif
-
-	mov.m r25=ar.unat			// M2 (5 cyc)
-	dep r29=r8,r29,41,2			// I0   insert new ei into cr.ipsr
-	adds r15=1024,r15			// A    restore original syscall number
-	//
-	// If any of the above loads miss in L1D, we'll stall here until
-	// the data arrives.
-	//
-///////////////////////////////////////////////////////////////////////
-	st1 [r16]=r0				// M2|3 clear current->thread.on_ustack flag
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	MOV_FROM_ITC(p0, p14, r30, r18)		// M    get cycle for accounting
-#else
-	mov b6=r30				// I0   setup syscall handler branch reg early
-#endif
-	cmp.eq pKStk,pUStk=r0,r17		// A    were we on kernel stacks already?
-
-	and r9=_TIF_SYSCALL_TRACEAUDIT,r9	// A    mask trace or audit
-	mov r18=ar.bsp				// M2 (12 cyc)
-(pKStk)	br.cond.spnt .break_fixup		// B	we're already in kernel-mode -- fix up RBS
-	;;
-.back_from_break_fixup:
-(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A    compute base of memory stack
-	cmp.eq p14,p0=r9,r0			// A    are syscalls being traced/audited?
-	br.call.sptk.many b7=ia64_syscall_setup	// B
-1:
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	// mov.m r30=ar.itc is called in advance, and r13 is current
-	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13	// A
-	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13	// A
-(pKStk)	br.cond.spnt .skip_accounting		// B	unlikely skip
-	;;
-	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// M  get last stamp
-	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// M  time at leave
-	;;
-	ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME	// M  cumulated stime
-	ld8 r21=[r17]				// M  cumulated utime
-	sub r22=r19,r18				// A  stime before leave
-	;;
-	st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP	// M  update stamp
-	sub r18=r30,r19				// A  elapsed time in user
-	;;
-	add r20=r20,r22				// A  sum stime
-	add r21=r21,r18				// A  sum utime
-	;;
-	st8 [r16]=r20				// M  update stime
-	st8 [r17]=r21				// M  update utime
-	;;
-.skip_accounting:
-#endif
-	mov ar.rsc=0x3				// M2   set eager mode, pl 0, LE, loadrs=0
-	nop 0
-	BSW_1(r2, r14)				// B (6 cyc) regs are saved, switch to bank 1
-	;;
-
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r16)	// M2	now it's safe to re-enable intr.-collection
-						// M0   ensure interruption collection is on
-	movl r3=ia64_ret_from_syscall		// X
-	;;
-	mov rp=r3				// I0   set the real return addr
-(p10)	br.cond.spnt.many ia64_ret_from_syscall	// B    return if bad call-frame or r15 is a NaT
-
-	SSM_PSR_I(p15, p15, r16)		// M2   restore psr.i
-(p14)	br.call.sptk.many b6=b6			// B    invoke syscall-handker (ignore return addr)
-	br.cond.spnt.many ia64_trace_syscall	// B	do syscall-tracing thingamagic
-	// NOT REACHED
-///////////////////////////////////////////////////////////////////////
-	// On entry, we optimistically assumed that we're coming from user-space.
-	// For the rare cases where a system-call is done from within the kernel,
-	// we fix things up at this point:
-.break_fixup:
-	add r1=-IA64_PT_REGS_SIZE,sp		// A    allocate space for pt_regs structure
-	mov ar.rnat=r24				// M2	restore kernel's AR.RNAT
-	;;
-	mov ar.bspstore=r23			// M2	restore kernel's AR.BSPSTORE
-	br.cond.sptk .back_from_break_fixup
-END(break_fault)
-
-	.org ia64_ivt+0x3000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
-ENTRY(interrupt)
-	/* interrupt handler has become too big to fit this area. */
-	br.sptk.many __interrupt
-END(interrupt)
-
-	.org ia64_ivt+0x3400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3400 Entry 13 (size 64 bundles) Reserved
-	DBG_FAULT(13)
-	FAULT(13)
-
-	.org ia64_ivt+0x3800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3800 Entry 14 (size 64 bundles) Reserved
-	DBG_FAULT(14)
-	FAULT(14)
-
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 *
-	 * ia64_syscall_setup() is a separate subroutine so that it can
-	 *	allocate stacked registers so it can safely demine any
-	 *	potential NaT values from the input registers.
-	 *
-	 * On entry:
-	 *	- executing on bank 0 or bank 1 register set (doesn't matter)
-	 *	-  r1: stack pointer
-	 *	-  r2: current task pointer
-	 *	-  r3: preserved
-	 *	- r11: original contents (saved ar.pfs to be saved)
-	 *	- r12: original contents (sp to be saved)
-	 *	- r13: original contents (tp to be saved)
-	 *	- r15: original contents (syscall # to be saved)
-	 *	- r18: saved bsp (after switching to kernel stack)
-	 *	- r19: saved b6
-	 *	- r20: saved r1 (gp)
-	 *	- r21: saved ar.fpsr
-	 *	- r22: kernel's register backing store base (krbs_base)
-	 *	- r23: saved ar.bspstore
-	 *	- r24: saved ar.rnat
-	 *	- r25: saved ar.unat
-	 *	- r26: saved ar.pfs
-	 *	- r27: saved ar.rsc
-	 *	- r28: saved cr.iip
-	 *	- r29: saved cr.ipsr
-	 *	- r30: ar.itc for accounting (don't touch)
-	 *	- r31: saved pr
-	 *	-  b0: original contents (to be saved)
-	 * On exit:
-	 *	-  p10: TRUE if syscall is invoked with more than 8 out
-	 *		registers or r15's Nat is true
-	 *	-  r1: kernel's gp
-	 *	-  r3: preserved (same as on entry)
-	 *	-  r8: -EINVAL if p10 is true
-	 *	- r12: points to kernel stack
-	 *	- r13: points to current task
-	 *	- r14: preserved (same as on entry)
-	 *	- p13: preserved
-	 *	- p15: TRUE if interrupts need to be re-enabled
-	 *	- ar.fpsr: set to kernel settings
-	 *	-  b6: preserved (same as on entry)
-	 */
-GLOBAL_ENTRY(ia64_syscall_setup)
-#if PT(B6) != 0
-# error This code assumes that b6 is the first field in pt_regs.
-#endif
-	st8 [r1]=r19				// save b6
-	add r16=PT(CR_IPSR),r1			// initialize first base pointer
-	add r17=PT(R11),r1			// initialize second base pointer
-	;;
-	alloc r19=ar.pfs,8,0,0,0		// ensure in0-in7 are writable
-	st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR)	// save cr.ipsr
-	tnat.nz p8,p0=in0
-
-	st8.spill [r17]=r11,PT(CR_IIP)-PT(R11)	// save r11
-	tnat.nz p9,p0=in1
-(pKStk)	mov r18=r0				// make sure r18 isn't NaT
-	;;
-
-	st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS)	// save ar.pfs
-	st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP)	// save cr.iip
-	mov r28=b0				// save b0 (2 cyc)
-	;;
-
-	st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT)	// save ar.unat
-	dep r19=0,r19,38,26			// clear all bits but 0..37 [I0]
-(p8)	mov in0=-1
-	;;
-
-	st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS)	// store ar.pfs.pfm in cr.ifs
-	extr.u r11=r19,7,7	// I0		// get sol of ar.pfs
-	and r8=0x7f,r19		// A		// get sof of ar.pfs
-
-	st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc
-	tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0
-(p9)	mov in1=-1
-	;;
-
-(pUStk) sub r18=r18,r22				// r18=RSE.ndirty*8
-	tnat.nz p10,p0=in2
-	add r11=8,r11
-	;;
-(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16		// skip over ar_rnat field
-(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17	// skip over ar_bspstore field
-	tnat.nz p11,p0=in3
-	;;
-(p10)	mov in2=-1
-	tnat.nz p12,p0=in4				// [I0]
-(p11)	mov in3=-1
-	;;
-(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT)	// save ar.rnat
-(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE)	// save ar.bspstore
-	shl r18=r18,16				// compute ar.rsc to be used for "loadrs"
-	;;
-	st8 [r16]=r31,PT(LOADRS)-PT(PR)		// save predicates
-	st8 [r17]=r28,PT(R1)-PT(B0)		// save b0
-	tnat.nz p13,p0=in5				// [I0]
-	;;
-	st8 [r16]=r18,PT(R12)-PT(LOADRS)	// save ar.rsc value for "loadrs"
-	st8.spill [r17]=r20,PT(R13)-PT(R1)	// save original r1
-(p12)	mov in4=-1
-	;;
-
-.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12)	// save r12
-.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13)		// save r13
-(p13)	mov in5=-1
-	;;
-	st8 [r16]=r21,PT(R8)-PT(AR_FPSR)	// save ar.fpsr
-	tnat.nz p13,p0=in6
-	cmp.lt p10,p9=r11,r8	// frame size can't be more than local+8
-	;;
-	mov r8=1
-(p9)	tnat.nz p10,p0=r15
-	adds r12=-16,r1		// switch to kernel memory stack (with 16 bytes of scratch)
-
-	st8.spill [r17]=r15			// save r15
-	tnat.nz p8,p0=in7
-	nop.i 0
-
-	mov r13=r2				// establish `current'
-	movl r1=__gp				// establish kernel global pointer
-	;;
-	st8 [r16]=r8		// ensure pt_regs.r8 != 0 (see handle_syscall_error)
-(p13)	mov in6=-1
-(p8)	mov in7=-1
-
-	cmp.eq pSys,pNonSys=r0,r0		// set pSys=1, pNonSys=0
-	movl r17=FPSR_DEFAULT
-	;;
-	mov.m ar.fpsr=r17			// set ar.fpsr to kernel default value
-(p10)	mov r8=-EINVAL
-	br.ret.sptk.many b7
-END(ia64_syscall_setup)
-
-	.org ia64_ivt+0x3c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x3c00 Entry 15 (size 64 bundles) Reserved
-	DBG_FAULT(15)
-	FAULT(15)
-
-	.org ia64_ivt+0x4000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4000 Entry 16 (size 64 bundles) Reserved
-	DBG_FAULT(16)
-	FAULT(16)
-
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
-	/*
-	 * There is no particular reason for this code to be here, other than
-	 * that there happens to be space here that would go unused otherwise.
-	 * If this fault ever gets "unreserved", simply moved the following
-	 * code to a more suitable spot...
-	 *
-	 * account_sys_enter is called from SAVE_MIN* macros if accounting is
-	 * enabled and if the macro is entered from user mode.
-	 */
-GLOBAL_ENTRY(account_sys_enter)
-	// mov.m r20=ar.itc is called in advance, and r13 is current
-	add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13
-	add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13
-	;;
-	ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP	// time at last check in kernel
-	ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE	// time at left from kernel
-        ;;
-	ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME	// cumulated stime
-	ld8 r21=[r17]				// cumulated utime
-	sub r22=r19,r18				// stime before leave kernel
-	;;
-	st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP	// update stamp
-	sub r18=r20,r19				// elapsed time in user mode
-	;;
-	add r23=r23,r22				// sum stime
-	add r21=r21,r18				// sum utime
-	;;
-	st8 [r16]=r23				// update stime
-	st8 [r17]=r21				// update utime
-	;;
-	br.ret.sptk.many rp
-END(account_sys_enter)
-#endif
-
-	.org ia64_ivt+0x4400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4400 Entry 17 (size 64 bundles) Reserved
-	DBG_FAULT(17)
-	FAULT(17)
-
-	.org ia64_ivt+0x4800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4800 Entry 18 (size 64 bundles) Reserved
-	DBG_FAULT(18)
-	FAULT(18)
-
-	.org ia64_ivt+0x4c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x4c00 Entry 19 (size 64 bundles) Reserved
-	DBG_FAULT(19)
-	FAULT(19)
-
-//
-// --- End of long entries, Beginning of short entries
-//
-
-	.org ia64_ivt+0x5000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49)
-ENTRY(page_not_present)
-	DBG_FAULT(20)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	/*
-	 * The Linux page fault handler doesn't expect non-present pages to be in
-	 * the TLB.  Flush the existing entry now, so we meet that expectation.
-	 */
-	mov r17=PAGE_SHIFT<<2
-	;;
-	ptc.l r16,r17
-	;;
-	mov r31=pr
-	srlz.d
-	br.sptk.many page_fault
-END(page_not_present)
-
-	.org ia64_ivt+0x5100
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52)
-ENTRY(key_permission)
-	DBG_FAULT(21)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	mov r31=pr
-	;;
-	srlz.d
-	br.sptk.many page_fault
-END(key_permission)
-
-	.org ia64_ivt+0x5200
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
-ENTRY(iaccess_rights)
-	DBG_FAULT(22)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	mov r31=pr
-	;;
-	srlz.d
-	br.sptk.many page_fault
-END(iaccess_rights)
-
-	.org ia64_ivt+0x5300
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
-ENTRY(daccess_rights)
-	DBG_FAULT(23)
-	MOV_FROM_IFA(r16)
-	RSM_PSR_DT
-	mov r31=pr
-	;;
-	srlz.d
-	br.sptk.many page_fault
-END(daccess_rights)
-
-	.org ia64_ivt+0x5400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
-ENTRY(general_exception)
-	DBG_FAULT(24)
-	MOV_FROM_ISR(r16)
-	mov r31=pr
-	;;
-	cmp4.eq p6,p0=0,r16
-(p6)	br.sptk.many dispatch_illegal_op_fault
-	;;
-	mov r19=24		// fault number
-	br.sptk.many dispatch_to_fault_handler
-END(general_exception)
-
-	.org ia64_ivt+0x5500
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
-ENTRY(disabled_fp_reg)
-	DBG_FAULT(25)
-	rsm psr.dfh		// ensure we can access fph
-	;;
-	srlz.d
-	mov r31=pr
-	mov r19=25
-	br.sptk.many dispatch_to_fault_handler
-END(disabled_fp_reg)
-
-	.org ia64_ivt+0x5600
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
-ENTRY(nat_consumption)
-	DBG_FAULT(26)
-
-	MOV_FROM_IPSR(p0, r16)
-	MOV_FROM_ISR(r17)
-	mov r31=pr				// save PR
-	;;
-	and r18=0xf,r17				// r18 = cr.ipsr.code{3:0}
-	tbit.z p6,p0=r17,IA64_ISR_NA_BIT
-	;;
-	cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18
-	dep r16=-1,r16,IA64_PSR_ED_BIT,1
-(p6)	br.cond.spnt 1f		// branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH)
-	;;
-	MOV_TO_IPSR(p0, r16, r18)
-	mov pr=r31,-1
-	;;
-	RFI
-
-1:	mov pr=r31,-1
-	;;
-	FAULT(26)
-END(nat_consumption)
-
-	.org ia64_ivt+0x5700
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5700 Entry 27 (size 16 bundles) Speculation (40)
-ENTRY(speculation_vector)
-	DBG_FAULT(27)
-	/*
-	 * A [f]chk.[as] instruction needs to take the branch to the recovery code but
-	 * this part of the architecture is not implemented in hardware on some CPUs, such
-	 * as Itanium.  Thus, in general we need to emulate the behavior.  IIM contains
-	 * the relative target (not yet sign extended).  So after sign extending it we
-	 * simply add it to IIP.  We also need to reset the EI field of the IPSR to zero,
-	 * i.e., the slot to restart into.
-	 *
-	 * cr.imm contains zero_ext(imm21)
-	 */
-	MOV_FROM_IIM(r18)
-	;;
-	MOV_FROM_IIP(r17)
-	shl r18=r18,43			// put sign bit in position (43=64-21)
-	;;
-
-	MOV_FROM_IPSR(p0, r16)
-	shr r18=r18,39			// sign extend (39=43-4)
-	;;
-
-	add r17=r17,r18			// now add the offset
-	;;
-	MOV_TO_IIP(r17, r19)
-	dep r16=0,r16,41,2		// clear EI
-	;;
-
-	MOV_TO_IPSR(p0, r16, r19)
-	;;
-
-	RFI
-END(speculation_vector)
-
-	.org ia64_ivt+0x5800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5800 Entry 28 (size 16 bundles) Reserved
-	DBG_FAULT(28)
-	FAULT(28)
-
-	.org ia64_ivt+0x5900
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
-ENTRY(debug_vector)
-	DBG_FAULT(29)
-	FAULT(29)
-END(debug_vector)
-
-	.org ia64_ivt+0x5a00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
-ENTRY(unaligned_access)
-	DBG_FAULT(30)
-	mov r31=pr		// prepare to save predicates
-	;;
-	br.sptk.many dispatch_unaligned_handler
-END(unaligned_access)
-
-	.org ia64_ivt+0x5b00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
-ENTRY(unsupported_data_reference)
-	DBG_FAULT(31)
-	FAULT(31)
-END(unsupported_data_reference)
-
-	.org ia64_ivt+0x5c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64)
-ENTRY(floating_point_fault)
-	DBG_FAULT(32)
-	FAULT(32)
-END(floating_point_fault)
-
-	.org ia64_ivt+0x5d00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
-ENTRY(floating_point_trap)
-	DBG_FAULT(33)
-	FAULT(33)
-END(floating_point_trap)
-
-	.org ia64_ivt+0x5e00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
-ENTRY(lower_privilege_trap)
-	DBG_FAULT(34)
-	FAULT(34)
-END(lower_privilege_trap)
-
-	.org ia64_ivt+0x5f00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
-ENTRY(taken_branch_trap)
-	DBG_FAULT(35)
-	FAULT(35)
-END(taken_branch_trap)
-
-	.org ia64_ivt+0x6000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
-ENTRY(single_step_trap)
-	DBG_FAULT(36)
-	FAULT(36)
-END(single_step_trap)
-
-	.org ia64_ivt+0x6100
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6100 Entry 37 (size 16 bundles) Reserved
-	DBG_FAULT(37)
-	FAULT(37)
-
-	.org ia64_ivt+0x6200
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6200 Entry 38 (size 16 bundles) Reserved
-	DBG_FAULT(38)
-	FAULT(38)
-
-	.org ia64_ivt+0x6300
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6300 Entry 39 (size 16 bundles) Reserved
-	DBG_FAULT(39)
-	FAULT(39)
-
-	.org ia64_ivt+0x6400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6400 Entry 40 (size 16 bundles) Reserved
-	DBG_FAULT(40)
-	FAULT(40)
-
-	.org ia64_ivt+0x6500
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6500 Entry 41 (size 16 bundles) Reserved
-	DBG_FAULT(41)
-	FAULT(41)
-
-	.org ia64_ivt+0x6600
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6600 Entry 42 (size 16 bundles) Reserved
-	DBG_FAULT(42)
-	FAULT(42)
-
-	.org ia64_ivt+0x6700
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6700 Entry 43 (size 16 bundles) Reserved
-	DBG_FAULT(43)
-	FAULT(43)
-
-	.org ia64_ivt+0x6800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6800 Entry 44 (size 16 bundles) Reserved
-	DBG_FAULT(44)
-	FAULT(44)
-
-	.org ia64_ivt+0x6900
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
-ENTRY(ia32_exception)
-	DBG_FAULT(45)
-	FAULT(45)
-END(ia32_exception)
-
-	.org ia64_ivt+0x6a00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
-ENTRY(ia32_intercept)
-	DBG_FAULT(46)
-	FAULT(46)
-END(ia32_intercept)
-
-	.org ia64_ivt+0x6b00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt  (74)
-ENTRY(ia32_interrupt)
-	DBG_FAULT(47)
-	FAULT(47)
-END(ia32_interrupt)
-
-	.org ia64_ivt+0x6c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6c00 Entry 48 (size 16 bundles) Reserved
-	DBG_FAULT(48)
-	FAULT(48)
-
-	.org ia64_ivt+0x6d00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6d00 Entry 49 (size 16 bundles) Reserved
-	DBG_FAULT(49)
-	FAULT(49)
-
-	.org ia64_ivt+0x6e00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6e00 Entry 50 (size 16 bundles) Reserved
-	DBG_FAULT(50)
-	FAULT(50)
-
-	.org ia64_ivt+0x6f00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x6f00 Entry 51 (size 16 bundles) Reserved
-	DBG_FAULT(51)
-	FAULT(51)
-
-	.org ia64_ivt+0x7000
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7000 Entry 52 (size 16 bundles) Reserved
-	DBG_FAULT(52)
-	FAULT(52)
-
-	.org ia64_ivt+0x7100
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7100 Entry 53 (size 16 bundles) Reserved
-	DBG_FAULT(53)
-	FAULT(53)
-
-	.org ia64_ivt+0x7200
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7200 Entry 54 (size 16 bundles) Reserved
-	DBG_FAULT(54)
-	FAULT(54)
-
-	.org ia64_ivt+0x7300
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7300 Entry 55 (size 16 bundles) Reserved
-	DBG_FAULT(55)
-	FAULT(55)
-
-	.org ia64_ivt+0x7400
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7400 Entry 56 (size 16 bundles) Reserved
-	DBG_FAULT(56)
-	FAULT(56)
-
-	.org ia64_ivt+0x7500
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7500 Entry 57 (size 16 bundles) Reserved
-	DBG_FAULT(57)
-	FAULT(57)
-
-	.org ia64_ivt+0x7600
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7600 Entry 58 (size 16 bundles) Reserved
-	DBG_FAULT(58)
-	FAULT(58)
-
-	.org ia64_ivt+0x7700
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7700 Entry 59 (size 16 bundles) Reserved
-	DBG_FAULT(59)
-	FAULT(59)
-
-	.org ia64_ivt+0x7800
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7800 Entry 60 (size 16 bundles) Reserved
-	DBG_FAULT(60)
-	FAULT(60)
-
-	.org ia64_ivt+0x7900
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7900 Entry 61 (size 16 bundles) Reserved
-	DBG_FAULT(61)
-	FAULT(61)
-
-	.org ia64_ivt+0x7a00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7a00 Entry 62 (size 16 bundles) Reserved
-	DBG_FAULT(62)
-	FAULT(62)
-
-	.org ia64_ivt+0x7b00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7b00 Entry 63 (size 16 bundles) Reserved
-	DBG_FAULT(63)
-	FAULT(63)
-
-	.org ia64_ivt+0x7c00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7c00 Entry 64 (size 16 bundles) Reserved
-	DBG_FAULT(64)
-	FAULT(64)
-
-	.org ia64_ivt+0x7d00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7d00 Entry 65 (size 16 bundles) Reserved
-	DBG_FAULT(65)
-	FAULT(65)
-
-	.org ia64_ivt+0x7e00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7e00 Entry 66 (size 16 bundles) Reserved
-	DBG_FAULT(66)
-	FAULT(66)
-
-	.org ia64_ivt+0x7f00
-/////////////////////////////////////////////////////////////////////////////////////////
-// 0x7f00 Entry 67 (size 16 bundles) Reserved
-	DBG_FAULT(67)
-	FAULT(67)
-
-	//-----------------------------------------------------------------------------------
-	// call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
-ENTRY(page_fault)
-	SSM_PSR_DT_AND_SRLZ_I
-	;;
-	SAVE_MIN_WITH_COVER
-	alloc r15=ar.pfs,0,0,3,0
-	MOV_FROM_IFA(out0)
-	MOV_FROM_ISR(out1)
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r14, r3)
-	adds r3=8,r2				// set up second base pointer
-	SSM_PSR_I(p15, p15, r14)		// restore psr.i
-	movl r14=ia64_leave_kernel
-	;;
-	SAVE_REST
-	mov rp=r14
-	;;
-	adds out2=16,r12			// out2 = pointer to pt_regs
-	br.call.sptk.many b6=ia64_do_page_fault	// ignore return address
-END(page_fault)
-
-ENTRY(non_syscall)
-	mov ar.rsc=r27			// restore ar.rsc before SAVE_MIN_WITH_COVER
-	;;
-	SAVE_MIN_WITH_COVER
-
-	// There is no particular reason for this code to be here, other than that
-	// there happens to be space here that would go unused otherwise.  If this
-	// fault ever gets "unreserved", simply moved the following code to a more
-	// suitable spot...
-
-	alloc r14=ar.pfs,0,0,2,0
-	MOV_FROM_IIM(out0)
-	add out1=16,sp
-	adds r3=8,r2			// set up second base pointer for SAVE_REST
-
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r15, r24)
-					// guarantee that interruption collection is on
-	SSM_PSR_I(p15, p15, r15)	// restore psr.i
-	movl r15=ia64_leave_kernel
-	;;
-	SAVE_REST
-	mov rp=r15
-	;;
-	br.call.sptk.many b6=ia64_bad_break	// avoid WAW on CFM and ignore return addr
-END(non_syscall)
-
-ENTRY(__interrupt)
-	DBG_FAULT(12)
-	mov r31=pr		// prepare to save predicates
-	;;
-	SAVE_MIN_WITH_COVER	// uses r31; defines r2 and r3
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r14)
-				// ensure everybody knows psr.ic is back on
-	adds r3=8,r2		// set up second base pointer for SAVE_REST
-	;;
-	SAVE_REST
-	;;
-	MCA_RECOVER_RANGE(interrupt)
-	alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
-	MOV_FROM_IVR(out0, r8)	// pass cr.ivr as first arg
-	add out1=16,sp		// pass pointer to pt_regs as second arg
-	;;
-	srlz.d			// make sure we see the effect of cr.ivr
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.call.sptk.many b6=ia64_handle_irq
-END(__interrupt)
-
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
-
-ENTRY(dispatch_unaligned_handler)
-	SAVE_MIN_WITH_COVER
-	;;
-	alloc r14=ar.pfs,0,0,2,0		// now it's safe (must be first in insn group!)
-	MOV_FROM_IFA(out0)
-	adds out1=16,sp
-
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24)
-						// guarantee that interruption collection is on
-	SSM_PSR_I(p15, p15, r3)			// restore psr.i
-	adds r3=8,r2				// set up second base pointer
-	;;
-	SAVE_REST
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.sptk.many ia64_prepare_handle_unaligned
-END(dispatch_unaligned_handler)
-
-	/*
-	 * There is no particular reason for this code to be here, other than that
-	 * there happens to be space here that would go unused otherwise.  If this
-	 * fault ever gets "unreserved", simply moved the following code to a more
-	 * suitable spot...
-	 */
-
-ENTRY(dispatch_to_fault_handler)
-	/*
-	 * Input:
-	 *	psr.ic:	off
-	 *	r19:	fault vector number (e.g., 24 for General Exception)
-	 *	r31:	contains saved predicates (pr)
-	 */
-	SAVE_MIN_WITH_COVER_R19
-	alloc r14=ar.pfs,0,0,5,0
-	MOV_FROM_ISR(out1)
-	MOV_FROM_IFA(out2)
-	MOV_FROM_IIM(out3)
-	MOV_FROM_ITIR(out4)
-	;;
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, out0)
-						// guarantee that interruption collection is on
-	mov out0=r15
-	;;
-	SSM_PSR_I(p15, p15, r3)			// restore psr.i
-	adds r3=8,r2				// set up second base pointer for SAVE_REST
-	;;
-	SAVE_REST
-	movl r14=ia64_leave_kernel
-	;;
-	mov rp=r14
-	br.call.sptk.many b6=ia64_fault
-END(dispatch_to_fault_handler)
-
-	/*
-	 * Squatting in this space ...
-	 *
-	 * This special case dispatcher for illegal operation faults allows preserved
-	 * registers to be modified through a callback function (asm only) that is handed
-	 * back from the fault handler in r8. Up to three arguments can be passed to the
-	 * callback function by returning an aggregate with the callback as its first
-	 * element, followed by the arguments.
-	 */
-ENTRY(dispatch_illegal_op_fault)
-	.prologue
-	.body
-	SAVE_MIN_WITH_COVER
-	SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24)
-				// guarantee that interruption collection is on
-	;;
-	SSM_PSR_I(p15, p15, r3)	// restore psr.i
-	adds r3=8,r2	// set up second base pointer for SAVE_REST
-	;;
-	alloc r14=ar.pfs,0,0,1,0	// must be first in insn group
-	mov out0=ar.ec
-	;;
-	SAVE_REST
-	PT_REGS_UNWIND_INFO(0)
-	;;
-	br.call.sptk.many rp=ia64_illegal_op_fault
-.ret0:	;;
-	alloc r14=ar.pfs,0,0,3,0	// must be first in insn group
-	mov out0=r9
-	mov out1=r10
-	mov out2=r11
-	movl r15=ia64_leave_kernel
-	;;
-	mov rp=r15
-	mov b6=r8
-	;;
-	cmp.ne p6,p0=0,r8
-(p6)	br.call.dpnt.many b6=b6		// call returns to ia64_leave_kernel
-	br.sptk.many ia64_leave_kernel
-END(dispatch_illegal_op_fault)
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
deleted file mode 100644
index ca34e51e84b4..000000000000
--- a/arch/ia64/kernel/kprobes.c
+++ /dev/null
@@ -1,911 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  Kernel Probes (KProbes)
- *  arch/ia64/kernel/kprobes.c
- *
- * Copyright (C) IBM Corporation, 2002, 2004
- * Copyright (C) Intel Corporation, 2005
- *
- * 2005-Apr     Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy
- *              <anil.s.keshavamurthy@intel.com> adapted from i386
- */
-
-#include <linux/kprobes.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/preempt.h>
-#include <linux/extable.h>
-#include <linux/kdebug.h>
-#include <linux/pgtable.h>
-
-#include <asm/sections.h>
-#include <asm/exception.h>
-
-DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
-DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
-
-struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
-
-enum instruction_type {A, I, M, F, B, L, X, u};
-static enum instruction_type bundle_encoding[32][3] = {
-	[0x00] = { M, I, I },
-	[0x01] = { M, I, I },
-	[0x02] = { M, I, I },
-	[0x03] = { M, I, I },
-	[0x04] = { M, L, X },
-	[0x05] = { M, L, X },
-	[0x06] = { u, u, u },
-	[0x07] = { u, u, u },
-	[0x08] = { M, M, I },
-	[0x09] = { M, M, I },
-	[0x0A] = { M, M, I },
-	[0x0B] = { M, M, I },
-	[0x0C] = { M, F, I },
-	[0x0D] = { M, F, I },
-	[0x0E] = { M, M, F },
-	[0x0F] = { M, M, F },
-	[0x10] = { M, I, B },
-	[0x11] = { M, I, B },
-	[0x12] = { M, B, B },
-	[0x13] = { M, B, B },
-	[0x14] = { u, u, u },
-	[0x15] = { u, u, u },
-	[0x16] = { B, B, B },
-	[0x17] = { B, B, B },
-	[0x18] = { M, M, B },
-	[0x19] = { M, M, B },
-	[0x1A] = { u, u, u },
-	[0x1B] = { u, u, u },
-	[0x1C] = { M, F, B },
-	[0x1D] = { M, F, B },
-	[0x1E] = { u, u, u },
-	[0x1F] = { u, u, u },
-};
-
-/* Insert a long branch code */
-static void __kprobes set_brl_inst(void *from, void *to)
-{
-	s64 rel = ((s64) to - (s64) from) >> 4;
-	bundle_t *brl;
-	brl = (bundle_t *) ((u64) from & ~0xf);
-	brl->quad0.template = 0x05;	/* [MLX](stop) */
-	brl->quad0.slot0 = NOP_M_INST;	/* nop.m 0x0 */
-	brl->quad0.slot1_p0 = ((rel >> 20) & 0x7fffffffff) << 2;
-	brl->quad1.slot1_p1 = (((rel >> 20) & 0x7fffffffff) << 2) >> (64 - 46);
-	/* brl.cond.sptk.many.clr rel<<4 (qp=0) */
-	brl->quad1.slot2 = BRL_INST(rel >> 59, rel & 0xfffff);
-}
-
-/*
- * In this function we check to see if the instruction
- * is IP relative instruction and update the kprobe
- * inst flag accordingly
- */
-static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
-					      uint major_opcode,
-					      unsigned long kprobe_inst,
-					      struct kprobe *p)
-{
-	p->ainsn.inst_flag = 0;
-	p->ainsn.target_br_reg = 0;
-	p->ainsn.slot = slot;
-
-	/* Check for Break instruction
-	 * Bits 37:40 Major opcode to be zero
-	 * Bits 27:32 X6 to be zero
-	 * Bits 32:35 X3 to be zero
-	 */
-	if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) {
-		/* is a break instruction */
-	 	p->ainsn.inst_flag |= INST_FLAG_BREAK_INST;
-		return;
-	}
-
-	if (bundle_encoding[template][slot] == B) {
-		switch (major_opcode) {
-		  case INDIRECT_CALL_OPCODE:
-	 		p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
-			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
-			break;
-		  case IP_RELATIVE_PREDICT_OPCODE:
-		  case IP_RELATIVE_BRANCH_OPCODE:
-			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
-			break;
-		  case IP_RELATIVE_CALL_OPCODE:
-			p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR;
-			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
-			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
-			break;
-		}
-	} else if (bundle_encoding[template][slot] == X) {
-		switch (major_opcode) {
-		  case LONG_CALL_OPCODE:
-			p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG;
-			p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7);
-		  break;
-		}
-	}
-	return;
-}
-
-/*
- * In this function we check to see if the instruction
- * (qp) cmpx.crel.ctype p1,p2=r2,r3
- * on which we are inserting kprobe is cmp instruction
- * with ctype as unc.
- */
-static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot,
-					    uint major_opcode,
-					    unsigned long kprobe_inst)
-{
-	cmp_inst_t cmp_inst;
-	uint ctype_unc = 0;
-
-	if (!((bundle_encoding[template][slot] == I) ||
-		(bundle_encoding[template][slot] == M)))
-		goto out;
-
-	if (!((major_opcode == 0xC) || (major_opcode == 0xD) ||
-		(major_opcode == 0xE)))
-		goto out;
-
-	cmp_inst.l = kprobe_inst;
-	if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) {
-		/* Integer compare - Register Register (A6 type)*/
-		if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0)
-				&&(cmp_inst.f.c == 1))
-			ctype_unc = 1;
-	} else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) {
-		/* Integer compare - Immediate Register (A8 type)*/
-		if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1))
-			ctype_unc = 1;
-	}
-out:
-	return ctype_unc;
-}
-
-/*
- * In this function we check to see if the instruction
- * on which we are inserting kprobe is supported.
- * Returns qp value if supported
- * Returns -EINVAL if unsupported
- */
-static int __kprobes unsupported_inst(uint template, uint  slot,
-				      uint major_opcode,
-				      unsigned long kprobe_inst,
-				      unsigned long addr)
-{
-	int qp;
-
-	qp = kprobe_inst & 0x3f;
-	if (is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) {
-		if (slot == 1 && qp)  {
-			printk(KERN_WARNING "Kprobes on cmp unc "
-					"instruction on slot 1 at <0x%lx> "
-					"is not supported\n", addr);
-			return -EINVAL;
-
-		}
-		qp = 0;
-	}
-	else if (bundle_encoding[template][slot] == I) {
-		if (major_opcode == 0) {
-			/*
-			 * Check for Integer speculation instruction
-			 * - Bit 33-35 to be equal to 0x1
-			 */
-			if (((kprobe_inst >> 33) & 0x7) == 1) {
-				printk(KERN_WARNING
-					"Kprobes on speculation inst at <0x%lx> not supported\n",
-						addr);
-				return -EINVAL;
-			}
-			/*
-			 * IP relative mov instruction
-			 *  - Bit 27-35 to be equal to 0x30
-			 */
-			if (((kprobe_inst >> 27) & 0x1FF) == 0x30) {
-				printk(KERN_WARNING
-					"Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n",
-						addr);
-				return -EINVAL;
-
-			}
-		}
-		else if ((major_opcode == 5) &&	!(kprobe_inst & (0xFUl << 33)) &&
-				(kprobe_inst & (0x1UL << 12))) {
-			/* test bit instructions, tbit,tnat,tf
-			 * bit 33-36 to be equal to 0
-			 * bit 12 to be equal to 1
-			 */
-			if (slot == 1 && qp) {
-				printk(KERN_WARNING "Kprobes on test bit "
-						"instruction on slot at <0x%lx> "
-						"is not supported\n", addr);
-				return -EINVAL;
-			}
-			qp = 0;
-		}
-	}
-	else if (bundle_encoding[template][slot] == B) {
-		if (major_opcode == 7) {
-			/* IP-Relative Predict major code is 7 */
-			printk(KERN_WARNING "Kprobes on IP-Relative"
-					"Predict is not supported\n");
-			return -EINVAL;
-		}
-		else if (major_opcode == 2) {
-			/* Indirect Predict, major code is 2
-			 * bit 27-32 to be equal to 10 or 11
-			 */
-			int x6=(kprobe_inst >> 27) & 0x3F;
-			if ((x6 == 0x10) || (x6 == 0x11)) {
-				printk(KERN_WARNING "Kprobes on "
-					"Indirect Predict is not supported\n");
-				return -EINVAL;
-			}
-		}
-	}
-	/* kernel does not use float instruction, here for safety kprobe
-	 * will judge whether it is fcmp/flass/float approximation instruction
-	 */
-	else if (unlikely(bundle_encoding[template][slot] == F)) {
-		if ((major_opcode == 4 || major_opcode == 5) &&
-				(kprobe_inst  & (0x1 << 12))) {
-			/* fcmp/fclass unc instruction */
-			if (slot == 1 && qp) {
-				printk(KERN_WARNING "Kprobes on fcmp/fclass "
-					"instruction on slot at <0x%lx> "
-					"is not supported\n", addr);
-				return -EINVAL;
-
-			}
-			qp = 0;
-		}
-		if ((major_opcode == 0 || major_opcode == 1) &&
-			(kprobe_inst & (0x1UL << 33))) {
-			/* float Approximation instruction */
-			if (slot == 1 && qp) {
-				printk(KERN_WARNING "Kprobes on float Approx "
-					"instr at <0x%lx> is not supported\n",
-						addr);
-				return -EINVAL;
-			}
-			qp = 0;
-		}
-	}
-	return qp;
-}
-
-/*
- * In this function we override the bundle with
- * the break instruction at the given slot.
- */
-static void __kprobes prepare_break_inst(uint template, uint  slot,
-					 uint major_opcode,
-					 unsigned long kprobe_inst,
-					 struct kprobe *p,
-					 int qp)
-{
-	unsigned long break_inst = BREAK_INST;
-	bundle_t *bundle = &p->opcode.bundle;
-
-	/*
-	 * Copy the original kprobe_inst qualifying predicate(qp)
-	 * to the break instruction
-	 */
-	break_inst |= qp;
-
-	switch (slot) {
-	  case 0:
-		bundle->quad0.slot0 = break_inst;
-		break;
-	  case 1:
-		bundle->quad0.slot1_p0 = break_inst;
-		bundle->quad1.slot1_p1 = break_inst >> (64-46);
-		break;
-	  case 2:
-		bundle->quad1.slot2 = break_inst;
-		break;
-	}
-
-	/*
-	 * Update the instruction flag, so that we can
-	 * emulate the instruction properly after we
-	 * single step on original instruction
-	 */
-	update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p);
-}
-
-static void __kprobes get_kprobe_inst(bundle_t *bundle, uint slot,
-	       	unsigned long *kprobe_inst, uint *major_opcode)
-{
-	unsigned long kprobe_inst_p0, kprobe_inst_p1;
-	unsigned int template;
-
-	template = bundle->quad0.template;
-
-	switch (slot) {
-	  case 0:
-		*major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT);
-		*kprobe_inst = bundle->quad0.slot0;
-		  break;
-	  case 1:
-		*major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT);
-		kprobe_inst_p0 = bundle->quad0.slot1_p0;
-		kprobe_inst_p1 = bundle->quad1.slot1_p1;
-		*kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46));
-		break;
-	  case 2:
-		*major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT);
-		*kprobe_inst = bundle->quad1.slot2;
-		break;
-	}
-}
-
-/* Returns non-zero if the addr is in the Interrupt Vector Table */
-static int __kprobes in_ivt_functions(unsigned long addr)
-{
-	return (addr >= (unsigned long)__start_ivt_text
-		&& addr < (unsigned long)__end_ivt_text);
-}
-
-static int __kprobes valid_kprobe_addr(int template, int slot,
-				       unsigned long addr)
-{
-	if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) {
-		printk(KERN_WARNING "Attempting to insert unaligned kprobe "
-				"at 0x%lx\n", addr);
-		return -EINVAL;
-	}
-
-	if (in_ivt_functions(addr)) {
-		printk(KERN_WARNING "Kprobes can't be inserted inside "
-				"IVT functions at 0x%lx\n", addr);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-	unsigned int i;
-	i = atomic_add_return(1, &kcb->prev_kprobe_index);
-	kcb->prev_kprobe[i-1].kp = kprobe_running();
-	kcb->prev_kprobe[i-1].status = kcb->kprobe_status;
-}
-
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
-{
-	unsigned int i;
-	i = atomic_read(&kcb->prev_kprobe_index);
-	__this_cpu_write(current_kprobe, kcb->prev_kprobe[i-1].kp);
-	kcb->kprobe_status = kcb->prev_kprobe[i-1].status;
-	atomic_sub(1, &kcb->prev_kprobe_index);
-}
-
-static void __kprobes set_current_kprobe(struct kprobe *p,
-			struct kprobe_ctlblk *kcb)
-{
-	__this_cpu_write(current_kprobe, p);
-}
-
-void __kretprobe_trampoline(void)
-{
-}
-
-int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
-{
-	regs->cr_iip = __kretprobe_trampoline_handler(regs, NULL);
-	/*
-	 * By returning a non-zero value, we are telling
-	 * kprobe_handler() that we don't want the post_handler
-	 * to run (and have re-enabled preemption)
-	 */
-	return 1;
-}
-
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-				      struct pt_regs *regs)
-{
-	ri->ret_addr = (kprobe_opcode_t *)regs->b0;
-	ri->fp = NULL;
-
-	/* Replace the return addr with trampoline addr */
-	regs->b0 = (unsigned long)dereference_function_descriptor(__kretprobe_trampoline);
-}
-
-/* Check the instruction in the slot is break */
-static int __kprobes __is_ia64_break_inst(bundle_t *bundle, uint slot)
-{
-	unsigned int major_opcode;
-	unsigned int template = bundle->quad0.template;
-	unsigned long kprobe_inst;
-
-	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
-	if (slot == 1 && bundle_encoding[template][1] == L)
-		slot++;
-
-	/* Get Kprobe probe instruction at given slot*/
-	get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode);
-
-	/* For break instruction,
-	 * Bits 37:40 Major opcode to be zero
-	 * Bits 27:32 X6 to be zero
-	 * Bits 32:35 X3 to be zero
-	 */
-	if (major_opcode || ((kprobe_inst >> 27) & 0x1FF)) {
-		/* Not a break instruction */
-		return 0;
-	}
-
-	/* Is a break instruction */
-	return 1;
-}
-
-/*
- * In this function, we check whether the target bundle modifies IP or
- * it triggers an exception. If so, it cannot be boostable.
- */
-static int __kprobes can_boost(bundle_t *bundle, uint slot,
-			       unsigned long bundle_addr)
-{
-	unsigned int template = bundle->quad0.template;
-
-	do {
-		if (search_exception_tables(bundle_addr + slot) ||
-		    __is_ia64_break_inst(bundle, slot))
-			return 0;	/* exception may occur in this bundle*/
-	} while ((++slot) < 3);
-	template &= 0x1e;
-	if (template >= 0x10 /* including B unit */ ||
-	    template == 0x04 /* including X unit */ ||
-	    template == 0x06) /* undefined */
-		return 0;
-
-	return 1;
-}
-
-/* Prepare long jump bundle and disables other boosters if need */
-static void __kprobes prepare_booster(struct kprobe *p)
-{
-	unsigned long addr = (unsigned long)p->addr & ~0xFULL;
-	unsigned int slot = (unsigned long)p->addr & 0xf;
-	struct kprobe *other_kp;
-
-	if (can_boost(&p->ainsn.insn[0].bundle, slot, addr)) {
-		set_brl_inst(&p->ainsn.insn[1].bundle, (bundle_t *)addr + 1);
-		p->ainsn.inst_flag |= INST_FLAG_BOOSTABLE;
-	}
-
-	/* disables boosters in previous slots */
-	for (; addr < (unsigned long)p->addr; addr++) {
-		other_kp = get_kprobe((void *)addr);
-		if (other_kp)
-			other_kp->ainsn.inst_flag &= ~INST_FLAG_BOOSTABLE;
-	}
-}
-
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
-{
-	unsigned long addr = (unsigned long) p->addr;
-	unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL);
-	unsigned long kprobe_inst=0;
-	unsigned int slot = addr & 0xf, template, major_opcode = 0;
-	bundle_t *bundle;
-	int qp;
-
-	bundle = &((kprobe_opcode_t *)kprobe_addr)->bundle;
-	template = bundle->quad0.template;
-
-	if(valid_kprobe_addr(template, slot, addr))
-		return -EINVAL;
-
-	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
-	if (slot == 1 && bundle_encoding[template][1] == L)
-		slot++;
-
-	/* Get kprobe_inst and major_opcode from the bundle */
-	get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode);
-
-	qp = unsupported_inst(template, slot, major_opcode, kprobe_inst, addr);
-	if (qp < 0)
-		return -EINVAL;
-
-	p->ainsn.insn = get_insn_slot();
-	if (!p->ainsn.insn)
-		return -ENOMEM;
-	memcpy(&p->opcode, kprobe_addr, sizeof(kprobe_opcode_t));
-	memcpy(p->ainsn.insn, kprobe_addr, sizeof(kprobe_opcode_t));
-
-	prepare_break_inst(template, slot, major_opcode, kprobe_inst, p, qp);
-
-	prepare_booster(p);
-
-	return 0;
-}
-
-void __kprobes arch_arm_kprobe(struct kprobe *p)
-{
-	unsigned long arm_addr;
-	bundle_t *src, *dest;
-
-	arm_addr = ((unsigned long)p->addr) & ~0xFUL;
-	dest = &((kprobe_opcode_t *)arm_addr)->bundle;
-	src = &p->opcode.bundle;
-
-	flush_icache_range((unsigned long)p->ainsn.insn,
-			   (unsigned long)p->ainsn.insn +
-			   sizeof(kprobe_opcode_t) * MAX_INSN_SIZE);
-
-	switch (p->ainsn.slot) {
-		case 0:
-			dest->quad0.slot0 = src->quad0.slot0;
-			break;
-		case 1:
-			dest->quad1.slot1_p1 = src->quad1.slot1_p1;
-			break;
-		case 2:
-			dest->quad1.slot2 = src->quad1.slot2;
-			break;
-	}
-	flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t));
-}
-
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
-{
-	unsigned long arm_addr;
-	bundle_t *src, *dest;
-
-	arm_addr = ((unsigned long)p->addr) & ~0xFUL;
-	dest = &((kprobe_opcode_t *)arm_addr)->bundle;
-	/* p->ainsn.insn contains the original unaltered kprobe_opcode_t */
-	src = &p->ainsn.insn->bundle;
-	switch (p->ainsn.slot) {
-		case 0:
-			dest->quad0.slot0 = src->quad0.slot0;
-			break;
-		case 1:
-			dest->quad1.slot1_p1 = src->quad1.slot1_p1;
-			break;
-		case 2:
-			dest->quad1.slot2 = src->quad1.slot2;
-			break;
-	}
-	flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t));
-}
-
-void __kprobes arch_remove_kprobe(struct kprobe *p)
-{
-	if (p->ainsn.insn) {
-		free_insn_slot(p->ainsn.insn,
-			       p->ainsn.inst_flag & INST_FLAG_BOOSTABLE);
-		p->ainsn.insn = NULL;
-	}
-}
-/*
- * We are resuming execution after a single step fault, so the pt_regs
- * structure reflects the register state after we executed the instruction
- * located in the kprobe (p->ainsn.insn->bundle).  We still need to adjust
- * the ip to point back to the original stack address. To set the IP address
- * to original stack address, handle the case where we need to fixup the
- * relative IP address and/or fixup branch register.
- */
-static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
-{
-	unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle);
-	unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL;
-	unsigned long template;
-	int slot = ((unsigned long)p->addr & 0xf);
-
-	template = p->ainsn.insn->bundle.quad0.template;
-
-	if (slot == 1 && bundle_encoding[template][1] == L)
-		slot = 2;
-
-	if (p->ainsn.inst_flag & ~INST_FLAG_BOOSTABLE) {
-
-		if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) {
-			/* Fix relative IP address */
-			regs->cr_iip = (regs->cr_iip - bundle_addr) +
-					resume_addr;
-		}
-
-		if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) {
-		/*
-		 * Fix target branch register, software convention is
-		 * to use either b0 or b6 or b7, so just checking
-		 * only those registers
-		 */
-			switch (p->ainsn.target_br_reg) {
-			case 0:
-				if ((regs->b0 == bundle_addr) ||
-					(regs->b0 == bundle_addr + 0x10)) {
-					regs->b0 = (regs->b0 - bundle_addr) +
-						resume_addr;
-				}
-				break;
-			case 6:
-				if ((regs->b6 == bundle_addr) ||
-					(regs->b6 == bundle_addr + 0x10)) {
-					regs->b6 = (regs->b6 - bundle_addr) +
-						resume_addr;
-				}
-				break;
-			case 7:
-				if ((regs->b7 == bundle_addr) ||
-					(regs->b7 == bundle_addr + 0x10)) {
-					regs->b7 = (regs->b7 - bundle_addr) +
-						resume_addr;
-				}
-				break;
-			} /* end switch */
-		}
-		goto turn_ss_off;
-	}
-
-	if (slot == 2) {
-		if (regs->cr_iip == bundle_addr + 0x10) {
-			regs->cr_iip = resume_addr + 0x10;
-		}
-	} else {
-		if (regs->cr_iip == bundle_addr) {
-			regs->cr_iip = resume_addr;
-		}
-	}
-
-turn_ss_off:
-	/* Turn off Single Step bit */
-	ia64_psr(regs)->ss = 0;
-}
-
-static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
-{
-	unsigned long bundle_addr = (unsigned long) &p->ainsn.insn->bundle;
-	unsigned long slot = (unsigned long)p->addr & 0xf;
-
-	/* single step inline if break instruction */
-	if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)
-		regs->cr_iip = (unsigned long)p->addr & ~0xFULL;
-	else
-		regs->cr_iip = bundle_addr & ~0xFULL;
-
-	if (slot > 2)
-		slot = 0;
-
-	ia64_psr(regs)->ri = slot;
-
-	/* turn on single stepping */
-	ia64_psr(regs)->ss = 1;
-}
-
-static int __kprobes is_ia64_break_inst(struct pt_regs *regs)
-{
-	unsigned int slot = ia64_psr(regs)->ri;
-	unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip;
-	bundle_t bundle;
-
-	memcpy(&bundle, kprobe_addr, sizeof(bundle_t));
-
-	return __is_ia64_break_inst(&bundle, slot);
-}
-
-static int __kprobes pre_kprobes_handler(struct die_args *args)
-{
-	struct kprobe *p;
-	int ret = 0;
-	struct pt_regs *regs = args->regs;
-	kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs);
-	struct kprobe_ctlblk *kcb;
-
-	/*
-	 * We don't want to be preempted for the entire
-	 * duration of kprobe processing
-	 */
-	preempt_disable();
-	kcb = get_kprobe_ctlblk();
-
-	/* Handle recursion cases */
-	if (kprobe_running()) {
-		p = get_kprobe(addr);
-		if (p) {
-			if ((kcb->kprobe_status == KPROBE_HIT_SS) &&
-	 		     (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) {
-				ia64_psr(regs)->ss = 0;
-				goto no_kprobe;
-			}
-			/* We have reentered the pre_kprobe_handler(), since
-			 * another probe was hit while within the handler.
-			 * We here save the original kprobes variables and
-			 * just single step on the instruction of the new probe
-			 * without calling any user handlers.
-			 */
-			save_previous_kprobe(kcb);
-			set_current_kprobe(p, kcb);
-			kprobes_inc_nmissed_count(p);
-			prepare_ss(p, regs);
-			kcb->kprobe_status = KPROBE_REENTER;
-			return 1;
-		} else if (!is_ia64_break_inst(regs)) {
-			/* The breakpoint instruction was removed by
-			 * another cpu right after we hit, no further
-			 * handling of this interrupt is appropriate
-			 */
-			ret = 1;
-			goto no_kprobe;
-		} else {
-			/* Not our break */
-			goto no_kprobe;
-		}
-	}
-
-	p = get_kprobe(addr);
-	if (!p) {
-		if (!is_ia64_break_inst(regs)) {
-			/*
-			 * The breakpoint instruction was removed right
-			 * after we hit it.  Another cpu has removed
-			 * either a probepoint or a debugger breakpoint
-			 * at this address.  In either case, no further
-			 * handling of this interrupt is appropriate.
-			 */
-			ret = 1;
-
-		}
-
-		/* Not one of our break, let kernel handle it */
-		goto no_kprobe;
-	}
-
-	set_current_kprobe(p, kcb);
-	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-
-	if (p->pre_handler && p->pre_handler(p, regs)) {
-		reset_current_kprobe();
-		preempt_enable_no_resched();
-		return 1;
-	}
-
-#if !defined(CONFIG_PREEMPTION)
-	if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) {
-		/* Boost up -- we can execute copied instructions directly */
-		ia64_psr(regs)->ri = p->ainsn.slot;
-		regs->cr_iip = (unsigned long)&p->ainsn.insn->bundle & ~0xFULL;
-		/* turn single stepping off */
-		ia64_psr(regs)->ss = 0;
-
-		reset_current_kprobe();
-		preempt_enable_no_resched();
-		return 1;
-	}
-#endif
-	prepare_ss(p, regs);
-	kcb->kprobe_status = KPROBE_HIT_SS;
-	return 1;
-
-no_kprobe:
-	preempt_enable_no_resched();
-	return ret;
-}
-
-static int __kprobes post_kprobes_handler(struct pt_regs *regs)
-{
-	struct kprobe *cur = kprobe_running();
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-	if (!cur)
-		return 0;
-
-	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
-		kcb->kprobe_status = KPROBE_HIT_SSDONE;
-		cur->post_handler(cur, regs, 0);
-	}
-
-	resume_execution(cur, regs);
-
-	/*Restore back the original saved kprobes variables and continue. */
-	if (kcb->kprobe_status == KPROBE_REENTER) {
-		restore_previous_kprobe(kcb);
-		goto out;
-	}
-	reset_current_kprobe();
-
-out:
-	preempt_enable_no_resched();
-	return 1;
-}
-
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
-{
-	struct kprobe *cur = kprobe_running();
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
-
-	switch(kcb->kprobe_status) {
-	case KPROBE_HIT_SS:
-	case KPROBE_REENTER:
-		/*
-		 * We are here because the instruction being single
-		 * stepped caused a page fault. We reset the current
-		 * kprobe and the instruction pointer points back to
-		 * the probe address and allow the page fault handler
-		 * to continue as a normal page fault.
-		 */
-		regs->cr_iip = ((unsigned long)cur->addr) & ~0xFULL;
-		ia64_psr(regs)->ri = ((unsigned long)cur->addr) & 0xf;
-		if (kcb->kprobe_status == KPROBE_REENTER)
-			restore_previous_kprobe(kcb);
-		else
-			reset_current_kprobe();
-		preempt_enable_no_resched();
-		break;
-	case KPROBE_HIT_ACTIVE:
-	case KPROBE_HIT_SSDONE:
-		/*
-		 * In case the user-specified fault handler returned
-		 * zero, try to fix up.
-		 */
-		if (ia64_done_with_exception(regs))
-			return 1;
-
-		/*
-		 * Let ia64_do_page_fault() fix it.
-		 */
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
-				       unsigned long val, void *data)
-{
-	struct die_args *args = (struct die_args *)data;
-	int ret = NOTIFY_DONE;
-
-	if (args->regs && user_mode(args->regs))
-		return ret;
-
-	switch(val) {
-	case DIE_BREAK:
-		/* err is break number from ia64_bad_break() */
-		if ((args->err >> 12) == (__IA64_BREAK_KPROBE >> 12)
-			|| args->err == 0)
-			if (pre_kprobes_handler(args))
-				ret = NOTIFY_STOP;
-		break;
-	case DIE_FAULT:
-		/* err is vector number from ia64_fault() */
-		if (args->err == 36)
-			if (post_kprobes_handler(args->regs))
-				ret = NOTIFY_STOP;
-		break;
-	default:
-		break;
-	}
-	return ret;
-}
-
-static struct kprobe trampoline_p = {
-	.pre_handler = trampoline_probe_handler
-};
-
-int __init arch_init_kprobes(void)
-{
-	trampoline_p.addr =
-		dereference_function_descriptor(__kretprobe_trampoline);
-	return register_kprobe(&trampoline_p);
-}
-
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
-{
-	if (p->addr ==
-		dereference_function_descriptor(__kretprobe_trampoline))
-		return 1;
-
-	return 0;
-}
diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
deleted file mode 100644
index 4db9ca144fa5..000000000000
--- a/arch/ia64/kernel/machine_kexec.c
+++ /dev/null
@@ -1,163 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * arch/ia64/kernel/machine_kexec.c
- *
- * Handle transition of Linux booting another kernel
- * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P.
- * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com>
- * Copyright (C) 2006 Intel Corp, Zou Nan hai <nanhai.zou@intel.com>
- */
-
-#include <linux/mm.h>
-#include <linux/kexec.h>
-#include <linux/cpu.h>
-#include <linux/irq.h>
-#include <linux/efi.h>
-#include <linux/numa.h>
-#include <linux/mmzone.h>
-
-#include <asm/efi.h>
-#include <asm/numa.h>
-#include <asm/mmu_context.h>
-#include <asm/setup.h>
-#include <asm/delay.h>
-#include <asm/meminit.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/mca.h>
-
-typedef void (*relocate_new_kernel_t)(
-					unsigned long indirection_page,
-					unsigned long start_address,
-					struct ia64_boot_param *boot_param,
-					unsigned long pal_addr) __noreturn;
-
-struct kimage *ia64_kimage;
-
-struct resource efi_memmap_res = {
-        .name  = "EFI Memory Map",
-        .start = 0,
-        .end   = 0,
-        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-struct resource boot_param_res = {
-        .name  = "Boot parameter",
-        .start = 0,
-        .end   = 0,
-        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-
-/*
- * Do what every setup is needed on image and the
- * reboot code buffer to allow us to avoid allocations
- * later.
- */
-int machine_kexec_prepare(struct kimage *image)
-{
-	void *control_code_buffer;
-	const unsigned long *func;
-
-	func = (unsigned long *)&relocate_new_kernel;
-	/* Pre-load control code buffer to minimize work in kexec path */
-	control_code_buffer = page_address(image->control_code_page);
-	memcpy((void *)control_code_buffer, (const void *)func[0],
-			relocate_new_kernel_size);
-	flush_icache_range((unsigned long)control_code_buffer,
-			(unsigned long)control_code_buffer + relocate_new_kernel_size);
-	ia64_kimage = image;
-
-	return 0;
-}
-
-void machine_kexec_cleanup(struct kimage *image)
-{
-}
-
-/*
- * Do not allocate memory (or fail in any way) in machine_kexec().
- * We are past the point of no return, committed to rebooting now.
- */
-static void ia64_machine_kexec(struct unw_frame_info *info, void *arg)
-{
-	struct kimage *image = arg;
-	relocate_new_kernel_t rnk;
-	void *pal_addr = efi_get_pal_addr();
-	unsigned long code_addr;
-	int ii;
-	u64 fp, gp;
-	ia64_fptr_t *init_handler = (ia64_fptr_t *)ia64_os_init_on_kdump;
-
-	BUG_ON(!image);
-	code_addr = (unsigned long)page_address(image->control_code_page);
-	if (image->type == KEXEC_TYPE_CRASH) {
-		crash_save_this_cpu();
-		current->thread.ksp = (__u64)info->sw - 16;
-
-		/* Register noop init handler */
-		fp = ia64_tpa(init_handler->fp);
-		gp = ia64_tpa(ia64_getreg(_IA64_REG_GP));
-		ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, fp, gp, 0, fp, gp, 0);
-	} else {
-		/* Unregister init handlers of current kernel */
-		ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, 0, 0, 0, 0, 0, 0);
-	}
-
-	/* Unregister mca handler - No more recovery on current kernel */
-	ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, 0, 0, 0, 0, 0, 0);
-
-	/* Interrupts aren't acceptable while we reboot */
-	local_irq_disable();
-
-	/* Mask CMC and Performance Monitor interrupts */
-	ia64_setreg(_IA64_REG_CR_PMV, 1 << 16);
-	ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16);
-
-	/* Mask ITV and Local Redirect Registers */
-	ia64_set_itv(1 << 16);
-	ia64_set_lrr0(1 << 16);
-	ia64_set_lrr1(1 << 16);
-
-	/* terminate possible nested in-service interrupts */
-	for (ii = 0; ii < 16; ii++)
-		ia64_eoi();
-
-	/* unmask TPR and clear any pending interrupts */
-	ia64_setreg(_IA64_REG_CR_TPR, 0);
-	ia64_srlz_d();
-	while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR)
-		ia64_eoi();
-	rnk = (relocate_new_kernel_t)&code_addr;
-	(*rnk)(image->head, image->start, ia64_boot_param,
-		     GRANULEROUNDDOWN((unsigned long) pal_addr));
-	BUG();
-}
-
-void machine_kexec(struct kimage *image)
-{
-	BUG_ON(!image);
-	unw_init_running(ia64_machine_kexec, image);
-	for(;;);
-}
-
-void arch_crash_save_vmcoreinfo(void)
-{
-#if defined(CONFIG_SPARSEMEM)
-	VMCOREINFO_SYMBOL(pgdat_list);
-	VMCOREINFO_LENGTH(pgdat_list, MAX_NUMNODES);
-#endif
-#ifdef CONFIG_NUMA
-	VMCOREINFO_SYMBOL(node_memblk);
-	VMCOREINFO_LENGTH(node_memblk, NR_NODE_MEMBLKS);
-	VMCOREINFO_STRUCT_SIZE(node_memblk_s);
-	VMCOREINFO_OFFSET(node_memblk_s, start_paddr);
-	VMCOREINFO_OFFSET(node_memblk_s, size);
-#endif
-#if CONFIG_PGTABLE_LEVELS == 3
-	VMCOREINFO_CONFIG(PGTABLE_3);
-#elif CONFIG_PGTABLE_LEVELS == 4
-	VMCOREINFO_CONFIG(PGTABLE_4);
-#endif
-}
-
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
deleted file mode 100644
index 2671688d349a..000000000000
--- a/arch/ia64/kernel/mca.c
+++ /dev/null
@@ -1,2111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * File:	mca.c
- * Purpose:	Generic MCA handling layer
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Copyright (C) 2002 Dell Inc.
- * Copyright (C) Matt Domsch <Matt_Domsch@dell.com>
- *
- * Copyright (C) 2002 Intel
- * Copyright (C) Jenna Hall <jenna.s.hall@intel.com>
- *
- * Copyright (C) 2001 Intel
- * Copyright (C) Fred Lewis <frederick.v.lewis@intel.com>
- *
- * Copyright (C) 2000 Intel
- * Copyright (C) Chuck Fleckenstein <cfleck@co.intel.com>
- *
- * Copyright (C) 1999, 2004-2008 Silicon Graphics, Inc.
- * Copyright (C) Vijay Chander <vijay@engr.sgi.com>
- *
- * Copyright (C) 2006 FUJITSU LIMITED
- * Copyright (C) Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- *
- * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
- *	      Fixed PAL/SAL update issues, began MCA bug fixes, logging issues,
- *	      added min save state dump, added INIT handler.
- *
- * 2001-01-03 Fred Lewis <frederick.v.lewis@intel.com>
- *	      Added setup of CMCI and CPEI IRQs, logging of corrected platform
- *	      errors, completed code for logging of corrected & uncorrected
- *	      machine check errors, and updated for conformance with Nov. 2000
- *	      revision of the SAL 3.0 spec.
- *
- * 2002-01-04 Jenna Hall <jenna.s.hall@intel.com>
- *	      Aligned MCA stack to 16 bytes, added platform vs. CPU error flag,
- *	      set SAL default return values, changed error record structure to
- *	      linked list, added init call to sal_get_state_info_size().
- *
- * 2002-03-25 Matt Domsch <Matt_Domsch@dell.com>
- *	      GUID cleanups.
- *
- * 2003-04-15 David Mosberger-Tang <davidm@hpl.hp.com>
- *	      Added INIT backtrace support.
- *
- * 2003-12-08 Keith Owens <kaos@sgi.com>
- *	      smp_call_function() must not be called from interrupt context
- *	      (can deadlock on tasklist_lock).
- *	      Use keventd to call smp_call_function().
- *
- * 2004-02-01 Keith Owens <kaos@sgi.com>
- *	      Avoid deadlock when using printk() for MCA and INIT records.
- *	      Delete all record printing code, moved to salinfo_decode in user
- *	      space.  Mark variables and functions static where possible.
- *	      Delete dead variables and functions.  Reorder to remove the need
- *	      for forward declarations and to consolidate related code.
- *
- * 2005-08-12 Keith Owens <kaos@sgi.com>
- *	      Convert MCA/INIT handlers to use per event stacks and SAL/OS
- *	      state.
- *
- * 2005-10-07 Keith Owens <kaos@sgi.com>
- *	      Add notify_die() hooks.
- *
- * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- *	      Add printing support for MCA/INIT.
- *
- * 2007-04-27 Russ Anderson <rja@sgi.com>
- *	      Support multiple cpus going through OS_MCA in the same event.
- */
-#include <linux/jiffies.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/memblock.h>
-#include <linux/acpi.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/workqueue.h>
-#include <linux/cpumask.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <linux/gfp.h>
-
-#include <asm/delay.h>
-#include <asm/efi.h>
-#include <asm/meminit.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/mca.h>
-#include <asm/mca_asm.h>
-#include <asm/kexec.h>
-
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-#include <asm/tlb.h>
-
-#include "mca_drv.h"
-#include "entry.h"
-#include "irq.h"
-
-#if defined(IA64_MCA_DEBUG_INFO)
-# define IA64_MCA_DEBUG(fmt...) printk(fmt)
-#else
-# define IA64_MCA_DEBUG(fmt...) do {} while (0)
-#endif
-
-#define NOTIFY_INIT(event, regs, arg, spin)				\
-do {									\
-	if ((notify_die((event), "INIT", (regs), (arg), 0, 0)		\
-			== NOTIFY_STOP) && ((spin) == 1))		\
-		ia64_mca_spin(__func__);				\
-} while (0)
-
-#define NOTIFY_MCA(event, regs, arg, spin)				\
-do {									\
-	if ((notify_die((event), "MCA", (regs), (arg), 0, 0)		\
-			== NOTIFY_STOP) && ((spin) == 1))		\
-		ia64_mca_spin(__func__);				\
-} while (0)
-
-/* Used by mca_asm.S */
-DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */
-DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */
-DEFINE_PER_CPU(u64, ia64_mca_pal_pte);	    /* PTE to map PAL code */
-DEFINE_PER_CPU(u64, ia64_mca_pal_base);    /* vaddr PAL code granule */
-DEFINE_PER_CPU(u64, ia64_mca_tr_reload);   /* Flag for TR reload */
-
-unsigned long __per_cpu_mca[NR_CPUS];
-
-/* In mca_asm.S */
-extern void			ia64_os_init_dispatch_monarch (void);
-extern void			ia64_os_init_dispatch_slave (void);
-
-static int monarch_cpu = -1;
-
-static ia64_mc_info_t		ia64_mc_info;
-
-#define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */
-#define MIN_CPE_POLL_INTERVAL (2*60*HZ)  /* 2 minutes */
-#define CMC_POLL_INTERVAL     (1*60*HZ)  /* 1 minute */
-#define CPE_HISTORY_LENGTH    5
-#define CMC_HISTORY_LENGTH    5
-
-static struct timer_list cpe_poll_timer;
-static struct timer_list cmc_poll_timer;
-/*
- * This variable tells whether we are currently in polling mode.
- * Start with this in the wrong state so we won't play w/ timers
- * before the system is ready.
- */
-static int cmc_polling_enabled = 1;
-
-/*
- * Clearing this variable prevents CPE polling from getting activated
- * in mca_late_init.  Use it if your system doesn't provide a CPEI,
- * but encounters problems retrieving CPE logs.  This should only be
- * necessary for debugging.
- */
-static int cpe_poll_enabled = 1;
-
-extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
-
-static int mca_init __initdata;
-
-/*
- * limited & delayed printing support for MCA/INIT handler
- */
-
-#define mprintk(fmt...) ia64_mca_printk(fmt)
-
-#define MLOGBUF_SIZE (512+256*NR_CPUS)
-#define MLOGBUF_MSGMAX 256
-static char mlogbuf[MLOGBUF_SIZE];
-static DEFINE_SPINLOCK(mlogbuf_wlock);	/* mca context only */
-static DEFINE_SPINLOCK(mlogbuf_rlock);	/* normal context only */
-static unsigned long mlogbuf_start;
-static unsigned long mlogbuf_end;
-static unsigned int mlogbuf_finished = 0;
-static unsigned long mlogbuf_timestamp = 0;
-
-static int loglevel_save = -1;
-#define BREAK_LOGLEVEL(__console_loglevel)		\
-	oops_in_progress = 1;				\
-	if (loglevel_save < 0)				\
-		loglevel_save = __console_loglevel;	\
-	__console_loglevel = 15;
-
-#define RESTORE_LOGLEVEL(__console_loglevel)		\
-	if (loglevel_save >= 0) {			\
-		__console_loglevel = loglevel_save;	\
-		loglevel_save = -1;			\
-	}						\
-	mlogbuf_finished = 0;				\
-	oops_in_progress = 0;
-
-/*
- * Push messages into buffer, print them later if not urgent.
- */
-void ia64_mca_printk(const char *fmt, ...)
-{
-	va_list args;
-	int printed_len;
-	char temp_buf[MLOGBUF_MSGMAX];
-	char *p;
-
-	va_start(args, fmt);
-	printed_len = vscnprintf(temp_buf, sizeof(temp_buf), fmt, args);
-	va_end(args);
-
-	/* Copy the output into mlogbuf */
-	if (oops_in_progress) {
-		/* mlogbuf was abandoned, use printk directly instead. */
-		printk("%s", temp_buf);
-	} else {
-		spin_lock(&mlogbuf_wlock);
-		for (p = temp_buf; *p; p++) {
-			unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE;
-			if (next != mlogbuf_start) {
-				mlogbuf[mlogbuf_end] = *p;
-				mlogbuf_end = next;
-			} else {
-				/* buffer full */
-				break;
-			}
-		}
-		mlogbuf[mlogbuf_end] = '\0';
-		spin_unlock(&mlogbuf_wlock);
-	}
-}
-EXPORT_SYMBOL(ia64_mca_printk);
-
-/*
- * Print buffered messages.
- *  NOTE: call this after returning normal context. (ex. from salinfod)
- */
-void ia64_mlogbuf_dump(void)
-{
-	char temp_buf[MLOGBUF_MSGMAX];
-	char *p;
-	unsigned long index;
-	unsigned long flags;
-	unsigned int printed_len;
-
-	/* Get output from mlogbuf */
-	while (mlogbuf_start != mlogbuf_end) {
-		temp_buf[0] = '\0';
-		p = temp_buf;
-		printed_len = 0;
-
-		spin_lock_irqsave(&mlogbuf_rlock, flags);
-
-		index = mlogbuf_start;
-		while (index != mlogbuf_end) {
-			*p = mlogbuf[index];
-			index = (index + 1) % MLOGBUF_SIZE;
-			if (!*p)
-				break;
-			p++;
-			if (++printed_len >= MLOGBUF_MSGMAX - 1)
-				break;
-		}
-		*p = '\0';
-		if (temp_buf[0])
-			printk("%s", temp_buf);
-		mlogbuf_start = index;
-
-		mlogbuf_timestamp = 0;
-		spin_unlock_irqrestore(&mlogbuf_rlock, flags);
-	}
-}
-EXPORT_SYMBOL(ia64_mlogbuf_dump);
-
-/*
- * Call this if system is going to down or if immediate flushing messages to
- * console is required. (ex. recovery was failed, crash dump is going to be
- * invoked, long-wait rendezvous etc.)
- *  NOTE: this should be called from monarch.
- */
-static void ia64_mlogbuf_finish(int wait)
-{
-	BREAK_LOGLEVEL(console_loglevel);
-
-	ia64_mlogbuf_dump();
-	printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, "
-		"MCA/INIT might be dodgy or fail.\n");
-
-	if (!wait)
-		return;
-
-	/* wait for console */
-	printk("Delaying for 5 seconds...\n");
-	udelay(5*1000000);
-
-	mlogbuf_finished = 1;
-}
-
-/*
- * Print buffered messages from INIT context.
- */
-static void ia64_mlogbuf_dump_from_init(void)
-{
-	if (mlogbuf_finished)
-		return;
-
-	if (mlogbuf_timestamp &&
-			time_before(jiffies, mlogbuf_timestamp + 30 * HZ)) {
-		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT "
-			" and the system seems to be messed up.\n");
-		ia64_mlogbuf_finish(0);
-		return;
-	}
-
-	if (!spin_trylock(&mlogbuf_rlock)) {
-		printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. "
-			"Generated messages other than stack dump will be "
-			"buffered to mlogbuf and will be printed later.\n");
-		printk(KERN_ERR "INIT: If messages would not printed after "
-			"this INIT, wait 30sec and assert INIT again.\n");
-		if (!mlogbuf_timestamp)
-			mlogbuf_timestamp = jiffies;
-		return;
-	}
-	spin_unlock(&mlogbuf_rlock);
-	ia64_mlogbuf_dump();
-}
-
-static inline void
-ia64_mca_spin(const char *func)
-{
-	if (monarch_cpu == smp_processor_id())
-		ia64_mlogbuf_finish(0);
-	mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
-	while (1)
-		cpu_relax();
-}
-/*
- * IA64_MCA log support
- */
-#define IA64_MAX_LOGS		2	/* Double-buffering for nested MCAs */
-#define IA64_MAX_LOG_TYPES      4   /* MCA, INIT, CMC, CPE */
-
-typedef struct ia64_state_log_s
-{
-	spinlock_t	isl_lock;
-	int		isl_index;
-	unsigned long	isl_count;
-	ia64_err_rec_t  *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */
-} ia64_state_log_t;
-
-static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES];
-
-#define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock)
-#define IA64_LOG_LOCK(it)      spin_lock_irqsave(&ia64_state_log[it].isl_lock, s)
-#define IA64_LOG_UNLOCK(it)    spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s)
-#define IA64_LOG_NEXT_INDEX(it)    ia64_state_log[it].isl_index
-#define IA64_LOG_CURR_INDEX(it)    1 - ia64_state_log[it].isl_index
-#define IA64_LOG_INDEX_INC(it) \
-    {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \
-    ia64_state_log[it].isl_count++;}
-#define IA64_LOG_INDEX_DEC(it) \
-    ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index
-#define IA64_LOG_NEXT_BUFFER(it)   (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)]))
-#define IA64_LOG_CURR_BUFFER(it)   (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)]))
-#define IA64_LOG_COUNT(it)         ia64_state_log[it].isl_count
-
-static inline void ia64_log_allocate(int it, u64 size)
-{
-	ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] =
-		(ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES);
-	if (!ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])
-		panic("%s: Failed to allocate %llu bytes\n", __func__, size);
-
-	ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] =
-		(ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES);
-	if (!ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)])
-		panic("%s: Failed to allocate %llu bytes\n", __func__, size);
-}
-
-/*
- * ia64_log_init
- *	Reset the OS ia64 log buffer
- * Inputs   :   info_type   (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
- * Outputs	:	None
- */
-static void __init
-ia64_log_init(int sal_info_type)
-{
-	u64	max_size = 0;
-
-	IA64_LOG_NEXT_INDEX(sal_info_type) = 0;
-	IA64_LOG_LOCK_INIT(sal_info_type);
-
-	// SAL will tell us the maximum size of any error record of this type
-	max_size = ia64_sal_get_state_info_size(sal_info_type);
-	if (!max_size)
-		/* alloc_bootmem() doesn't like zero-sized allocations! */
-		return;
-
-	// set up OS data structures to hold error info
-	ia64_log_allocate(sal_info_type, max_size);
-}
-
-/*
- * ia64_log_get
- *
- *	Get the current MCA log from SAL and copy it into the OS log buffer.
- *
- *  Inputs  :   info_type   (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE})
- *              irq_safe    whether you can use printk at this point
- *  Outputs :   size        (total record length)
- *              *buffer     (ptr to error record)
- *
- */
-static u64
-ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe)
-{
-	sal_log_record_header_t     *log_buffer;
-	u64                         total_len = 0;
-	unsigned long               s;
-
-	IA64_LOG_LOCK(sal_info_type);
-
-	/* Get the process state information */
-	log_buffer = IA64_LOG_NEXT_BUFFER(sal_info_type);
-
-	total_len = ia64_sal_get_state_info(sal_info_type, (u64 *)log_buffer);
-
-	if (total_len) {
-		IA64_LOG_INDEX_INC(sal_info_type);
-		IA64_LOG_UNLOCK(sal_info_type);
-		if (irq_safe) {
-			IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. Record length = %ld\n",
-				       __func__, sal_info_type, total_len);
-		}
-		*buffer = (u8 *) log_buffer;
-		return total_len;
-	} else {
-		IA64_LOG_UNLOCK(sal_info_type);
-		return 0;
-	}
-}
-
-/*
- *  ia64_mca_log_sal_error_record
- *
- *  This function retrieves a specified error record type from SAL
- *  and wakes up any processes waiting for error records.
- *
- *  Inputs  :   sal_info_type   (Type of error record MCA/CMC/CPE)
- *              FIXME: remove MCA and irq_safe.
- */
-static void
-ia64_mca_log_sal_error_record(int sal_info_type)
-{
-	u8 *buffer;
-	sal_log_record_header_t *rh;
-	u64 size;
-	int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA;
-#ifdef IA64_MCA_DEBUG_INFO
-	static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" };
-#endif
-
-	size = ia64_log_get(sal_info_type, &buffer, irq_safe);
-	if (!size)
-		return;
-
-	salinfo_log_wakeup(sal_info_type, buffer, size, irq_safe);
-
-	if (irq_safe)
-		IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record\n",
-			smp_processor_id(),
-			sal_info_type < ARRAY_SIZE(rec_name) ? rec_name[sal_info_type] : "UNKNOWN");
-
-	/* Clear logs from corrected errors in case there's no user-level logger */
-	rh = (sal_log_record_header_t *)buffer;
-	if (rh->severity == sal_log_severity_corrected)
-		ia64_sal_clear_state_info(sal_info_type);
-}
-
-/*
- * search_mca_table
- *  See if the MCA surfaced in an instruction range
- *  that has been tagged as recoverable.
- *
- *  Inputs
- *	first	First address range to check
- *	last	Last address range to check
- *	ip	Instruction pointer, address we are looking for
- *
- * Return value:
- *      1 on Success (in the table)/ 0 on Failure (not in the  table)
- */
-int
-search_mca_table (const struct mca_table_entry *first,
-                const struct mca_table_entry *last,
-                unsigned long ip)
-{
-        const struct mca_table_entry *curr;
-        u64 curr_start, curr_end;
-
-        curr = first;
-        while (curr <= last) {
-                curr_start = (u64) &curr->start_addr + curr->start_addr;
-                curr_end = (u64) &curr->end_addr + curr->end_addr;
-
-                if ((ip >= curr_start) && (ip <= curr_end)) {
-                        return 1;
-                }
-                curr++;
-        }
-        return 0;
-}
-
-/* Given an address, look for it in the mca tables. */
-int mca_recover_range(unsigned long addr)
-{
-	extern struct mca_table_entry __start___mca_table[];
-	extern struct mca_table_entry __stop___mca_table[];
-
-	return search_mca_table(__start___mca_table, __stop___mca_table-1, addr);
-}
-EXPORT_SYMBOL_GPL(mca_recover_range);
-
-int cpe_vector = -1;
-int ia64_cpe_irq = -1;
-
-static irqreturn_t
-ia64_mca_cpe_int_handler (int cpe_irq, void *arg)
-{
-	static unsigned long	cpe_history[CPE_HISTORY_LENGTH];
-	static int		index;
-	static DEFINE_SPINLOCK(cpe_history_lock);
-
-	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
-		       __func__, cpe_irq, smp_processor_id());
-
-	/* SAL spec states this should run w/ interrupts enabled */
-	local_irq_enable();
-
-	spin_lock(&cpe_history_lock);
-	if (!cpe_poll_enabled && cpe_vector >= 0) {
-
-		int i, count = 1; /* we know 1 happened now */
-		unsigned long now = jiffies;
-
-		for (i = 0; i < CPE_HISTORY_LENGTH; i++) {
-			if (now - cpe_history[i] <= HZ)
-				count++;
-		}
-
-		IA64_MCA_DEBUG(KERN_INFO "CPE threshold %d/%d\n", count, CPE_HISTORY_LENGTH);
-		if (count >= CPE_HISTORY_LENGTH) {
-
-			cpe_poll_enabled = 1;
-			spin_unlock(&cpe_history_lock);
-			disable_irq_nosync(local_vector_to_irq(IA64_CPE_VECTOR));
-
-			/*
-			 * Corrected errors will still be corrected, but
-			 * make sure there's a log somewhere that indicates
-			 * something is generating more than we can handle.
-			 */
-			printk(KERN_WARNING "WARNING: Switching to polling CPE handler; error records may be lost\n");
-
-			mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL);
-
-			/* lock already released, get out now */
-			goto out;
-		} else {
-			cpe_history[index++] = now;
-			if (index == CPE_HISTORY_LENGTH)
-				index = 0;
-		}
-	}
-	spin_unlock(&cpe_history_lock);
-out:
-	/* Get the CPE error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
-
-	local_irq_disable();
-
-	return IRQ_HANDLED;
-}
-
-/*
- * ia64_mca_register_cpev
- *
- *  Register the corrected platform error vector with SAL.
- *
- *  Inputs
- *      cpev        Corrected Platform Error Vector number
- *
- *  Outputs
- *      None
- */
-void
-ia64_mca_register_cpev (int cpev)
-{
-	/* Register the CPE interrupt vector with SAL */
-	struct ia64_sal_retval isrv;
-
-	isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0);
-	if (isrv.status) {
-		printk(KERN_ERR "Failed to register Corrected Platform "
-		       "Error interrupt vector with SAL (status %ld)\n", isrv.status);
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: corrected platform error "
-		       "vector %#x registered\n", __func__, cpev);
-}
-
-/*
- * ia64_mca_cmc_vector_setup
- *
- *  Setup the corrected machine check vector register in the processor.
- *  (The interrupt is masked on boot. ia64_mca_late_init unmask this.)
- *  This function is invoked on a per-processor basis.
- *
- * Inputs
- *      None
- *
- * Outputs
- *	None
- */
-void
-ia64_mca_cmc_vector_setup (void)
-{
-	cmcv_reg_t	cmcv;
-
-	cmcv.cmcv_regval	= 0;
-	cmcv.cmcv_mask		= 1;        /* Mask/disable interrupt at first */
-	cmcv.cmcv_vector	= IA64_CMC_VECTOR;
-	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
-
-	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x registered.\n",
-		       __func__, smp_processor_id(), IA64_CMC_VECTOR);
-
-	IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n",
-		       __func__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV));
-}
-
-/*
- * ia64_mca_cmc_vector_disable
- *
- *  Mask the corrected machine check vector register in the processor.
- *  This function is invoked on a per-processor basis.
- *
- * Inputs
- *      dummy(unused)
- *
- * Outputs
- *	None
- */
-static void
-ia64_mca_cmc_vector_disable (void *dummy)
-{
-	cmcv_reg_t	cmcv;
-
-	cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV);
-
-	cmcv.cmcv_mask = 1; /* Mask/disable interrupt */
-	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
-
-	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x disabled.\n",
-		       __func__, smp_processor_id(), cmcv.cmcv_vector);
-}
-
-/*
- * ia64_mca_cmc_vector_enable
- *
- *  Unmask the corrected machine check vector register in the processor.
- *  This function is invoked on a per-processor basis.
- *
- * Inputs
- *      dummy(unused)
- *
- * Outputs
- *	None
- */
-static void
-ia64_mca_cmc_vector_enable (void *dummy)
-{
-	cmcv_reg_t	cmcv;
-
-	cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV);
-
-	cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */
-	ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval);
-
-	IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x enabled.\n",
-		       __func__, smp_processor_id(), cmcv.cmcv_vector);
-}
-
-/*
- * ia64_mca_cmc_vector_disable_keventd
- *
- * Called via keventd (smp_call_function() is not safe in interrupt context) to
- * disable the cmc interrupt vector.
- */
-static void
-ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused)
-{
-	on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0);
-}
-
-/*
- * ia64_mca_cmc_vector_enable_keventd
- *
- * Called via keventd (smp_call_function() is not safe in interrupt context) to
- * enable the cmc interrupt vector.
- */
-static void
-ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused)
-{
-	on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0);
-}
-
-/*
- * ia64_mca_wakeup
- *
- *	Send an inter-cpu interrupt to wake-up a particular cpu.
- *
- *  Inputs  :   cpuid
- *  Outputs :   None
- */
-static void
-ia64_mca_wakeup(int cpu)
-{
-	ia64_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0);
-}
-
-/*
- * ia64_mca_wakeup_all
- *
- *	Wakeup all the slave cpus which have rendez'ed previously.
- *
- *  Inputs  :   None
- *  Outputs :   None
- */
-static void
-ia64_mca_wakeup_all(void)
-{
-	int cpu;
-
-	/* Clear the Rendez checkin flag for all cpus */
-	for_each_online_cpu(cpu) {
-		if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE)
-			ia64_mca_wakeup(cpu);
-	}
-
-}
-
-/*
- * ia64_mca_rendez_interrupt_handler
- *
- *	This is handler used to put slave processors into spinloop
- *	while the monarch processor does the mca handling and later
- *	wake each slave up once the monarch is done.  The state
- *	IA64_MCA_RENDEZ_CHECKIN_DONE indicates the cpu is rendez'ed
- *	in SAL.  The state IA64_MCA_RENDEZ_CHECKIN_NOTDONE indicates
- *	the cpu has come out of OS rendezvous.
- *
- *  Inputs  :   None
- *  Outputs :   None
- */
-static irqreturn_t
-ia64_mca_rendez_int_handler(int rendez_irq, void *arg)
-{
-	unsigned long flags;
-	int cpu = smp_processor_id();
-	struct ia64_mca_notify_die nd =
-		{ .sos = NULL, .monarch_cpu = &monarch_cpu };
-
-	/* Mask all interrupts */
-	local_irq_save(flags);
-
-	NOTIFY_MCA(DIE_MCA_RENDZVOUS_ENTER, get_irq_regs(), (long)&nd, 1);
-
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE;
-	/* Register with the SAL monarch that the slave has
-	 * reached SAL
-	 */
-	ia64_sal_mc_rendez();
-
-	NOTIFY_MCA(DIE_MCA_RENDZVOUS_PROCESS, get_irq_regs(), (long)&nd, 1);
-
-	/* Wait for the monarch cpu to exit. */
-	while (monarch_cpu != -1)
-	       cpu_relax();	/* spin until monarch leaves */
-
-	NOTIFY_MCA(DIE_MCA_RENDZVOUS_LEAVE, get_irq_regs(), (long)&nd, 1);
-
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-	/* Enable all interrupts */
-	local_irq_restore(flags);
-	return IRQ_HANDLED;
-}
-
-/*
- * ia64_mca_wakeup_int_handler
- *
- *	The interrupt handler for processing the inter-cpu interrupt to the
- *	slave cpu which was spinning in the rendez loop.
- *	Since this spinning is done by turning off the interrupts and
- *	polling on the wakeup-interrupt bit in the IRR, there is
- *	nothing useful to be done in the handler.
- *
- *  Inputs  :   wakeup_irq  (Wakeup-interrupt bit)
- *	arg		(Interrupt handler specific argument)
- *  Outputs :   None
- *
- */
-static irqreturn_t
-ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg)
-{
-	return IRQ_HANDLED;
-}
-
-/* Function pointer for extra MCA recovery */
-int (*ia64_mca_ucmc_extension)
-	(void*,struct ia64_sal_os_state*)
-	= NULL;
-
-int
-ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *))
-{
-	if (ia64_mca_ucmc_extension)
-		return 1;
-
-	ia64_mca_ucmc_extension = fn;
-	return 0;
-}
-
-void
-ia64_unreg_MCA_extension(void)
-{
-	if (ia64_mca_ucmc_extension)
-		ia64_mca_ucmc_extension = NULL;
-}
-
-EXPORT_SYMBOL(ia64_reg_MCA_extension);
-EXPORT_SYMBOL(ia64_unreg_MCA_extension);
-
-
-static inline void
-copy_reg(const u64 *fr, u64 fnat, unsigned long *tr, unsigned long *tnat)
-{
-	u64 fslot, tslot, nat;
-	*tr = *fr;
-	fslot = ((unsigned long)fr >> 3) & 63;
-	tslot = ((unsigned long)tr >> 3) & 63;
-	*tnat &= ~(1UL << tslot);
-	nat = (fnat >> fslot) & 1;
-	*tnat |= (nat << tslot);
-}
-
-/* Change the comm field on the MCA/INT task to include the pid that
- * was interrupted, it makes for easier debugging.  If that pid was 0
- * (swapper or nested MCA/INIT) then use the start of the previous comm
- * field suffixed with its cpu.
- */
-
-static void
-ia64_mca_modify_comm(const struct task_struct *previous_current)
-{
-	char *p, comm[sizeof(current->comm)];
-	if (previous_current->pid)
-		snprintf(comm, sizeof(comm), "%s %d",
-			current->comm, previous_current->pid);
-	else {
-		int l;
-		if ((p = strchr(previous_current->comm, ' ')))
-			l = p - previous_current->comm;
-		else
-			l = strlen(previous_current->comm);
-		snprintf(comm, sizeof(comm), "%s %*s %d",
-			current->comm, l, previous_current->comm,
-			task_thread_info(previous_current)->cpu);
-	}
-	memcpy(current->comm, comm, sizeof(current->comm));
-}
-
-static void
-finish_pt_regs(struct pt_regs *regs, struct ia64_sal_os_state *sos,
-		unsigned long *nat)
-{
-	const struct pal_min_state_area *ms = sos->pal_min_state;
-	const u64 *bank;
-
-	/* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use
-	 * pmsa_{xip,xpsr,xfs}
-	 */
-	if (ia64_psr(regs)->ic) {
-		regs->cr_iip = ms->pmsa_iip;
-		regs->cr_ipsr = ms->pmsa_ipsr;
-		regs->cr_ifs = ms->pmsa_ifs;
-	} else {
-		regs->cr_iip = ms->pmsa_xip;
-		regs->cr_ipsr = ms->pmsa_xpsr;
-		regs->cr_ifs = ms->pmsa_xfs;
-
-		sos->iip = ms->pmsa_iip;
-		sos->ipsr = ms->pmsa_ipsr;
-		sos->ifs = ms->pmsa_ifs;
-	}
-	regs->pr = ms->pmsa_pr;
-	regs->b0 = ms->pmsa_br0;
-	regs->ar_rsc = ms->pmsa_rsc;
-	copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, &regs->r1, nat);
-	copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, &regs->r2, nat);
-	copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, &regs->r3, nat);
-	copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, &regs->r8, nat);
-	copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, &regs->r9, nat);
-	copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, &regs->r10, nat);
-	copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, &regs->r11, nat);
-	copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, &regs->r12, nat);
-	copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, &regs->r13, nat);
-	copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, &regs->r14, nat);
-	copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, &regs->r15, nat);
-	if (ia64_psr(regs)->bn)
-		bank = ms->pmsa_bank1_gr;
-	else
-		bank = ms->pmsa_bank0_gr;
-	copy_reg(&bank[16-16], ms->pmsa_nat_bits, &regs->r16, nat);
-	copy_reg(&bank[17-16], ms->pmsa_nat_bits, &regs->r17, nat);
-	copy_reg(&bank[18-16], ms->pmsa_nat_bits, &regs->r18, nat);
-	copy_reg(&bank[19-16], ms->pmsa_nat_bits, &regs->r19, nat);
-	copy_reg(&bank[20-16], ms->pmsa_nat_bits, &regs->r20, nat);
-	copy_reg(&bank[21-16], ms->pmsa_nat_bits, &regs->r21, nat);
-	copy_reg(&bank[22-16], ms->pmsa_nat_bits, &regs->r22, nat);
-	copy_reg(&bank[23-16], ms->pmsa_nat_bits, &regs->r23, nat);
-	copy_reg(&bank[24-16], ms->pmsa_nat_bits, &regs->r24, nat);
-	copy_reg(&bank[25-16], ms->pmsa_nat_bits, &regs->r25, nat);
-	copy_reg(&bank[26-16], ms->pmsa_nat_bits, &regs->r26, nat);
-	copy_reg(&bank[27-16], ms->pmsa_nat_bits, &regs->r27, nat);
-	copy_reg(&bank[28-16], ms->pmsa_nat_bits, &regs->r28, nat);
-	copy_reg(&bank[29-16], ms->pmsa_nat_bits, &regs->r29, nat);
-	copy_reg(&bank[30-16], ms->pmsa_nat_bits, &regs->r30, nat);
-	copy_reg(&bank[31-16], ms->pmsa_nat_bits, &regs->r31, nat);
-}
-
-/* On entry to this routine, we are running on the per cpu stack, see
- * mca_asm.h.  The original stack has not been touched by this event.  Some of
- * the original stack's registers will be in the RBS on this stack.  This stack
- * also contains a partial pt_regs and switch_stack, the rest of the data is in
- * PAL minstate.
- *
- * The first thing to do is modify the original stack to look like a blocked
- * task so we can run backtrace on the original task.  Also mark the per cpu
- * stack as current to ensure that we use the correct task state, it also means
- * that we can do backtrace on the MCA/INIT handler code itself.
- */
-
-static struct task_struct *
-ia64_mca_modify_original_stack(struct pt_regs *regs,
-		const struct switch_stack *sw,
-		struct ia64_sal_os_state *sos,
-		const char *type)
-{
-	char *p;
-	ia64_va va;
-	extern char ia64_leave_kernel[];	/* Need asm address, not function descriptor */
-	const struct pal_min_state_area *ms = sos->pal_min_state;
-	struct task_struct *previous_current;
-	struct pt_regs *old_regs;
-	struct switch_stack *old_sw;
-	unsigned size = sizeof(struct pt_regs) +
-			sizeof(struct switch_stack) + 16;
-	unsigned long *old_bspstore, *old_bsp;
-	unsigned long *new_bspstore, *new_bsp;
-	unsigned long old_unat, old_rnat, new_rnat, nat;
-	u64 slots, loadrs = regs->loadrs;
-	u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1];
-	u64 ar_bspstore = regs->ar_bspstore;
-	u64 ar_bsp = regs->ar_bspstore + (loadrs >> 16);
-	const char *msg;
-	int cpu = smp_processor_id();
-
-	previous_current = curr_task(cpu);
-	ia64_set_curr_task(cpu, current);
-	if ((p = strchr(current->comm, ' ')))
-		*p = '\0';
-
-	/* Best effort attempt to cope with MCA/INIT delivered while in
-	 * physical mode.
-	 */
-	regs->cr_ipsr = ms->pmsa_ipsr;
-	if (ia64_psr(regs)->dt == 0) {
-		va.l = r12;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			r12 = va.l;
-		}
-		va.l = r13;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			r13 = va.l;
-		}
-	}
-	if (ia64_psr(regs)->rt == 0) {
-		va.l = ar_bspstore;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			ar_bspstore = va.l;
-		}
-		va.l = ar_bsp;
-		if (va.f.reg == 0) {
-			va.f.reg = 7;
-			ar_bsp = va.l;
-		}
-	}
-
-	/* mca_asm.S ia64_old_stack() cannot assume that the dirty registers
-	 * have been copied to the old stack, the old stack may fail the
-	 * validation tests below.  So ia64_old_stack() must restore the dirty
-	 * registers from the new stack.  The old and new bspstore probably
-	 * have different alignments, so loadrs calculated on the old bsp
-	 * cannot be used to restore from the new bsp.  Calculate a suitable
-	 * loadrs for the new stack and save it in the new pt_regs, where
-	 * ia64_old_stack() can get it.
-	 */
-	old_bspstore = (unsigned long *)ar_bspstore;
-	old_bsp = (unsigned long *)ar_bsp;
-	slots = ia64_rse_num_regs(old_bspstore, old_bsp);
-	new_bspstore = (unsigned long *)((u64)current + IA64_RBS_OFFSET);
-	new_bsp = ia64_rse_skip_regs(new_bspstore, slots);
-	regs->loadrs = (new_bsp - new_bspstore) * 8 << 16;
-
-	/* Verify the previous stack state before we change it */
-	if (user_mode(regs)) {
-		msg = "occurred in user space";
-		/* previous_current is guaranteed to be valid when the task was
-		 * in user space, so ...
-		 */
-		ia64_mca_modify_comm(previous_current);
-		goto no_mod;
-	}
-
-	if (r13 != sos->prev_IA64_KR_CURRENT) {
-		msg = "inconsistent previous current and r13";
-		goto no_mod;
-	}
-
-	if (!mca_recover_range(ms->pmsa_iip)) {
-		if ((r12 - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent r12 and r13";
-			goto no_mod;
-		}
-		if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent ar.bspstore and r13";
-			goto no_mod;
-		}
-		va.p = old_bspstore;
-		if (va.f.reg < 5) {
-			msg = "old_bspstore is in the wrong region";
-			goto no_mod;
-		}
-		if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) {
-			msg = "inconsistent ar.bsp and r13";
-			goto no_mod;
-		}
-		size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8;
-		if (ar_bspstore + size > r12) {
-			msg = "no room for blocked state";
-			goto no_mod;
-		}
-	}
-
-	ia64_mca_modify_comm(previous_current);
-
-	/* Make the original task look blocked.  First stack a struct pt_regs,
-	 * describing the state at the time of interrupt.  mca_asm.S built a
-	 * partial pt_regs, copy it and fill in the blanks using minstate.
-	 */
-	p = (char *)r12 - sizeof(*regs);
-	old_regs = (struct pt_regs *)p;
-	memcpy(old_regs, regs, sizeof(*regs));
-	old_regs->loadrs = loadrs;
-	old_unat = old_regs->ar_unat;
-	finish_pt_regs(old_regs, sos, &old_unat);
-
-	/* Next stack a struct switch_stack.  mca_asm.S built a partial
-	 * switch_stack, copy it and fill in the blanks using pt_regs and
-	 * minstate.
-	 *
-	 * In the synthesized switch_stack, b0 points to ia64_leave_kernel,
-	 * ar.pfs is set to 0.
-	 *
-	 * unwind.c::unw_unwind() does special processing for interrupt frames.
-	 * It checks if the PRED_NON_SYSCALL predicate is set, if the predicate
-	 * is clear then unw_unwind() does _not_ adjust bsp over pt_regs.  Not
-	 * that this is documented, of course.  Set PRED_NON_SYSCALL in the
-	 * switch_stack on the original stack so it will unwind correctly when
-	 * unwind.c reads pt_regs.
-	 *
-	 * thread.ksp is updated to point to the synthesized switch_stack.
-	 */
-	p -= sizeof(struct switch_stack);
-	old_sw = (struct switch_stack *)p;
-	memcpy(old_sw, sw, sizeof(*sw));
-	old_sw->caller_unat = old_unat;
-	old_sw->ar_fpsr = old_regs->ar_fpsr;
-	copy_reg(&ms->pmsa_gr[4-1], ms->pmsa_nat_bits, &old_sw->r4, &old_unat);
-	copy_reg(&ms->pmsa_gr[5-1], ms->pmsa_nat_bits, &old_sw->r5, &old_unat);
-	copy_reg(&ms->pmsa_gr[6-1], ms->pmsa_nat_bits, &old_sw->r6, &old_unat);
-	copy_reg(&ms->pmsa_gr[7-1], ms->pmsa_nat_bits, &old_sw->r7, &old_unat);
-	old_sw->b0 = (u64)ia64_leave_kernel;
-	old_sw->b1 = ms->pmsa_br1;
-	old_sw->ar_pfs = 0;
-	old_sw->ar_unat = old_unat;
-	old_sw->pr = old_regs->pr | (1UL << PRED_NON_SYSCALL);
-	previous_current->thread.ksp = (u64)p - 16;
-
-	/* Finally copy the original stack's registers back to its RBS.
-	 * Registers from ar.bspstore through ar.bsp at the time of the event
-	 * are in the current RBS, copy them back to the original stack.  The
-	 * copy must be done register by register because the original bspstore
-	 * and the current one have different alignments, so the saved RNAT
-	 * data occurs at different places.
-	 *
-	 * mca_asm does cover, so the old_bsp already includes all registers at
-	 * the time of MCA/INIT.  It also does flushrs, so all registers before
-	 * this function have been written to backing store on the MCA/INIT
-	 * stack.
-	 */
-	new_rnat = ia64_get_rnat(ia64_rse_rnat_addr(new_bspstore));
-	old_rnat = regs->ar_rnat;
-	while (slots--) {
-		if (ia64_rse_is_rnat_slot(new_bspstore)) {
-			new_rnat = ia64_get_rnat(new_bspstore++);
-		}
-		if (ia64_rse_is_rnat_slot(old_bspstore)) {
-			*old_bspstore++ = old_rnat;
-			old_rnat = 0;
-		}
-		nat = (new_rnat >> ia64_rse_slot_num(new_bspstore)) & 1UL;
-		old_rnat &= ~(1UL << ia64_rse_slot_num(old_bspstore));
-		old_rnat |= (nat << ia64_rse_slot_num(old_bspstore));
-		*old_bspstore++ = *new_bspstore++;
-	}
-	old_sw->ar_bspstore = (unsigned long)old_bspstore;
-	old_sw->ar_rnat = old_rnat;
-
-	sos->prev_task = previous_current;
-	return previous_current;
-
-no_mod:
-	mprintk(KERN_INFO "cpu %d, %s %s, original stack not modified\n",
-			smp_processor_id(), type, msg);
-	old_unat = regs->ar_unat;
-	finish_pt_regs(regs, sos, &old_unat);
-	return previous_current;
-}
-
-/* The monarch/slave interaction is based on monarch_cpu and requires that all
- * slaves have entered rendezvous before the monarch leaves.  If any cpu has
- * not entered rendezvous yet then wait a bit.  The assumption is that any
- * slave that has not rendezvoused after a reasonable time is never going to do
- * so.  In this context, slave includes cpus that respond to the MCA rendezvous
- * interrupt, as well as cpus that receive the INIT slave event.
- */
-
-static void
-ia64_wait_for_slaves(int monarch, const char *type)
-{
-	int c, i , wait;
-
-	/*
-	 * wait 5 seconds total for slaves (arbitrary)
-	 */
-	for (i = 0; i < 5000; i++) {
-		wait = 0;
-		for_each_online_cpu(c) {
-			if (c == monarch)
-				continue;
-			if (ia64_mc_info.imi_rendez_checkin[c]
-					== IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
-				udelay(1000);		/* short wait */
-				wait = 1;
-				break;
-			}
-		}
-		if (!wait)
-			goto all_in;
-	}
-
-	/*
-	 * Maybe slave(s) dead. Print buffered messages immediately.
-	 */
-	ia64_mlogbuf_finish(0);
-	mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
-	for_each_online_cpu(c) {
-		if (c == monarch)
-			continue;
-		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
-			mprintk(" %d", c);
-	}
-	mprintk("\n");
-	return;
-
-all_in:
-	mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
-	return;
-}
-
-/*  mca_insert_tr
- *
- *  Switch rid when TR reload and needed!
- *  iord: 1: itr, 2: itr;
- *
-*/
-static void mca_insert_tr(u64 iord)
-{
-
-	int i;
-	u64 old_rr;
-	struct ia64_tr_entry *p;
-	unsigned long psr;
-	int cpu = smp_processor_id();
-
-	if (!ia64_idtrs[cpu])
-		return;
-
-	psr = ia64_clear_ic();
-	for (i = IA64_TR_ALLOC_BASE; i < IA64_TR_ALLOC_MAX; i++) {
-		p = ia64_idtrs[cpu] + (iord - 1) * IA64_TR_ALLOC_MAX;
-		if (p->pte & 0x1) {
-			old_rr = ia64_get_rr(p->ifa);
-			if (old_rr != p->rr) {
-				ia64_set_rr(p->ifa, p->rr);
-				ia64_srlz_d();
-			}
-			ia64_ptr(iord, p->ifa, p->itir >> 2);
-			ia64_srlz_i();
-			if (iord & 0x1) {
-				ia64_itr(0x1, i, p->ifa, p->pte, p->itir >> 2);
-				ia64_srlz_i();
-			}
-			if (iord & 0x2) {
-				ia64_itr(0x2, i, p->ifa, p->pte, p->itir >> 2);
-				ia64_srlz_i();
-			}
-			if (old_rr != p->rr) {
-				ia64_set_rr(p->ifa, old_rr);
-				ia64_srlz_d();
-			}
-		}
-	}
-	ia64_set_psr(psr);
-}
-
-/*
- * ia64_mca_handler
- *
- *	This is uncorrectable machine check handler called from OS_MCA
- *	dispatch code which is in turn called from SAL_CHECK().
- *	This is the place where the core of OS MCA handling is done.
- *	Right now the logs are extracted and displayed in a well-defined
- *	format. This handler code is supposed to be run only on the
- *	monarch processor. Once the monarch is done with MCA handling
- *	further MCA logging is enabled by clearing logs.
- *	Monarch also has the duty of sending wakeup-IPIs to pull the
- *	slave processors out of rendezvous spinloop.
- *
- *	If multiple processors call into OS_MCA, the first will become
- *	the monarch.  Subsequent cpus will be recorded in the mca_cpu
- *	bitmask.  After the first monarch has processed its MCA, it
- *	will wake up the next cpu in the mca_cpu bitmask and then go
- *	into the rendezvous loop.  When all processors have serviced
- *	their MCA, the last monarch frees up the rest of the processors.
- */
-void
-ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
-		 struct ia64_sal_os_state *sos)
-{
-	int recover, cpu = smp_processor_id();
-	struct task_struct *previous_current;
-	struct ia64_mca_notify_die nd =
-		{ .sos = sos, .monarch_cpu = &monarch_cpu, .data = &recover };
-	static atomic_t mca_count;
-	static cpumask_t mca_cpu;
-
-	if (atomic_add_return(1, &mca_count) == 1) {
-		monarch_cpu = cpu;
-		sos->monarch = 1;
-	} else {
-		cpumask_set_cpu(cpu, &mca_cpu);
-		sos->monarch = 0;
-	}
-	mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d "
-		"monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch);
-
-	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
-
-	NOTIFY_MCA(DIE_MCA_MONARCH_ENTER, regs, (long)&nd, 1);
-
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA;
-	if (sos->monarch) {
-		ia64_wait_for_slaves(cpu, "MCA");
-
-		/* Wakeup all the processors which are spinning in the
-		 * rendezvous loop.  They will leave SAL, then spin in the OS
-		 * with interrupts disabled until this monarch cpu leaves the
-		 * MCA handler.  That gets control back to the OS so we can
-		 * backtrace the other cpus, backtrace when spinning in SAL
-		 * does not work.
-		 */
-		ia64_mca_wakeup_all();
-	} else {
-		while (cpumask_test_cpu(cpu, &mca_cpu))
-			cpu_relax();	/* spin until monarch wakes us */
-	}
-
-	NOTIFY_MCA(DIE_MCA_MONARCH_PROCESS, regs, (long)&nd, 1);
-
-	/* Get the MCA error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
-
-	/* MCA error recovery */
-	recover = (ia64_mca_ucmc_extension
-		&& ia64_mca_ucmc_extension(
-			IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
-			sos));
-
-	if (recover) {
-		sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
-		rh->severity = sal_log_severity_corrected;
-		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
-		sos->os_status = IA64_MCA_CORRECTED;
-	} else {
-		/* Dump buffered message to console */
-		ia64_mlogbuf_finish(1);
-	}
-
-	if (__this_cpu_read(ia64_mca_tr_reload)) {
-		mca_insert_tr(0x1); /*Reload dynamic itrs*/
-		mca_insert_tr(0x2); /*Reload dynamic itrs*/
-	}
-
-	NOTIFY_MCA(DIE_MCA_MONARCH_LEAVE, regs, (long)&nd, 1);
-
-	if (atomic_dec_return(&mca_count) > 0) {
-		int i;
-
-		/* wake up the next monarch cpu,
-		 * and put this cpu in the rendez loop.
-		 */
-		for_each_online_cpu(i) {
-			if (cpumask_test_cpu(i, &mca_cpu)) {
-				monarch_cpu = i;
-				cpumask_clear_cpu(i, &mca_cpu);	/* wake next cpu */
-				while (monarch_cpu != -1)
-					cpu_relax();	/* spin until last cpu leaves */
-				ia64_set_curr_task(cpu, previous_current);
-				ia64_mc_info.imi_rendez_checkin[cpu]
-						= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-				return;
-			}
-		}
-	}
-	ia64_set_curr_task(cpu, previous_current);
-	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
-}
-
-static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd);
-static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd);
-
-/*
- * ia64_mca_cmc_int_handler
- *
- *  This is corrected machine check interrupt handler.
- *	Right now the logs are extracted and displayed in a well-defined
- *	format.
- *
- * Inputs
- *      interrupt number
- *      client data arg ptr
- *
- * Outputs
- *	None
- */
-static irqreturn_t
-ia64_mca_cmc_int_handler(int cmc_irq, void *arg)
-{
-	static unsigned long	cmc_history[CMC_HISTORY_LENGTH];
-	static int		index;
-	static DEFINE_SPINLOCK(cmc_history_lock);
-
-	IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
-		       __func__, cmc_irq, smp_processor_id());
-
-	/* SAL spec states this should run w/ interrupts enabled */
-	local_irq_enable();
-
-	spin_lock(&cmc_history_lock);
-	if (!cmc_polling_enabled) {
-		int i, count = 1; /* we know 1 happened now */
-		unsigned long now = jiffies;
-
-		for (i = 0; i < CMC_HISTORY_LENGTH; i++) {
-			if (now - cmc_history[i] <= HZ)
-				count++;
-		}
-
-		IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH);
-		if (count >= CMC_HISTORY_LENGTH) {
-
-			cmc_polling_enabled = 1;
-			spin_unlock(&cmc_history_lock);
-			/* If we're being hit with CMC interrupts, we won't
-			 * ever execute the schedule_work() below.  Need to
-			 * disable CMC interrupts on this processor now.
-			 */
-			ia64_mca_cmc_vector_disable(NULL);
-			schedule_work(&cmc_disable_work);
-
-			/*
-			 * Corrected errors will still be corrected, but
-			 * make sure there's a log somewhere that indicates
-			 * something is generating more than we can handle.
-			 */
-			printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n");
-
-			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
-
-			/* lock already released, get out now */
-			goto out;
-		} else {
-			cmc_history[index++] = now;
-			if (index == CMC_HISTORY_LENGTH)
-				index = 0;
-		}
-	}
-	spin_unlock(&cmc_history_lock);
-out:
-	/* Get the CMC error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);
-
-	local_irq_disable();
-
-	return IRQ_HANDLED;
-}
-
-/*
- *  ia64_mca_cmc_int_caller
- *
- * 	Triggered by sw interrupt from CMC polling routine.  Calls
- * 	real interrupt handler and either triggers a sw interrupt
- * 	on the next cpu or does cleanup at the end.
- *
- * Inputs
- *	interrupt number
- *	client data arg ptr
- * Outputs
- * 	handled
- */
-static irqreturn_t
-ia64_mca_cmc_int_caller(int cmc_irq, void *arg)
-{
-	static int start_count = -1;
-	unsigned int cpuid;
-
-	cpuid = smp_processor_id();
-
-	/* If first cpu, update count */
-	if (start_count == -1)
-		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC);
-
-	ia64_mca_cmc_int_handler(cmc_irq, arg);
-
-	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
-
-	if (cpuid < nr_cpu_ids) {
-		ia64_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
-	} else {
-		/* If no log record, switch out of polling mode */
-		if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) {
-
-			printk(KERN_WARNING "Returning to interrupt driven CMC handler\n");
-			schedule_work(&cmc_enable_work);
-			cmc_polling_enabled = 0;
-
-		} else {
-
-			mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL);
-		}
-
-		start_count = -1;
-	}
-
-	return IRQ_HANDLED;
-}
-
-/*
- *  ia64_mca_cmc_poll
- *
- *	Poll for Corrected Machine Checks (CMCs)
- *
- * Inputs   :   dummy(unused)
- * Outputs  :   None
- *
- */
-static void
-ia64_mca_cmc_poll (struct timer_list *unused)
-{
-	/* Trigger a CMC interrupt cascade  */
-	ia64_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR,
-							IA64_IPI_DM_INT, 0);
-}
-
-/*
- *  ia64_mca_cpe_int_caller
- *
- * 	Triggered by sw interrupt from CPE polling routine.  Calls
- * 	real interrupt handler and either triggers a sw interrupt
- * 	on the next cpu or does cleanup at the end.
- *
- * Inputs
- *	interrupt number
- *	client data arg ptr
- * Outputs
- * 	handled
- */
-static irqreturn_t
-ia64_mca_cpe_int_caller(int cpe_irq, void *arg)
-{
-	static int start_count = -1;
-	static int poll_time = MIN_CPE_POLL_INTERVAL;
-	unsigned int cpuid;
-
-	cpuid = smp_processor_id();
-
-	/* If first cpu, update count */
-	if (start_count == -1)
-		start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE);
-
-	ia64_mca_cpe_int_handler(cpe_irq, arg);
-
-	cpuid = cpumask_next(cpuid+1, cpu_online_mask);
-
-	if (cpuid < NR_CPUS) {
-		ia64_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
-	} else {
-		/*
-		 * If a log was recorded, increase our polling frequency,
-		 * otherwise, backoff or return to interrupt mode.
-		 */
-		if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) {
-			poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2);
-		} else if (cpe_vector < 0) {
-			poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2);
-		} else {
-			poll_time = MIN_CPE_POLL_INTERVAL;
-
-			printk(KERN_WARNING "Returning to interrupt driven CPE handler\n");
-			enable_irq(local_vector_to_irq(IA64_CPE_VECTOR));
-			cpe_poll_enabled = 0;
-		}
-
-		if (cpe_poll_enabled)
-			mod_timer(&cpe_poll_timer, jiffies + poll_time);
-		start_count = -1;
-	}
-
-	return IRQ_HANDLED;
-}
-
-/*
- *  ia64_mca_cpe_poll
- *
- *	Poll for Corrected Platform Errors (CPEs), trigger interrupt
- *	on first cpu, from there it will trickle through all the cpus.
- *
- * Inputs   :   dummy(unused)
- * Outputs  :   None
- *
- */
-static void
-ia64_mca_cpe_poll (struct timer_list *unused)
-{
-	/* Trigger a CPE interrupt cascade  */
-	ia64_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR,
-							IA64_IPI_DM_INT, 0);
-}
-
-static int
-default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data)
-{
-	int c;
-	struct task_struct *g, *t;
-	if (val != DIE_INIT_MONARCH_PROCESS)
-		return NOTIFY_DONE;
-#ifdef CONFIG_KEXEC
-	if (atomic_read(&kdump_in_progress))
-		return NOTIFY_DONE;
-#endif
-
-	/*
-	 * FIXME: mlogbuf will brim over with INIT stack dumps.
-	 * To enable show_stack from INIT, we use oops_in_progress which should
-	 * be used in real oops. This would cause something wrong after INIT.
-	 */
-	BREAK_LOGLEVEL(console_loglevel);
-	ia64_mlogbuf_dump_from_init();
-
-	printk(KERN_ERR "Processes interrupted by INIT -");
-	for_each_online_cpu(c) {
-		struct ia64_sal_os_state *s;
-		t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET);
-		s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET);
-		g = s->prev_task;
-		if (g) {
-			if (g->pid)
-				printk(" %d", g->pid);
-			else
-				printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g);
-		}
-	}
-	printk("\n\n");
-	if (read_trylock(&tasklist_lock)) {
-		for_each_process_thread(g, t) {
-			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
-			show_stack(t, NULL, KERN_DEFAULT);
-		}
-		read_unlock(&tasklist_lock);
-	}
-	/* FIXME: This will not restore zapped printk locks. */
-	RESTORE_LOGLEVEL(console_loglevel);
-	return NOTIFY_DONE;
-}
-
-/*
- * C portion of the OS INIT handler
- *
- * Called from ia64_os_init_dispatch
- *
- * Inputs: pointer to pt_regs where processor info was saved.  SAL/OS state for
- * this event.  This code is used for both monarch and slave INIT events, see
- * sos->monarch.
- *
- * All INIT events switch to the INIT stack and change the previous process to
- * blocked status.  If one of the INIT events is the monarch then we are
- * probably processing the nmi button/command.  Use the monarch cpu to dump all
- * the processes.  The slave INIT events all spin until the monarch cpu
- * returns.  We can also get INIT slave events for MCA, in which case the MCA
- * process is the monarch.
- */
-
-void
-ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
-		  struct ia64_sal_os_state *sos)
-{
-	static atomic_t slaves;
-	static atomic_t monarchs;
-	struct task_struct *previous_current;
-	int cpu = smp_processor_id();
-	struct ia64_mca_notify_die nd =
-		{ .sos = sos, .monarch_cpu = &monarch_cpu };
-
-	NOTIFY_INIT(DIE_INIT_ENTER, regs, (long)&nd, 0);
-
-	mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
-		sos->proc_state_param, cpu, sos->monarch);
-	salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);
-
-	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "INIT");
-	sos->os_status = IA64_INIT_RESUME;
-
-	/* FIXME: Workaround for broken proms that drive all INIT events as
-	 * slaves.  The last slave that enters is promoted to be a monarch.
-	 * Remove this code in September 2006, that gives platforms a year to
-	 * fix their proms and get their customers updated.
-	 */
-	if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) {
-		mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
-		        __func__, cpu);
-		atomic_dec(&slaves);
-		sos->monarch = 1;
-	}
-
-	/* FIXME: Workaround for broken proms that drive all INIT events as
-	 * monarchs.  Second and subsequent monarchs are demoted to slaves.
-	 * Remove this code in September 2006, that gives platforms a year to
-	 * fix their proms and get their customers updated.
-	 */
-	if (sos->monarch && atomic_add_return(1, &monarchs) > 1) {
-		mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
-			       __func__, cpu);
-		atomic_dec(&monarchs);
-		sos->monarch = 0;
-	}
-
-	if (!sos->monarch) {
-		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT;
-
-#ifdef CONFIG_KEXEC
-		while (monarch_cpu == -1 && !atomic_read(&kdump_in_progress))
-			udelay(1000);
-#else
-		while (monarch_cpu == -1)
-			cpu_relax();	/* spin until monarch enters */
-#endif
-
-		NOTIFY_INIT(DIE_INIT_SLAVE_ENTER, regs, (long)&nd, 1);
-		NOTIFY_INIT(DIE_INIT_SLAVE_PROCESS, regs, (long)&nd, 1);
-
-#ifdef CONFIG_KEXEC
-		while (monarch_cpu != -1 && !atomic_read(&kdump_in_progress))
-			udelay(1000);
-#else
-		while (monarch_cpu != -1)
-			cpu_relax();	/* spin until monarch leaves */
-#endif
-
-		NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
-
-		mprintk("Slave on cpu %d returning to normal service.\n", cpu);
-		ia64_set_curr_task(cpu, previous_current);
-		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-		atomic_dec(&slaves);
-		return;
-	}
-
-	monarch_cpu = cpu;
-	NOTIFY_INIT(DIE_INIT_MONARCH_ENTER, regs, (long)&nd, 1);
-
-	/*
-	 * Wait for a bit.  On some machines (e.g., HP's zx2000 and zx6000, INIT can be
-	 * generated via the BMC's command-line interface, but since the console is on the
-	 * same serial line, the user will need some time to switch out of the BMC before
-	 * the dump begins.
-	 */
-	mprintk("Delaying for 5 seconds...\n");
-	udelay(5*1000000);
-	ia64_wait_for_slaves(cpu, "INIT");
-	/* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
-	 * to default_monarch_init_process() above and just print all the
-	 * tasks.
-	 */
-	NOTIFY_INIT(DIE_INIT_MONARCH_PROCESS, regs, (long)&nd, 1);
-	NOTIFY_INIT(DIE_INIT_MONARCH_LEAVE, regs, (long)&nd, 1);
-
-	mprintk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
-	atomic_dec(&monarchs);
-	ia64_set_curr_task(cpu, previous_current);
-	monarch_cpu = -1;
-	return;
-}
-
-static int __init
-ia64_mca_disable_cpe_polling(char *str)
-{
-	cpe_poll_enabled = 0;
-	return 1;
-}
-
-__setup("disable_cpe_poll", ia64_mca_disable_cpe_polling);
-
-/* Minimal format of the MCA/INIT stacks.  The pseudo processes that run on
- * these stacks can never sleep, they cannot return from the kernel to user
- * space, they do not appear in a normal ps listing.  So there is no need to
- * format most of the fields.
- */
-
-static void
-format_mca_init_stack(void *mca_data, unsigned long offset,
-		const char *type, int cpu)
-{
-	struct task_struct *p = (struct task_struct *)((char *)mca_data + offset);
-	struct thread_info *ti;
-	memset(p, 0, KERNEL_STACK_SIZE);
-	ti = task_thread_info(p);
-	ti->flags = _TIF_MCA_INIT;
-	ti->preempt_count = 1;
-	ti->task = p;
-	ti->cpu = cpu;
-	p->stack = ti;
-	p->__state = TASK_UNINTERRUPTIBLE;
-	cpumask_set_cpu(cpu, &p->cpus_mask);
-	INIT_LIST_HEAD(&p->tasks);
-	p->parent = p->real_parent = p->group_leader = p;
-	INIT_LIST_HEAD(&p->children);
-	INIT_LIST_HEAD(&p->sibling);
-	strscpy(p->comm, type, sizeof(p->comm)-1);
-}
-
-/* Caller prevents this from being called after init */
-static void * __ref mca_bootmem(void)
-{
-	return memblock_alloc(sizeof(struct ia64_mca_cpu), KERNEL_STACK_SIZE);
-}
-
-/* Do per-CPU MCA-related initialization.  */
-void
-ia64_mca_cpu_init(void *cpu_data)
-{
-	void *pal_vaddr;
-	void *data;
-	long sz = sizeof(struct ia64_mca_cpu);
-	int cpu = smp_processor_id();
-	static int first_time = 1;
-
-	/*
-	 * Structure will already be allocated if cpu has been online,
-	 * then offlined.
-	 */
-	if (__per_cpu_mca[cpu]) {
-		data = __va(__per_cpu_mca[cpu]);
-	} else {
-		if (first_time) {
-			data = mca_bootmem();
-			first_time = 0;
-		} else
-			data = (void *)__get_free_pages(GFP_ATOMIC,
-							get_order(sz));
-		if (!data)
-			panic("Could not allocate MCA memory for cpu %d\n",
-					cpu);
-	}
-	format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, mca_stack),
-		"MCA", cpu);
-	format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, init_stack),
-		"INIT", cpu);
-	__this_cpu_write(ia64_mca_data, (__per_cpu_mca[cpu] = __pa(data)));
-
-	/*
-	 * Stash away a copy of the PTE needed to map the per-CPU page.
-	 * We may need it during MCA recovery.
-	 */
-	__this_cpu_write(ia64_mca_per_cpu_pte,
-		pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL)));
-
-	/*
-	 * Also, stash away a copy of the PAL address and the PTE
-	 * needed to map it.
-	 */
-	pal_vaddr = efi_get_pal_addr();
-	if (!pal_vaddr)
-		return;
-	__this_cpu_write(ia64_mca_pal_base,
-		GRANULEROUNDDOWN((unsigned long) pal_vaddr));
-	__this_cpu_write(ia64_mca_pal_pte, pte_val(mk_pte_phys(__pa(pal_vaddr),
-							      PAGE_KERNEL)));
-}
-
-static int ia64_mca_cpu_online(unsigned int cpu)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	if (!cmc_polling_enabled)
-		ia64_mca_cmc_vector_enable(NULL);
-	local_irq_restore(flags);
-	return 0;
-}
-
-/*
- * ia64_mca_init
- *
- *  Do all the system level mca specific initialization.
- *
- *	1. Register spinloop and wakeup request interrupt vectors
- *
- *	2. Register OS_MCA handler entry point
- *
- *	3. Register OS_INIT handler entry point
- *
- *  4. Initialize MCA/CMC/INIT related log buffers maintained by the OS.
- *
- *  Note that this initialization is done very early before some kernel
- *  services are available.
- *
- *  Inputs  :   None
- *
- *  Outputs :   None
- */
-void __init
-ia64_mca_init(void)
-{
-	ia64_fptr_t *init_hldlr_ptr_monarch = (ia64_fptr_t *)ia64_os_init_dispatch_monarch;
-	ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave;
-	ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch;
-	int i;
-	long rc;
-	struct ia64_sal_retval isrv;
-	unsigned long timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */
-	static struct notifier_block default_init_monarch_nb = {
-		.notifier_call = default_monarch_init_process,
-		.priority = 0/* we need to notified last */
-	};
-
-	IA64_MCA_DEBUG("%s: begin\n", __func__);
-
-	/* Clear the Rendez checkin flag for all cpus */
-	for(i = 0 ; i < NR_CPUS; i++)
-		ia64_mc_info.imi_rendez_checkin[i] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
-
-	/*
-	 * Register the rendezvous spinloop and wakeup mechanism with SAL
-	 */
-
-	/* Register the rendezvous interrupt vector with SAL */
-	while (1) {
-		isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT,
-					      SAL_MC_PARAM_MECHANISM_INT,
-					      IA64_MCA_RENDEZ_VECTOR,
-					      timeout,
-					      SAL_MC_PARAM_RZ_ALWAYS);
-		rc = isrv.status;
-		if (rc == 0)
-			break;
-		if (rc == -2) {
-			printk(KERN_INFO "Increasing MCA rendezvous timeout from "
-				"%ld to %ld milliseconds\n", timeout, isrv.v0);
-			timeout = isrv.v0;
-			NOTIFY_MCA(DIE_MCA_NEW_TIMEOUT, NULL, timeout, 0);
-			continue;
-		}
-		printk(KERN_ERR "Failed to register rendezvous interrupt "
-		       "with SAL (status %ld)\n", rc);
-		return;
-	}
-
-	/* Register the wakeup interrupt vector with SAL */
-	isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP,
-				      SAL_MC_PARAM_MECHANISM_INT,
-				      IA64_MCA_WAKEUP_VECTOR,
-				      0, 0);
-	rc = isrv.status;
-	if (rc) {
-		printk(KERN_ERR "Failed to register wakeup interrupt with SAL "
-		       "(status %ld)\n", rc);
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __func__);
-
-	ia64_mc_info.imi_mca_handler        = ia64_tpa(mca_hldlr_ptr->fp);
-	/*
-	 * XXX - disable SAL checksum by setting size to 0; should be
-	 *	ia64_tpa(ia64_os_mca_dispatch_end) - ia64_tpa(ia64_os_mca_dispatch);
-	 */
-	ia64_mc_info.imi_mca_handler_size	= 0;
-
-	/* Register the os mca handler with SAL */
-	if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_MCA,
-				       ia64_mc_info.imi_mca_handler,
-				       ia64_tpa(mca_hldlr_ptr->gp),
-				       ia64_mc_info.imi_mca_handler_size,
-				       0, 0, 0)))
-	{
-		printk(KERN_ERR "Failed to register OS MCA handler with SAL "
-		       "(status %ld)\n", rc);
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __func__,
-		       ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp));
-
-	/*
-	 * XXX - disable SAL checksum by setting size to 0, should be
-	 * size of the actual init handler in mca_asm.S.
-	 */
-	ia64_mc_info.imi_monarch_init_handler		= ia64_tpa(init_hldlr_ptr_monarch->fp);
-	ia64_mc_info.imi_monarch_init_handler_size	= 0;
-	ia64_mc_info.imi_slave_init_handler		= ia64_tpa(init_hldlr_ptr_slave->fp);
-	ia64_mc_info.imi_slave_init_handler_size	= 0;
-
-	IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __func__,
-		       ia64_mc_info.imi_monarch_init_handler);
-
-	/* Register the os init handler with SAL */
-	if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_INIT,
-				       ia64_mc_info.imi_monarch_init_handler,
-				       ia64_tpa(ia64_getreg(_IA64_REG_GP)),
-				       ia64_mc_info.imi_monarch_init_handler_size,
-				       ia64_mc_info.imi_slave_init_handler,
-				       ia64_tpa(ia64_getreg(_IA64_REG_GP)),
-				       ia64_mc_info.imi_slave_init_handler_size)))
-	{
-		printk(KERN_ERR "Failed to register m/s INIT handlers with SAL "
-		       "(status %ld)\n", rc);
-		return;
-	}
-	if (register_die_notifier(&default_init_monarch_nb)) {
-		printk(KERN_ERR "Failed to register default monarch INIT process\n");
-		return;
-	}
-
-	IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __func__);
-
-	/* Initialize the areas set aside by the OS to buffer the
-	 * platform/processor error states for MCA/INIT/CMC
-	 * handling.
-	 */
-	ia64_log_init(SAL_INFO_TYPE_MCA);
-	ia64_log_init(SAL_INFO_TYPE_INIT);
-	ia64_log_init(SAL_INFO_TYPE_CMC);
-	ia64_log_init(SAL_INFO_TYPE_CPE);
-
-	mca_init = 1;
-	printk(KERN_INFO "MCA related initialization done\n");
-}
-
-
-/*
- * These pieces cannot be done in ia64_mca_init() because it is called before
- * early_irq_init() which would wipe out our percpu irq registrations. But we
- * cannot leave them until ia64_mca_late_init() because by then all the other
- * processors have been brought online and have set their own CMC vectors to
- * point at a non-existant action. Called from arch_early_irq_init().
- */
-void __init ia64_mca_irq_init(void)
-{
-	/*
-	 *  Configure the CMCI/P vector and handler. Interrupts for CMC are
-	 *  per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c).
-	 */
-	register_percpu_irq(IA64_CMC_VECTOR, ia64_mca_cmc_int_handler, 0,
-			    "cmc_hndlr");
-	register_percpu_irq(IA64_CMCP_VECTOR, ia64_mca_cmc_int_caller, 0,
-			    "cmc_poll");
-	ia64_mca_cmc_vector_setup();       /* Setup vector on BSP */
-
-	/* Setup the MCA rendezvous interrupt vector */
-	register_percpu_irq(IA64_MCA_RENDEZ_VECTOR, ia64_mca_rendez_int_handler,
-			    0, "mca_rdzv");
-
-	/* Setup the MCA wakeup interrupt vector */
-	register_percpu_irq(IA64_MCA_WAKEUP_VECTOR, ia64_mca_wakeup_int_handler,
-			    0, "mca_wkup");
-
-	/* Setup the CPEI/P handler */
-	register_percpu_irq(IA64_CPEP_VECTOR, ia64_mca_cpe_int_caller, 0,
-			    "cpe_poll");
-}
-
-/*
- * ia64_mca_late_init
- *
- *	Opportunity to setup things that require initialization later
- *	than ia64_mca_init.  Setup a timer to poll for CPEs if the
- *	platform doesn't support an interrupt driven mechanism.
- *
- *  Inputs  :   None
- *  Outputs :   Status
- */
-static int __init
-ia64_mca_late_init(void)
-{
-	if (!mca_init)
-		return 0;
-
-	/* Setup the CMCI/P vector and handler */
-	timer_setup(&cmc_poll_timer, ia64_mca_cmc_poll, 0);
-
-	/* Unmask/enable the vector */
-	cmc_polling_enabled = 0;
-	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/mca:online",
-			  ia64_mca_cpu_online, NULL);
-	IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __func__);
-
-	/* Setup the CPEI/P vector and handler */
-	cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI);
-	timer_setup(&cpe_poll_timer, ia64_mca_cpe_poll, 0);
-
-	{
-		unsigned int irq;
-
-		if (cpe_vector >= 0) {
-			/* If platform supports CPEI, enable the irq. */
-			irq = local_vector_to_irq(cpe_vector);
-			if (irq > 0) {
-				cpe_poll_enabled = 0;
-				irq_set_status_flags(irq, IRQ_PER_CPU);
-				if (request_irq(irq, ia64_mca_cpe_int_handler,
-						0, "cpe_hndlr", NULL))
-					pr_err("Failed to register cpe_hndlr interrupt\n");
-				ia64_cpe_irq = irq;
-				ia64_mca_register_cpev(cpe_vector);
-				IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n",
-					__func__);
-				return 0;
-			}
-			printk(KERN_ERR "%s: Failed to find irq for CPE "
-					"interrupt handler, vector %d\n",
-					__func__, cpe_vector);
-		}
-		/* If platform doesn't support CPEI, get the timer going. */
-		if (cpe_poll_enabled) {
-			ia64_mca_cpe_poll(0UL);
-			IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __func__);
-		}
-	}
-
-	return 0;
-}
-
-device_initcall(ia64_mca_late_init);
diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S
deleted file mode 100644
index 0d6b8cf9d1d0..000000000000
--- a/arch/ia64/kernel/mca_asm.S
+++ /dev/null
@@ -1,1123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca_asm.S
- * Purpose:	assembly portion of the IA64 MCA handling
- *
- * Mods by cfleck to integrate into kernel build
- *
- * 2000-03-15 David Mosberger-Tang <davidm@hpl.hp.com>
- *		Added various stop bits to get a clean compile
- *
- * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com>
- *		Added code to save INIT handoff state in pt_regs format,
- *		switch to temp kstack, switch modes, jump to C INIT handler
- *
- * 2002-01-04 J.Hall <jenna.s.hall@intel.com>
- *		Before entering virtual mode code:
- *		 1. Check for TLB CPU error
- *		 2. Restore current thread pointer to kr6
- *		 3. Move stack ptr 16 bytes to conform to C calling convention
- *
- * 2004-11-12 Russ Anderson <rja@sgi.com>
- *		Added per cpu MCA/INIT stack save areas.
- *
- * 2005-12-08 Keith Owens <kaos@sgi.com>
- *		Use per cpu MCA/INIT stacks for all data.
- */
-#include <linux/threads.h>
-#include <linux/pgtable.h>
-
-#include <asm/asmmacro.h>
-#include <asm/processor.h>
-#include <asm/mca_asm.h>
-#include <asm/mca.h>
-
-#include "entry.h"
-
-#define GET_IA64_MCA_DATA(reg)						\
-	GET_THIS_PADDR(reg, ia64_mca_data)				\
-	;;								\
-	ld8 reg=[reg]
-
-	.global ia64_do_tlb_purge
-	.global ia64_os_mca_dispatch
-	.global ia64_os_init_on_kdump
-	.global ia64_os_init_dispatch_monarch
-	.global ia64_os_init_dispatch_slave
-
-	.text
-	.align 16
-
-//StartMain////////////////////////////////////////////////////////////////////
-
-/*
- * Just the TLB purge part is moved to a separate function
- * so we can re-use the code for cpu hotplug code as well
- * Caller should now setup b1, so we can branch once the
- * tlb flush is complete.
- */
-
-ia64_do_tlb_purge:
-#define O(member)	IA64_CPUINFO_##member##_OFFSET
-
-	GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2
-	;;
-	addl r17=O(PTCE_STRIDE),r2
-	addl r2=O(PTCE_BASE),r2
-	;;
-	ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));;	// r18=ptce_base
-	ld4 r19=[r2],4					// r19=ptce_count[0]
-	ld4 r21=[r17],4					// r21=ptce_stride[0]
-	;;
-	ld4 r20=[r2]					// r20=ptce_count[1]
-	ld4 r22=[r17]					// r22=ptce_stride[1]
-	mov r24=0
-	;;
-	adds r20=-1,r20
-	;;
-#undef O
-
-2:
-	cmp.ltu p6,p7=r24,r19
-(p7)	br.cond.dpnt.few 4f
-	mov ar.lc=r20
-3:
-	ptc.e r18
-	;;
-	add r18=r22,r18
-	br.cloop.sptk.few 3b
-	;;
-	add r18=r21,r18
-	add r24=1,r24
-	;;
-	br.sptk.few 2b
-4:
-	srlz.i 			// srlz.i implies srlz.d
-	;;
-
-        // Now purge addresses formerly mapped by TR registers
-	// 1. Purge ITR&DTR for kernel.
-	movl r16=KERNEL_START
-	mov r18=KERNEL_TR_PAGE_SHIFT<<2
-	;;
-	ptr.i r16, r18
-	ptr.d r16, r18
-	;;
-	srlz.i
-	;;
-	srlz.d
-	;;
-	// 3. Purge ITR for PAL code.
-	GET_THIS_PADDR(r2, ia64_mca_pal_base)
-	;;
-	ld8 r16=[r2]
-	mov r18=IA64_GRANULE_SHIFT<<2
-	;;
-	ptr.i r16,r18
-	;;
-	srlz.i
-	;;
-	// 4. Purge DTR for stack.
-	mov r16=IA64_KR(CURRENT_STACK)
-	;;
-	shl r16=r16,IA64_GRANULE_SHIFT
-	movl r19=PAGE_OFFSET
-	;;
-	add r16=r19,r16
-	mov r18=IA64_GRANULE_SHIFT<<2
-	;;
-	ptr.d r16,r18
-	;;
-	srlz.i
-	;;
-	// Now branch away to caller.
-	br.sptk.many b1
-	;;
-
-//EndMain//////////////////////////////////////////////////////////////////////
-
-//StartMain////////////////////////////////////////////////////////////////////
-
-ia64_os_mca_dispatch:
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	mov r19=1				// All MCA events are treated as monarch (for now)
-	br.sptk ia64_state_save			// save the state that is not in minstate
-1:
-
-	GET_IA64_MCA_DATA(r2)
-	// Using MCA stack, struct ia64_sal_os_state, variable proc_state_param
-	;;
-	add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+SOS(PROC_STATE_PARAM), r2
-	;;
-	ld8 r18=[r3]				// Get processor state parameter on existing PALE_CHECK.
-	;;
-	tbit.nz p6,p7=r18,60
-(p7)	br.spnt done_tlb_purge_and_reload
-
-	// The following code purges TC and TR entries. Then reload all TC entries.
-	// Purge percpu data TC entries.
-begin_tlb_purge_and_reload:
-	movl r18=ia64_reload_tr;;
-	LOAD_PHYSICAL(p0,r18,ia64_reload_tr);;
-	mov b1=r18;;
-	br.sptk.many ia64_do_tlb_purge;;
-
-ia64_reload_tr:
-	// Finally reload the TR registers.
-	// 1. Reload DTR/ITR registers for kernel.
-	mov r18=KERNEL_TR_PAGE_SHIFT<<2
-	movl r17=KERNEL_START
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r17
-        mov r16=IA64_TR_KERNEL
-	mov r19=ip
-	movl r18=PAGE_KERNEL
-	;;
-        dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT
-	;;
-	or r18=r17,r18
-	;;
-        itr.i itr[r16]=r18
-	;;
-        itr.d dtr[r16]=r18
-        ;;
-	srlz.i
-	srlz.d
-	;;
-	// 3. Reload ITR for PAL code.
-	GET_THIS_PADDR(r2, ia64_mca_pal_pte)
-	;;
-	ld8 r18=[r2]			// load PAL PTE
-	;;
-	GET_THIS_PADDR(r2, ia64_mca_pal_base)
-	;;
-	ld8 r16=[r2]			// load PAL vaddr
-	mov r19=IA64_GRANULE_SHIFT<<2
-	;;
-	mov cr.itir=r19
-	mov cr.ifa=r16
-	mov r20=IA64_TR_PALCODE
-	;;
-	itr.i itr[r20]=r18
-	;;
-	srlz.i
-	;;
-	// 4. Reload DTR for stack.
-	mov r16=IA64_KR(CURRENT_STACK)
-	;;
-	shl r16=r16,IA64_GRANULE_SHIFT
-	movl r19=PAGE_OFFSET
-	;;
-	add r18=r19,r16
-	movl r20=PAGE_KERNEL
-	;;
-	add r16=r20,r16
-	mov r19=IA64_GRANULE_SHIFT<<2
-	;;
-	mov cr.itir=r19
-	mov cr.ifa=r18
-	mov r20=IA64_TR_CURRENT_STACK
-	;;
-	itr.d dtr[r20]=r16
-	GET_THIS_PADDR(r2, ia64_mca_tr_reload)
-	mov r18 = 1
-	;;
-	srlz.d
-	;;
-	st8 [r2] =r18
-	;;
-
-done_tlb_purge_and_reload:
-
-	// switch to per cpu MCA stack
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_new_stack
-1:
-
-	// everything saved, now we can set the kernel registers
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_set_kernel_registers
-1:
-
-	// This must be done in physical mode
-	GET_IA64_MCA_DATA(r2)
-	;;
-	mov r7=r2
-
-        // Enter virtual mode from physical mode
-	VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4)
-
-	// This code returns to SAL via SOS r2, in general SAL has no unwind
-	// data.  To get a clean termination when backtracing the C MCA/INIT
-	// handler, set a dummy return address of 0 in this routine.  That
-	// requires that ia64_os_mca_virtual_begin be a global function.
-ENTRY(ia64_os_mca_virtual_begin)
-	.prologue
-	.save rp,r0
-	.body
-
-	mov ar.rsc=3				// set eager mode for C handler
-	mov r2=r7				// see GET_IA64_MCA_DATA above
-	;;
-
-	// Call virtual mode handler
-	alloc r14=ar.pfs,0,0,3,0
-	;;
-	DATA_PA_TO_VA(r2,r7)
-	;;
-	add out0=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2
-	add out1=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2
-	add out2=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET, r2
-	br.call.sptk.many    b0=ia64_mca_handler
-
-	// Revert back to physical mode before going back to SAL
-	PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4)
-ia64_os_mca_virtual_end:
-
-END(ia64_os_mca_virtual_begin)
-
-	// switch back to previous stack
-	alloc r14=ar.pfs,0,0,0,0		// remove the MCA handler frame
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_old_stack
-1:
-
-	mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET	// use the MCA stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_state_restore		// restore the SAL state
-1:
-
-	mov		b0=r12			// SAL_CHECK return address
-
-	br		b0
-
-//EndMain//////////////////////////////////////////////////////////////////////
-
-//StartMain////////////////////////////////////////////////////////////////////
-
-//
-// NOP init handler for kdump.  In panic situation, we may receive INIT
-// while kernel transition.  Since we initialize registers on leave from
-// current kernel, no longer monarch/slave handlers of current kernel in
-// virtual mode are called safely.
-// We can unregister these init handlers from SAL, however then the INIT
-// will result in warmboot by SAL and we cannot retrieve the crashdump.
-// Therefore register this NOP function to SAL, to prevent entering virtual
-// mode and resulting warmboot by SAL.
-//
-ia64_os_init_on_kdump:
-	mov		r8=r0		// IA64_INIT_RESUME
-	mov             r9=r10		// SAL_GP
-	mov		r22=r17		// *minstate
-	;;
-	mov		r10=r0		// return to same context
-	mov		b0=r12		// SAL_CHECK return address
-	br		b0
-
-//
-// SAL to OS entry point for INIT on all processors.  This has been defined for
-// registration purposes with SAL as a part of ia64_mca_init.  Monarch and
-// slave INIT have identical processing, except for the value of the
-// sos->monarch flag in r19.
-//
-
-ia64_os_init_dispatch_monarch:
-	mov r19=1				// Bow, bow, ye lower middle classes!
-	br.sptk ia64_os_init_dispatch
-
-ia64_os_init_dispatch_slave:
-	mov r19=0				// <igor>yeth, mathter</igor>
-
-ia64_os_init_dispatch:
-
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_state_save			// save the state that is not in minstate
-1:
-
-	// switch to per cpu INIT stack
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_new_stack
-1:
-
-	// everything saved, now we can set the kernel registers
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_set_kernel_registers
-1:
-
-	// This must be done in physical mode
-	GET_IA64_MCA_DATA(r2)
-	;;
-	mov r7=r2
-
-        // Enter virtual mode from physical mode
-	VIRTUAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_begin, r4)
-
-	// This code returns to SAL via SOS r2, in general SAL has no unwind
-	// data.  To get a clean termination when backtracing the C MCA/INIT
-	// handler, set a dummy return address of 0 in this routine.  That
-	// requires that ia64_os_init_virtual_begin be a global function.
-ENTRY(ia64_os_init_virtual_begin)
-	.prologue
-	.save rp,r0
-	.body
-
-	mov ar.rsc=3				// set eager mode for C handler
-	mov r2=r7				// see GET_IA64_MCA_DATA above
-	;;
-
-	// Call virtual mode handler
-	alloc r14=ar.pfs,0,0,3,0
-	;;
-	DATA_PA_TO_VA(r2,r7)
-	;;
-	add out0=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2
-	add out1=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2
-	add out2=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SOS_OFFSET, r2
-	br.call.sptk.many    b0=ia64_init_handler
-
-	// Revert back to physical mode before going back to SAL
-	PHYSICAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_end, r4)
-ia64_os_init_virtual_end:
-
-END(ia64_os_init_virtual_begin)
-
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_state_restore		// restore the SAL state
-1:
-
-	// switch back to previous stack
-	alloc r14=ar.pfs,0,0,0,0		// remove the INIT handler frame
-	mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET	// use the INIT stack
-	LOAD_PHYSICAL(p0,r2,1f)			// return address
-	br.sptk ia64_old_stack
-1:
-
-	mov		b0=r12			// SAL_CHECK return address
-	br		b0
-
-//EndMain//////////////////////////////////////////////////////////////////////
-
-// common defines for the stubs
-#define	ms		r4
-#define	regs		r5
-#define	temp1		r2	/* careful, it overlaps with input registers */
-#define	temp2		r3	/* careful, it overlaps with input registers */
-#define	temp3		r7
-#define	temp4		r14
-
-
-//++
-// Name:
-//	ia64_state_save()
-//
-// Stub Description:
-//
-//	Save the state that is not in minstate.  This is sensitive to the layout of
-//	struct ia64_sal_os_state in mca.h.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	The OS to SAL section of struct ia64_sal_os_state is set to a default
-//	value of cold boot (MCA) or warm boot (INIT) and return to the same
-//	context.  ia64_sal_os_state is also used to hold some registers that
-//	need to be saved and restored across the stack switches.
-//
-//	Most input registers to this stub come from PAL/SAL
-//	r1  os gp, physical
-//	r8  pal_proc entry point
-//	r9  sal_proc entry point
-//	r10 sal gp
-//	r11 MCA - rendevzous state, INIT - reason code
-//	r12 sal return address
-//	r17 pal min_state
-//	r18 processor state parameter
-//	r19 monarch flag, set by the caller of this routine
-//
-//	In addition to the SAL to OS state, this routine saves all the
-//	registers that appear in struct pt_regs and struct switch_stack,
-//	excluding those that are already in the PAL minstate area.  This
-//	results in a partial pt_regs and switch_stack, the C code copies the
-//	remaining registers from PAL minstate to pt_regs and switch_stack.  The
-//	resulting structures contain all the state of the original process when
-//	MCA/INIT occurred.
-//
-//--
-
-ia64_state_save:
-	add regs=MCA_SOS_OFFSET, r3
-	add ms=MCA_SOS_OFFSET+8, r3
-	mov b0=r2		// save return address
-	cmp.eq p1,p2=IA64_MCA_CPU_MCA_STACK_OFFSET, r3
-	;;
-	GET_IA64_MCA_DATA(temp2)
-	;;
-	add temp1=temp2, regs	// struct ia64_sal_os_state on MCA or INIT stack
-	add temp2=temp2, ms	// struct ia64_sal_os_state+8 on MCA or INIT stack
-	;;
-	mov regs=temp1		// save the start of sos
-	st8 [temp1]=r1,16	// os_gp
-	st8 [temp2]=r8,16	// pal_proc
-	;;
-	st8 [temp1]=r9,16	// sal_proc
-	st8 [temp2]=r11,16	// rv_rc
-	mov r11=cr.iipa
-	;;
-	st8 [temp1]=r18		// proc_state_param
-	st8 [temp2]=r19		// monarch
-	mov r6=IA64_KR(CURRENT)
-	add temp1=SOS(SAL_RA), regs
-	add temp2=SOS(SAL_GP), regs
-	;;
-	st8 [temp1]=r12,16	// sal_ra
-	st8 [temp2]=r10,16	// sal_gp
-	mov r12=cr.isr
-	;;
-	st8 [temp1]=r17,16	// pal_min_state
-	st8 [temp2]=r6,16	// prev_IA64_KR_CURRENT
-	mov r6=IA64_KR(CURRENT_STACK)
-	;;
-	st8 [temp1]=r6,16	// prev_IA64_KR_CURRENT_STACK
-	st8 [temp2]=r0,16	// prev_task, starts off as NULL
-	mov r6=cr.ifa
-	;;
-	st8 [temp1]=r12,16	// cr.isr
-	st8 [temp2]=r6,16	// cr.ifa
-	mov r12=cr.itir
-	;;
-	st8 [temp1]=r12,16	// cr.itir
-	st8 [temp2]=r11,16	// cr.iipa
-	mov r12=cr.iim
-	;;
-	st8 [temp1]=r12		// cr.iim
-(p1)	mov r12=IA64_MCA_COLD_BOOT
-(p2)	mov r12=IA64_INIT_WARM_BOOT
-	mov r6=cr.iha
-	add temp1=SOS(OS_STATUS), regs
-	;;
-	st8 [temp2]=r6		// cr.iha
-	add temp2=SOS(CONTEXT), regs
-	st8 [temp1]=r12		// os_status, default is cold boot
-	mov r6=IA64_MCA_SAME_CONTEXT
-	;;
-	st8 [temp2]=r6		// context, default is same context
-
-	// Save the pt_regs data that is not in minstate.  The previous code
-	// left regs at sos.
-	add regs=MCA_PT_REGS_OFFSET-MCA_SOS_OFFSET, regs
-	;;
-	add temp1=PT(B6), regs
-	mov temp3=b6
-	mov temp4=b7
-	add temp2=PT(B7), regs
-	;;
-	st8 [temp1]=temp3,PT(AR_CSD)-PT(B6)		// save b6
-	st8 [temp2]=temp4,PT(AR_SSD)-PT(B7)		// save b7
-	mov temp3=ar.csd
-	mov temp4=ar.ssd
-	cover						// must be last in group
-	;;
-	st8 [temp1]=temp3,PT(AR_UNAT)-PT(AR_CSD)	// save ar.csd
-	st8 [temp2]=temp4,PT(AR_PFS)-PT(AR_SSD)		// save ar.ssd
-	mov temp3=ar.unat
-	mov temp4=ar.pfs
-	;;
-	st8 [temp1]=temp3,PT(AR_RNAT)-PT(AR_UNAT)	// save ar.unat
-	st8 [temp2]=temp4,PT(AR_BSPSTORE)-PT(AR_PFS)	// save ar.pfs
-	mov temp3=ar.rnat
-	mov temp4=ar.bspstore
-	;;
-	st8 [temp1]=temp3,PT(LOADRS)-PT(AR_RNAT)	// save ar.rnat
-	st8 [temp2]=temp4,PT(AR_FPSR)-PT(AR_BSPSTORE)	// save ar.bspstore
-	mov temp3=ar.bsp
-	;;
-	sub temp3=temp3, temp4	// ar.bsp - ar.bspstore
-	mov temp4=ar.fpsr
-	;;
-	shl temp3=temp3,16	// compute ar.rsc to be used for "loadrs"
-	;;
-	st8 [temp1]=temp3,PT(AR_CCV)-PT(LOADRS)		// save loadrs
-	st8 [temp2]=temp4,PT(F6)-PT(AR_FPSR)		// save ar.fpsr
-	mov temp3=ar.ccv
-	;;
-	st8 [temp1]=temp3,PT(F7)-PT(AR_CCV)		// save ar.ccv
-	stf.spill [temp2]=f6,PT(F8)-PT(F6)
-	;;
-	stf.spill [temp1]=f7,PT(F9)-PT(F7)
-	stf.spill [temp2]=f8,PT(F10)-PT(F8)
-	;;
-	stf.spill [temp1]=f9,PT(F11)-PT(F9)
-	stf.spill [temp2]=f10
-	;;
-	stf.spill [temp1]=f11
-
-	// Save the switch_stack data that is not in minstate nor pt_regs.  The
-	// previous code left regs at pt_regs.
-	add regs=MCA_SWITCH_STACK_OFFSET-MCA_PT_REGS_OFFSET, regs
-	;;
-	add temp1=SW(F2), regs
-	add temp2=SW(F3), regs
-	;;
-	stf.spill [temp1]=f2,32
-	stf.spill [temp2]=f3,32
-	;;
-	stf.spill [temp1]=f4,32
-	stf.spill [temp2]=f5,32
-	;;
-	stf.spill [temp1]=f12,32
-	stf.spill [temp2]=f13,32
-	;;
-	stf.spill [temp1]=f14,32
-	stf.spill [temp2]=f15,32
-	;;
-	stf.spill [temp1]=f16,32
-	stf.spill [temp2]=f17,32
-	;;
-	stf.spill [temp1]=f18,32
-	stf.spill [temp2]=f19,32
-	;;
-	stf.spill [temp1]=f20,32
-	stf.spill [temp2]=f21,32
-	;;
-	stf.spill [temp1]=f22,32
-	stf.spill [temp2]=f23,32
-	;;
-	stf.spill [temp1]=f24,32
-	stf.spill [temp2]=f25,32
-	;;
-	stf.spill [temp1]=f26,32
-	stf.spill [temp2]=f27,32
-	;;
-	stf.spill [temp1]=f28,32
-	stf.spill [temp2]=f29,32
-	;;
-	stf.spill [temp1]=f30,SW(B2)-SW(F30)
-	stf.spill [temp2]=f31,SW(B3)-SW(F31)
-	mov temp3=b2
-	mov temp4=b3
-	;;
-	st8 [temp1]=temp3,16	// save b2
-	st8 [temp2]=temp4,16	// save b3
-	mov temp3=b4
-	mov temp4=b5
-	;;
-	st8 [temp1]=temp3,SW(AR_LC)-SW(B4)	// save b4
-	st8 [temp2]=temp4	// save b5
-	mov temp3=ar.lc
-	;;
-	st8 [temp1]=temp3	// save ar.lc
-
-	// FIXME: Some proms are incorrectly accessing the minstate area as
-	// cached data.  The C code uses region 6, uncached virtual.  Ensure
-	// that there is no cache data lying around for the first 1K of the
-	// minstate area.
-	// Remove this code in September 2006, that gives platforms a year to
-	// fix their proms and get their customers updated.
-
-	add r1=32*1,r17
-	add r2=32*2,r17
-	add r3=32*3,r17
-	add r4=32*4,r17
-	add r5=32*5,r17
-	add r6=32*6,r17
-	add r7=32*7,r17
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-	add r17=32*8,r17
-	add r1=32*8,r1
-	add r2=32*8,r2
-	add r3=32*8,r3
-	add r4=32*8,r4
-	add r5=32*8,r5
-	add r6=32*8,r6
-	add r7=32*8,r7
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-	add r17=32*8,r17
-	add r1=32*8,r1
-	add r2=32*8,r2
-	add r3=32*8,r3
-	add r4=32*8,r4
-	add r5=32*8,r5
-	add r6=32*8,r6
-	add r7=32*8,r7
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-	add r17=32*8,r17
-	add r1=32*8,r1
-	add r2=32*8,r2
-	add r3=32*8,r3
-	add r4=32*8,r4
-	add r5=32*8,r5
-	add r6=32*8,r6
-	add r7=32*8,r7
-	;;
-	fc r17
-	fc r1
-	fc r2
-	fc r3
-	fc r4
-	fc r5
-	fc r6
-	fc r7
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_state_restore()
-//
-// Stub Description:
-//
-//	Restore the SAL/OS state.  This is sensitive to the layout of struct
-//	ia64_sal_os_state in mca.h.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	In addition to the SAL to OS state, this routine restores all the
-//	registers that appear in struct pt_regs and struct switch_stack,
-//	excluding those in the PAL minstate area.
-//
-//--
-
-ia64_state_restore:
-	// Restore the switch_stack data that is not in minstate nor pt_regs.
-	add regs=MCA_SWITCH_STACK_OFFSET, r3
-	mov b0=r2		// save return address
-	;;
-	GET_IA64_MCA_DATA(temp2)
-	;;
-	add regs=temp2, regs
-	;;
-	add temp1=SW(F2), regs
-	add temp2=SW(F3), regs
-	;;
-	ldf.fill f2=[temp1],32
-	ldf.fill f3=[temp2],32
-	;;
-	ldf.fill f4=[temp1],32
-	ldf.fill f5=[temp2],32
-	;;
-	ldf.fill f12=[temp1],32
-	ldf.fill f13=[temp2],32
-	;;
-	ldf.fill f14=[temp1],32
-	ldf.fill f15=[temp2],32
-	;;
-	ldf.fill f16=[temp1],32
-	ldf.fill f17=[temp2],32
-	;;
-	ldf.fill f18=[temp1],32
-	ldf.fill f19=[temp2],32
-	;;
-	ldf.fill f20=[temp1],32
-	ldf.fill f21=[temp2],32
-	;;
-	ldf.fill f22=[temp1],32
-	ldf.fill f23=[temp2],32
-	;;
-	ldf.fill f24=[temp1],32
-	ldf.fill f25=[temp2],32
-	;;
-	ldf.fill f26=[temp1],32
-	ldf.fill f27=[temp2],32
-	;;
-	ldf.fill f28=[temp1],32
-	ldf.fill f29=[temp2],32
-	;;
-	ldf.fill f30=[temp1],SW(B2)-SW(F30)
-	ldf.fill f31=[temp2],SW(B3)-SW(F31)
-	;;
-	ld8 temp3=[temp1],16	// restore b2
-	ld8 temp4=[temp2],16	// restore b3
-	;;
-	mov b2=temp3
-	mov b3=temp4
-	ld8 temp3=[temp1],SW(AR_LC)-SW(B4)	// restore b4
-	ld8 temp4=[temp2]	// restore b5
-	;;
-	mov b4=temp3
-	mov b5=temp4
-	ld8 temp3=[temp1]	// restore ar.lc
-	;;
-	mov ar.lc=temp3
-
-	// Restore the pt_regs data that is not in minstate.  The previous code
-	// left regs at switch_stack.
-	add regs=MCA_PT_REGS_OFFSET-MCA_SWITCH_STACK_OFFSET, regs
-	;;
-	add temp1=PT(B6), regs
-	add temp2=PT(B7), regs
-	;;
-	ld8 temp3=[temp1],PT(AR_CSD)-PT(B6)		// restore b6
-	ld8 temp4=[temp2],PT(AR_SSD)-PT(B7)		// restore b7
-	;;
-	mov b6=temp3
-	mov b7=temp4
-	ld8 temp3=[temp1],PT(AR_UNAT)-PT(AR_CSD)	// restore ar.csd
-	ld8 temp4=[temp2],PT(AR_PFS)-PT(AR_SSD)		// restore ar.ssd
-	;;
-	mov ar.csd=temp3
-	mov ar.ssd=temp4
-	ld8 temp3=[temp1]				// restore ar.unat
-	add temp1=PT(AR_CCV)-PT(AR_UNAT), temp1
-	ld8 temp4=[temp2],PT(AR_FPSR)-PT(AR_PFS)	// restore ar.pfs
-	;;
-	mov ar.unat=temp3
-	mov ar.pfs=temp4
-	// ar.rnat, ar.bspstore, loadrs are restore in ia64_old_stack.
-	ld8 temp3=[temp1],PT(F6)-PT(AR_CCV)		// restore ar.ccv
-	ld8 temp4=[temp2],PT(F7)-PT(AR_FPSR)		// restore ar.fpsr
-	;;
-	mov ar.ccv=temp3
-	mov ar.fpsr=temp4
-	ldf.fill f6=[temp1],PT(F8)-PT(F6)
-	ldf.fill f7=[temp2],PT(F9)-PT(F7)
-	;;
-	ldf.fill f8=[temp1],PT(F10)-PT(F8)
-	ldf.fill f9=[temp2],PT(F11)-PT(F9)
-	;;
-	ldf.fill f10=[temp1]
-	ldf.fill f11=[temp2]
-
-	// Restore the SAL to OS state. The previous code left regs at pt_regs.
-	add regs=MCA_SOS_OFFSET-MCA_PT_REGS_OFFSET, regs
-	;;
-	add temp1=SOS(SAL_RA), regs
-	add temp2=SOS(SAL_GP), regs
-	;;
-	ld8 r12=[temp1],16	// sal_ra
-	ld8 r9=[temp2],16	// sal_gp
-	;;
-	ld8 r22=[temp1],16	// pal_min_state, virtual
-	ld8 r13=[temp2],16	// prev_IA64_KR_CURRENT
-	;;
-	ld8 r16=[temp1],16	// prev_IA64_KR_CURRENT_STACK
-	ld8 r20=[temp2],16	// prev_task
-	;;
-	ld8 temp3=[temp1],16	// cr.isr
-	ld8 temp4=[temp2],16	// cr.ifa
-	;;
-	mov cr.isr=temp3
-	mov cr.ifa=temp4
-	ld8 temp3=[temp1],16	// cr.itir
-	ld8 temp4=[temp2],16	// cr.iipa
-	;;
-	mov cr.itir=temp3
-	mov cr.iipa=temp4
-	ld8 temp3=[temp1]	// cr.iim
-	ld8 temp4=[temp2]		// cr.iha
-	add temp1=SOS(OS_STATUS), regs
-	add temp2=SOS(CONTEXT), regs
-	;;
-	mov cr.iim=temp3
-	mov cr.iha=temp4
-	dep r22=0,r22,62,1	// pal_min_state, physical, uncached
-	mov IA64_KR(CURRENT)=r13
-	ld8 r8=[temp1]		// os_status
-	ld8 r10=[temp2]		// context
-
-	/* Wire IA64_TR_CURRENT_STACK to the stack that we are resuming to.  To
-	 * avoid any dependencies on the algorithm in ia64_switch_to(), just
-	 * purge any existing CURRENT_STACK mapping and insert the new one.
-	 *
-	 * r16 contains prev_IA64_KR_CURRENT_STACK, r13 contains
-	 * prev_IA64_KR_CURRENT, these values may have been changed by the C
-	 * code.  Do not use r8, r9, r10, r22, they contain values ready for
-	 * the return to SAL.
-	 */
-
-	mov r15=IA64_KR(CURRENT_STACK)		// physical granule mapped by IA64_TR_CURRENT_STACK
-	;;
-	shl r15=r15,IA64_GRANULE_SHIFT
-	;;
-	dep r15=-1,r15,61,3			// virtual granule
-	mov r18=IA64_GRANULE_SHIFT<<2		// for cr.itir.ps
-	;;
-	ptr.d r15,r18
-	;;
-	srlz.d
-
-	extr.u r19=r13,61,3			// r13 = prev_IA64_KR_CURRENT
-	shl r20=r16,IA64_GRANULE_SHIFT		// r16 = prev_IA64_KR_CURRENT_STACK
-	movl r21=PAGE_KERNEL			// page properties
-	;;
-	mov IA64_KR(CURRENT_STACK)=r16
-	cmp.ne p6,p0=RGN_KERNEL,r19		// new stack is in the kernel region?
-	or r21=r20,r21				// construct PA | page properties
-(p6)	br.spnt 1f				// the dreaded cpu 0 idle task in region 5:(
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r13
-	mov r20=IA64_TR_CURRENT_STACK
-	;;
-	itr.d dtr[r20]=r21
-	;;
-	srlz.d
-1:
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_new_stack()
-//
-// Stub Description:
-//
-//	Switch to the MCA/INIT stack.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	On entry RBS is still on the original stack, this routine switches RBS
-//	to use the MCA/INIT stack.
-//
-//	On entry, sos->pal_min_state is physical, on exit it is virtual.
-//
-//--
-
-ia64_new_stack:
-	add regs=MCA_PT_REGS_OFFSET, r3
-	add temp2=MCA_SOS_OFFSET+SOS(PAL_MIN_STATE), r3
-	mov b0=r2			// save return address
-	GET_IA64_MCA_DATA(temp1)
-	invala
-	;;
-	add temp2=temp2, temp1		// struct ia64_sal_os_state.pal_min_state on MCA or INIT stack
-	add regs=regs, temp1		// struct pt_regs on MCA or INIT stack
-	;;
-	// Address of minstate area provided by PAL is physical, uncacheable.
-	// Convert to Linux virtual address in region 6 for C code.
-	ld8 ms=[temp2]			// pal_min_state, physical
-	;;
-	dep temp1=-1,ms,62,2		// set region 6
-	mov temp3=IA64_RBS_OFFSET-MCA_PT_REGS_OFFSET
-	;;
-	st8 [temp2]=temp1		// pal_min_state, virtual
-
-	add temp4=temp3, regs		// start of bspstore on new stack
-	;;
-	mov ar.bspstore=temp4		// switch RBS to MCA/INIT stack
-	;;
-	flushrs				// must be first in group
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_old_stack()
-//
-// Stub Description:
-//
-//	Switch to the old stack.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//	On entry, pal_min_state is virtual, on exit it is physical.
-//
-//	On entry RBS is on the MCA/INIT stack, this routine switches RBS
-//	back to the previous stack.
-//
-//	The psr is set to all zeroes.  SAL return requires either all zeroes or
-//	just psr.mc set.  Leaving psr.mc off allows INIT to be issued if this
-//	code does not perform correctly.
-//
-//	The dirty registers at the time of the event were flushed to the
-//	MCA/INIT stack in ia64_pt_regs_save().  Restore the dirty registers
-//	before reverting to the previous bspstore.
-//--
-
-ia64_old_stack:
-	add regs=MCA_PT_REGS_OFFSET, r3
-	mov b0=r2			// save return address
-	GET_IA64_MCA_DATA(temp2)
-	LOAD_PHYSICAL(p0,temp1,1f)
-	;;
-	mov cr.ipsr=r0
-	mov cr.ifs=r0
-	mov cr.iip=temp1
-	;;
-	invala
-	rfi
-1:
-
-	add regs=regs, temp2		// struct pt_regs on MCA or INIT stack
-	;;
-	add temp1=PT(LOADRS), regs
-	;;
-	ld8 temp2=[temp1],PT(AR_BSPSTORE)-PT(LOADRS)	// restore loadrs
-	;;
-	ld8 temp3=[temp1],PT(AR_RNAT)-PT(AR_BSPSTORE)	// restore ar.bspstore
-	mov ar.rsc=temp2
-	;;
-	loadrs
-	ld8 temp4=[temp1]		// restore ar.rnat
-	;;
-	mov ar.bspstore=temp3		// back to old stack
-	;;
-	mov ar.rnat=temp4
-	;;
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-
-//++
-// Name:
-//	ia64_set_kernel_registers()
-//
-// Stub Description:
-//
-//	Set the registers that are required by the C code in order to run on an
-//	MCA/INIT stack.
-//
-//	r2 contains the return address, r3 contains either
-//	IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET.
-//
-//--
-
-ia64_set_kernel_registers:
-	add temp3=MCA_SP_OFFSET, r3
-	mov b0=r2		// save return address
-	GET_IA64_MCA_DATA(temp1)
-	;;
-	add r12=temp1, temp3	// kernel stack pointer on MCA/INIT stack
-	add r13=temp1, r3	// set current to start of MCA/INIT stack
-	add r20=temp1, r3	// physical start of MCA/INIT stack
-	;;
-	DATA_PA_TO_VA(r12,temp2)
-	DATA_PA_TO_VA(r13,temp3)
-	;;
-	mov IA64_KR(CURRENT)=r13
-
-	/* Wire IA64_TR_CURRENT_STACK to the MCA/INIT handler stack.  To avoid
-	 * any dependencies on the algorithm in ia64_switch_to(), just purge
-	 * any existing CURRENT_STACK mapping and insert the new one.
-	 */
-
-	mov r16=IA64_KR(CURRENT_STACK)		// physical granule mapped by IA64_TR_CURRENT_STACK
-	;;
-	shl r16=r16,IA64_GRANULE_SHIFT
-	;;
-	dep r16=-1,r16,61,3			// virtual granule
-	mov r18=IA64_GRANULE_SHIFT<<2		// for cr.itir.ps
-	;;
-	ptr.d r16,r18
-	;;
-	srlz.d
-
-	shr.u r16=r20,IA64_GRANULE_SHIFT	// r20 = physical start of MCA/INIT stack
-	movl r21=PAGE_KERNEL			// page properties
-	;;
-	mov IA64_KR(CURRENT_STACK)=r16
-	or r21=r20,r21				// construct PA | page properties
-	;;
-	mov cr.itir=r18
-	mov cr.ifa=r13
-	mov r20=IA64_TR_CURRENT_STACK
-
-	movl r17=FPSR_DEFAULT
-	;;
-	mov.m ar.fpsr=r17			// set ar.fpsr to kernel default value
-	;;
-	itr.d dtr[r20]=r21
-	;;
-	srlz.d
-
-	br.sptk b0
-
-//EndStub//////////////////////////////////////////////////////////////////////
-
-#undef	ms
-#undef	regs
-#undef	temp1
-#undef	temp2
-#undef	temp3
-#undef	temp4
-
-
-// Support function for mca.c, it is here to avoid using inline asm.  Given the
-// address of an rnat slot, if that address is below the current ar.bspstore
-// then return the contents of that slot, otherwise return the contents of
-// ar.rnat.
-GLOBAL_ENTRY(ia64_get_rnat)
-	alloc r14=ar.pfs,1,0,0,0
-	mov ar.rsc=0
-	;;
-	mov r14=ar.bspstore
-	;;
-	cmp.lt p6,p7=in0,r14
-	;;
-(p6)	ld8 r8=[in0]
-(p7)	mov r8=ar.rnat
-	mov ar.rsc=3
-	br.ret.sptk.many rp
-END(ia64_get_rnat)
-
-
-// void ia64_set_psr_mc(void)
-//
-// Set psr.mc bit to mask MCA/INIT.
-GLOBAL_ENTRY(ia64_set_psr_mc)
-	rsm psr.i | psr.ic		// disable interrupts
-	;;
-	srlz.d
-	;;
-	mov r14 = psr			// get psr{36:35,31:0}
-	movl r15 = 1f
-	;;
-	dep r14 = -1, r14, PSR_MC, 1	// set psr.mc
-	;;
-	dep r14 = -1, r14, PSR_IC, 1	// set psr.ic
-	;;
-	dep r14 = -1, r14, PSR_BN, 1	// keep bank1 in use
-	;;
-	mov cr.ipsr = r14
-	mov cr.ifs = r0
-	mov cr.iip = r15
-	;;
-	rfi
-1:
-	br.ret.sptk.many rp
-END(ia64_set_psr_mc)
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
deleted file mode 100644
index 23c203639a96..000000000000
--- a/arch/ia64/kernel/mca_drv.c
+++ /dev/null
@@ -1,796 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * File:	mca_drv.c
- * Purpose:	Generic MCA handling layer
- *
- * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- * Copyright (C) 2005 Silicon Graphics, Inc
- * Copyright (C) 2005 Keith Owens <kaos@sgi.com>
- * Copyright (C) 2006 Russ Anderson <rja@sgi.com>
- */
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kallsyms.h>
-#include <linux/memblock.h>
-#include <linux/acpi.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/workqueue.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-
-#include <asm/delay.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/mca.h>
-
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-
-#include "mca_drv.h"
-
-/* max size of SAL error record (default) */
-static int sal_rec_max = 10000;
-
-/* from mca_drv_asm.S */
-extern void *mca_handler_bhhook(void);
-
-static DEFINE_SPINLOCK(mca_bh_lock);
-
-typedef enum {
-	MCA_IS_LOCAL  = 0,
-	MCA_IS_GLOBAL = 1
-} mca_type_t;
-
-#define MAX_PAGE_ISOLATE 1024
-
-static struct page *page_isolate[MAX_PAGE_ISOLATE];
-static int num_page_isolate = 0;
-
-typedef enum {
-	ISOLATE_NG,
-	ISOLATE_OK,
-	ISOLATE_NONE
-} isolate_status_t;
-
-typedef enum {
-	MCA_NOT_RECOVERED = 0,
-	MCA_RECOVERED	  = 1
-} recovery_status_t;
-
-/*
- *  This pool keeps pointers to the section part of SAL error record
- */
-static struct {
-	slidx_list_t *buffer; /* section pointer list pool */
-	int	     cur_idx; /* Current index of section pointer list pool */
-	int	     max_idx; /* Maximum index of section pointer list pool */
-} slidx_pool;
-
-static int
-fatal_mca(const char *fmt, ...)
-{
-	va_list args;
-	char buf[256];
-
-	va_start(args, fmt);
-	vsnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-	ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf);
-
-	return MCA_NOT_RECOVERED;
-}
-
-static int
-mca_recovered(const char *fmt, ...)
-{
-	va_list args;
-	char buf[256];
-
-	va_start(args, fmt);
-	vsnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
-	ia64_mca_printk(KERN_INFO "MCA: %s\n", buf);
-
-	return MCA_RECOVERED;
-}
-
-/**
- * mca_page_isolate - isolate a poisoned page in order not to use it later
- * @paddr:	poisoned memory location
- *
- * Return value:
- *	one of isolate_status_t, ISOLATE_OK/NG/NONE.
- */
-
-static isolate_status_t
-mca_page_isolate(unsigned long paddr)
-{
-	int i;
-	struct page *p;
-
-	/* whether physical address is valid or not */
-	if (!ia64_phys_addr_valid(paddr))
-		return ISOLATE_NONE;
-
-	if (!pfn_valid(paddr >> PAGE_SHIFT))
-		return ISOLATE_NONE;
-
-	/* convert physical address to physical page number */
-	p = pfn_to_page(paddr>>PAGE_SHIFT);
-
-	/* check whether a page number have been already registered or not */
-	for (i = 0; i < num_page_isolate; i++)
-		if (page_isolate[i] == p)
-			return ISOLATE_OK; /* already listed */
-
-	/* limitation check */
-	if (num_page_isolate == MAX_PAGE_ISOLATE)
-		return ISOLATE_NG;
-
-	/* kick pages having attribute 'SLAB' or 'Reserved' */
-	if (PageSlab(p) || PageReserved(p))
-		return ISOLATE_NG;
-
-	/* add attribute 'Reserved' and register the page */
-	get_page(p);
-	SetPageReserved(p);
-	page_isolate[num_page_isolate++] = p;
-
-	return ISOLATE_OK;
-}
-
-/**
- * mca_hanlder_bh - Kill the process which occurred memory read error
- * @paddr:	poisoned address received from MCA Handler
- */
-
-void
-mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
-{
-	ia64_mlogbuf_dump();
-	printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
-		"iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
-	       raw_smp_processor_id(), current->pid,
-		from_kuid(&init_user_ns, current_uid()),
-		iip, ipsr, paddr, current->comm);
-
-	spin_lock(&mca_bh_lock);
-	switch (mca_page_isolate(paddr)) {
-	case ISOLATE_OK:
-		printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr);
-		break;
-	case ISOLATE_NG:
-		printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr);
-		break;
-	default:
-		break;
-	}
-	spin_unlock(&mca_bh_lock);
-
-	/* This process is about to be killed itself */
-	make_task_dead(SIGKILL);
-}
-
-/**
- * mca_make_peidx - Make index of processor error section
- * @slpi:	pointer to record of processor error section
- * @peidx:	pointer to index of processor error section
- */
-
-static void
-mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx)
-{
-	/*
-	 * calculate the start address of
-	 *   "struct cpuid_info" and "sal_processor_static_info_t".
-	 */
-	u64 total_check_num = slpi->valid.num_cache_check
-				+ slpi->valid.num_tlb_check
-				+ slpi->valid.num_bus_check
-				+ slpi->valid.num_reg_file_check
-				+ slpi->valid.num_ms_check;
-	u64 head_size =	sizeof(sal_log_mod_error_info_t) * total_check_num
-			+ sizeof(sal_log_processor_info_t);
-	u64 mid_size  = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info);
-
-	peidx_head(peidx)   = slpi;
-	peidx_mid(peidx)    = (struct sal_cpuid_info *)
-		(slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL);
-	peidx_bottom(peidx) = (sal_processor_static_info_t *)
-		(slpi->valid.psi_static_struct ?
-			((char*)slpi + head_size + mid_size) : NULL);
-}
-
-/**
- * mca_make_slidx -  Make index of SAL error record
- * @buffer:	pointer to SAL error record
- * @slidx:	pointer to index of SAL error record
- *
- * Return value:
- *	1 if record has platform error / 0 if not
- */
-#define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \
-	{slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \
-	hl->hdr = ptr; \
-	list_add(&hl->list, &(sect)); \
-	slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; }
-
-static int
-mca_make_slidx(void *buffer, slidx_table_t *slidx)
-{
-	int platform_err = 0;
-	int record_len = ((sal_log_record_header_t*)buffer)->len;
-	u32 ercd_pos;
-	int sects;
-	sal_log_section_hdr_t *sp;
-
-	/*
-	 * Initialize index referring current record
-	 */
-	INIT_LIST_HEAD(&(slidx->proc_err));
-	INIT_LIST_HEAD(&(slidx->mem_dev_err));
-	INIT_LIST_HEAD(&(slidx->sel_dev_err));
-	INIT_LIST_HEAD(&(slidx->pci_bus_err));
-	INIT_LIST_HEAD(&(slidx->smbios_dev_err));
-	INIT_LIST_HEAD(&(slidx->pci_comp_err));
-	INIT_LIST_HEAD(&(slidx->plat_specific_err));
-	INIT_LIST_HEAD(&(slidx->host_ctlr_err));
-	INIT_LIST_HEAD(&(slidx->plat_bus_err));
-	INIT_LIST_HEAD(&(slidx->unsupported));
-
-	/*
-	 * Extract a Record Header
-	 */
-	slidx->header = buffer;
-
-	/*
-	 * Extract each section records
-	 * (arranged from "int ia64_log_platform_info_print()")
-	 */
-	for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0;
-		ercd_pos < record_len; ercd_pos += sp->len, sects++) {
-		sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos);
-		if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) {
-			LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp);
-		} else if (!efi_guidcmp(sp->guid,
-				SAL_PLAT_BUS_ERR_SECT_GUID)) {
-			platform_err = 1;
-			LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp);
-		} else {
-			LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp);
-		}
-	}
-	slidx->n_sections = sects;
-
-	return platform_err;
-}
-
-/**
- * init_record_index_pools - Initialize pool of lists for SAL record index
- *
- * Return value:
- *	0 on Success / -ENOMEM on Failure
- */
-static int
-init_record_index_pools(void)
-{
-	int i;
-	int rec_max_size;  /* Maximum size of SAL error records */
-	int sect_min_size; /* Minimum size of SAL error sections */
-	/* minimum size table of each section */
-	static int sal_log_sect_min_sizes[] = {
-		sizeof(sal_log_processor_info_t)
-		+ sizeof(sal_processor_static_info_t),
-		sizeof(sal_log_mem_dev_err_info_t),
-		sizeof(sal_log_sel_dev_err_info_t),
-		sizeof(sal_log_pci_bus_err_info_t),
-		sizeof(sal_log_smbios_dev_err_info_t),
-		sizeof(sal_log_pci_comp_err_info_t),
-		sizeof(sal_log_plat_specific_err_info_t),
-		sizeof(sal_log_host_ctlr_err_info_t),
-		sizeof(sal_log_plat_bus_err_info_t),
-	};
-
-	/*
-	 * MCA handler cannot allocate new memory on flight,
-	 * so we preallocate enough memory to handle a SAL record.
-	 *
-	 * Initialize a handling set of slidx_pool:
-	 *   1. Pick up the max size of SAL error records
-	 *   2. Pick up the min size of SAL error sections
-	 *   3. Allocate the pool as enough to 2 SAL records
-	 *     (now we can estimate the maxinum of section in a record.)
-	 */
-
-	/* - 1 - */
-	rec_max_size = sal_rec_max;
-
-	/* - 2 - */
-	sect_min_size = sal_log_sect_min_sizes[0];
-	for (i = 1; i < ARRAY_SIZE(sal_log_sect_min_sizes); i++)
-		if (sect_min_size > sal_log_sect_min_sizes[i])
-			sect_min_size = sal_log_sect_min_sizes[i];
-
-	/* - 3 - */
-	slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1;
-	slidx_pool.buffer =
-		kmalloc_array(slidx_pool.max_idx, sizeof(slidx_list_t),
-			      GFP_KERNEL);
-
-	return slidx_pool.buffer ? 0 : -ENOMEM;
-}
-
-
-/*****************************************************************************
- * Recovery functions                                                        *
- *****************************************************************************/
-
-/**
- * is_mca_global - Check whether this MCA is global or not
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer to pal_bus_check_info_t
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	MCA_IS_LOCAL / MCA_IS_GLOBAL
- */
-
-static mca_type_t
-is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci,
-	      struct ia64_sal_os_state *sos)
-{
-	pal_processor_state_info_t *psp =
-		(pal_processor_state_info_t*)peidx_psp(peidx);
-
-	/*
-	 * PAL can request a rendezvous, if the MCA has a global scope.
-	 * If "rz_always" flag is set, SAL requests MCA rendezvous
-	 * in spite of global MCA.
-	 * Therefore it is local MCA when rendezvous has not been requested.
-	 * Failed to rendezvous, the system must be down.
-	 */
-	switch (sos->rv_rc) {
-		case -1: /* SAL rendezvous unsuccessful */
-			return MCA_IS_GLOBAL;
-		case  0: /* SAL rendezvous not required */
-			return MCA_IS_LOCAL;
-		case  1: /* SAL rendezvous successful int */
-		case  2: /* SAL rendezvous successful int with init */
-		default:
-			break;
-	}
-
-	/*
-	 * If One or more Cache/TLB/Reg_File/Uarch_Check is here,
-	 * it would be a local MCA. (i.e. processor internal error)
-	 */
-	if (psp->tc || psp->cc || psp->rc || psp->uc)
-		return MCA_IS_LOCAL;
-	
-	/*
-	 * Bus_Check structure with Bus_Check.ib (internal bus error) flag set
-	 * would be a global MCA. (e.g. a system bus address parity error)
-	 */
-	if (!pbci || pbci->ib)
-		return MCA_IS_GLOBAL;
-
-	/*
-	 * Bus_Check structure with Bus_Check.eb (external bus error) flag set
-	 * could be either a local MCA or a global MCA.
-	 *
-	 * Referring Bus_Check.bsi:
-	 *   0: Unknown/unclassified
-	 *   1: BERR#
-	 *   2: BINIT#
-	 *   3: Hard Fail
-	 * (FIXME: Are these SGI specific or generic bsi values?)
-	 */
-	if (pbci->eb)
-		switch (pbci->bsi) {
-			case 0:
-				/* e.g. a load from poisoned memory */
-				return MCA_IS_LOCAL;
-			case 1:
-			case 2:
-			case 3:
-				return MCA_IS_GLOBAL;
-		}
-
-	return MCA_IS_GLOBAL;
-}
-
-/**
- * get_target_identifier - Get the valid Cache or Bus check target identifier.
- * @peidx:	pointer of index of processor error section
- *
- * Return value:
- *	target address on Success / 0 on Failure
- */
-static u64
-get_target_identifier(peidx_table_t *peidx)
-{
-	u64 target_address = 0;
-	sal_log_mod_error_info_t *smei;
-	pal_cache_check_info_t *pcci;
-	int i, level = 9;
-
-	/*
-	 * Look through the cache checks for a valid target identifier
-	 * If more than one valid target identifier, return the one
-	 * with the lowest cache level.
-	 */
-	for (i = 0; i < peidx_cache_check_num(peidx); i++) {
-		smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i);
-		if (smei->valid.target_identifier && smei->target_identifier) {
-			pcci = (pal_cache_check_info_t *)&(smei->check_info);
-			if (!target_address || (pcci->level < level)) {
-				target_address = smei->target_identifier;
-				level = pcci->level;
-				continue;
-			}
-		}
-	}
-	if (target_address)
-		return target_address;
-
-	/*
-	 * Look at the bus check for a valid target identifier
-	 */
-	smei = peidx_bus_check(peidx, 0);
-	if (smei && smei->valid.target_identifier)
-		return smei->target_identifier;
-
-	return 0;
-}
-
-/**
- * recover_from_read_error - Try to recover the errors which type are "read"s.
- * @slidx:	pointer of index of SAL error record
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer of pal_bus_check_info
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-recover_from_read_error(slidx_table_t *slidx,
-			peidx_table_t *peidx, pal_bus_check_info_t *pbci,
-			struct ia64_sal_os_state *sos)
-{
-	u64 target_identifier;
-	struct pal_min_state_area *pmsa;
-	struct ia64_psr *psr1, *psr2;
-	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
-
-	/* Is target address valid? */
-	target_identifier = get_target_identifier(peidx);
-	if (!target_identifier)
-		return fatal_mca("target address not valid");
-
-	/*
-	 * cpu read or memory-mapped io read
-	 *
-	 *    offending process  affected process  OS MCA do
-	 *     kernel mode        kernel mode       down system
-	 *     kernel mode        user   mode       kill the process
-	 *     user   mode        kernel mode       down system (*)
-	 *     user   mode        user   mode       kill the process
-	 *
-	 * (*) You could terminate offending user-mode process
-	 *    if (pbci->pv && pbci->pl != 0) *and* if you sure
-	 *    the process not have any locks of kernel.
-	 */
-
-	/* Is minstate valid? */
-	if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate))
-		return fatal_mca("minstate not valid");
-	psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
-	psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
-
-	/*
-	 *  Check the privilege level of interrupted context.
-	 *   If it is user-mode, then terminate affected process.
-	 */
-
-	pmsa = sos->pal_min_state;
-	if (psr1->cpl != 0 ||
-	   ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) {
-		/*
-		 *  setup for resume to bottom half of MCA,
-		 * "mca_handler_bhhook"
-		 */
-		/* pass to bhhook as argument (gr8, ...) */
-		pmsa->pmsa_gr[8-1] = target_identifier;
-		pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
-		pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
-		/* set interrupted return address (but no use) */
-		pmsa->pmsa_br0 = pmsa->pmsa_iip;
-		/* change resume address to bottom half */
-		pmsa->pmsa_iip = mca_hdlr_bh->fp;
-		pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
-		/* set cpl with kernel mode */
-		psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
-		psr2->cpl = 0;
-		psr2->ri  = 0;
-		psr2->bn  = 1;
-		psr2->i  = 0;
-
-		return mca_recovered("user memory corruption. "
-				"kill affected process - recovered.");
-	}
-
-	return fatal_mca("kernel context not recovered, iip 0x%lx\n",
-			 pmsa->pmsa_iip);
-}
-
-/**
- * recover_from_platform_error - Recover from platform error.
- * @slidx:	pointer of index of SAL error record
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer of pal_bus_check_info
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx,
-			    pal_bus_check_info_t *pbci,
-			    struct ia64_sal_os_state *sos)
-{
-	int status = 0;
-	pal_processor_state_info_t *psp =
-		(pal_processor_state_info_t*)peidx_psp(peidx);
-
-	if (psp->bc && pbci->eb && pbci->bsi == 0) {
-		switch(pbci->type) {
-		case 1: /* partial read */
-		case 3: /* full line(cpu) read */
-		case 9: /* I/O space read */
-			status = recover_from_read_error(slidx, peidx, pbci,
-							 sos);
-			break;
-		case 0: /* unknown */
-		case 2: /* partial write */
-		case 4: /* full line write */
-		case 5: /* implicit or explicit write-back operation */
-		case 6: /* snoop probe */
-		case 7: /* incoming or outgoing ptc.g */
-		case 8: /* write coalescing transactions */
-		case 10: /* I/O space write */
-		case 11: /* inter-processor interrupt message(IPI) */
-		case 12: /* interrupt acknowledge or
-				external task priority cycle */
-		default:
-			break;
-		}
-	} else if (psp->cc && !psp->bc) {	/* Cache error */
-		status = recover_from_read_error(slidx, peidx, pbci, sos);
-	}
-
-	return status;
-}
-
-/*
- * recover_from_tlb_check
- * @peidx:	pointer of index of processor error section
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-static int
-recover_from_tlb_check(peidx_table_t *peidx)
-{
-	sal_log_mod_error_info_t *smei;
-	pal_tlb_check_info_t *ptci;
-
-	smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0);
-	ptci = (pal_tlb_check_info_t *)&(smei->check_info);
-
-	/*
-	 * Look for signature of a duplicate TLB DTC entry, which is
-	 * a SW bug and always fatal.
-	 */
-	if (ptci->op == PAL_TLB_CHECK_OP_PURGE
-	    && !(ptci->itr || ptci->dtc || ptci->itc))
-		return fatal_mca("Duplicate TLB entry");
-
-	return mca_recovered("TLB check recovered");
-}
-
-/**
- * recover_from_processor_error
- * @platform:	whether there are some platform error section or not
- * @slidx:	pointer of index of SAL error record
- * @peidx:	pointer of index of processor error section
- * @pbci:	pointer of pal_bus_check_info
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-recover_from_processor_error(int platform, slidx_table_t *slidx,
-			     peidx_table_t *peidx, pal_bus_check_info_t *pbci,
-			     struct ia64_sal_os_state *sos)
-{
-	pal_processor_state_info_t *psp =
-		(pal_processor_state_info_t*)peidx_psp(peidx);
-
-	/*
-	 * Processor recovery status must key off of the PAL recovery
-	 * status in the Processor State Parameter.
-	 */
-
-	/*
-	 * The machine check is corrected.
-	 */
-	if (psp->cm == 1)
-		return mca_recovered("machine check is already corrected.");
-
-	/*
-	 * The error was not contained.  Software must be reset.
-	 */
-	if (psp->us || psp->ci == 0)
-		return fatal_mca("error not contained");
-
-	/*
-	 * Look for recoverable TLB check
-	 */
-	if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
-		return recover_from_tlb_check(peidx);
-
-	/*
-	 * The cache check and bus check bits have four possible states
-	 *   cc bc
-	 *    1  1	Memory error, attempt recovery
-	 *    1  0	Cache error, attempt recovery
-	 *    0  1	I/O error, attempt recovery
-	 *    0  0	Other error type, not recovered
-	 */
-	if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL))
-		return fatal_mca("No cache or bus check");
-
-	/*
-	 * Cannot handle more than one bus check.
-	 */
-	if (peidx_bus_check_num(peidx) > 1)
-		return fatal_mca("Too many bus checks");
-
-	if (pbci->ib)
-		return fatal_mca("Internal Bus error");
-	if (pbci->eb && pbci->bsi > 0)
-		return fatal_mca("External bus check fatal status");
-
-	/*
-	 * This is a local MCA and estimated as a recoverable error.
-	 */
-	if (platform)
-		return recover_from_platform_error(slidx, peidx, pbci, sos);
-
-	/*
-	 * On account of strange SAL error record, we cannot recover.
-	 */
-	return fatal_mca("Strange SAL record");
-}
-
-/**
- * mca_try_to_recover - Try to recover from MCA
- * @rec:	pointer to a SAL error record
- * @sos:	pointer to hand off struct between SAL and OS
- *
- * Return value:
- *	1 on Success / 0 on Failure
- */
-
-static int
-mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos)
-{
-	int platform_err;
-	int n_proc_err;
-	slidx_table_t slidx;
-	peidx_table_t peidx;
-	pal_bus_check_info_t pbci;
-
-	/* Make index of SAL error record */
-	platform_err = mca_make_slidx(rec, &slidx);
-
-	/* Count processor error sections */
-	n_proc_err = slidx_count(&slidx, proc_err);
-
-	 /* Now, OS can recover when there is one processor error section */
-	if (n_proc_err > 1)
-		return fatal_mca("Too Many Errors");
-	else if (n_proc_err == 0)
-		/* Weird SAL record ... We can't do anything */
-		return fatal_mca("Weird SAL record");
-
-	/* Make index of processor error section */
-	mca_make_peidx((sal_log_processor_info_t*)
-		slidx_first_entry(&slidx.proc_err)->hdr, &peidx);
-
-	/* Extract Processor BUS_CHECK[0] */
-	*((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0);
-
-	/* Check whether MCA is global or not */
-	if (is_mca_global(&peidx, &pbci, sos))
-		return fatal_mca("global MCA");
-	
-	/* Try to recover a processor error */
-	return recover_from_processor_error(platform_err, &slidx, &peidx,
-					    &pbci, sos);
-}
-
-/*
- * =============================================================================
- */
-
-int __init mca_external_handler_init(void)
-{
-	if (init_record_index_pools())
-		return -ENOMEM;
-
-	/* register external mca handlers */
-	if (ia64_reg_MCA_extension(mca_try_to_recover)) {	
-		printk(KERN_ERR "ia64_reg_MCA_extension failed.\n");
-		kfree(slidx_pool.buffer);
-		return -EFAULT;
-	}
-	return 0;
-}
-
-void __exit mca_external_handler_exit(void)
-{
-	/* unregister external mca handlers */
-	ia64_unreg_MCA_extension();
-	kfree(slidx_pool.buffer);
-}
-
-module_init(mca_external_handler_init);
-module_exit(mca_external_handler_exit);
-
-module_param(sal_rec_max, int, 0644);
-MODULE_PARM_DESC(sal_rec_max, "Max size of SAL error record");
-
-MODULE_DESCRIPTION("ia64 platform dependent mca handler driver");
-MODULE_LICENSE("GPL");
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h
deleted file mode 100644
index 45bc4e3ae14f..000000000000
--- a/arch/ia64/kernel/mca_drv.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:	mca_drv.h
- * Purpose:	Define helpers for Generic MCA handling
- *
- * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- */
-/*
- * Processor error section:
- *
- *  +-sal_log_processor_info_t *info-------------+
- *  | sal_log_section_hdr_t header;              |
- *  | ...                                        |
- *  | sal_log_mod_error_info_t info[0];          |
- *  +-+----------------+-------------------------+
- *    | CACHE_CHECK    |  ^ num_cache_check v
- *    +----------------+
- *    | TLB_CHECK      |  ^ num_tlb_check v
- *    +----------------+
- *    | BUS_CHECK      |  ^ num_bus_check v
- *    +----------------+
- *    | REG_FILE_CHECK |  ^ num_reg_file_check v
- *    +----------------+
- *    | MS_CHECK       |  ^ num_ms_check v
- *  +-struct cpuid_info *id----------------------+
- *  | regs[5];                                   |
- *  | reserved;                                  |
- *  +-sal_processor_static_info_t *regs----------+
- *  | valid;                                     |
- *  | ...                                        |
- *  | fr[128];                                   |
- *  +--------------------------------------------+
- */
-
-/* peidx: index of processor error section */
-typedef struct peidx_table {
-	sal_log_processor_info_t        *info;
-	struct sal_cpuid_info           *id;
-	sal_processor_static_info_t     *regs;
-} peidx_table_t;
-
-#define peidx_head(p)   (((p)->info))
-#define peidx_mid(p)    (((p)->id))
-#define peidx_bottom(p) (((p)->regs))
-
-#define peidx_psp(p)           (&(peidx_head(p)->proc_state_parameter))
-#define peidx_field_valid(p)   (&(peidx_head(p)->valid))
-#define peidx_minstate_area(p) (&(peidx_bottom(p)->min_state_area))
-
-#define peidx_cache_check_num(p)    (peidx_head(p)->valid.num_cache_check)
-#define peidx_tlb_check_num(p)      (peidx_head(p)->valid.num_tlb_check)
-#define peidx_bus_check_num(p)      (peidx_head(p)->valid.num_bus_check)
-#define peidx_reg_file_check_num(p) (peidx_head(p)->valid.num_reg_file_check)
-#define peidx_ms_check_num(p)       (peidx_head(p)->valid.num_ms_check)
-
-#define peidx_cache_check_idx(p, n)    (n)
-#define peidx_tlb_check_idx(p, n)      (peidx_cache_check_idx(p, peidx_cache_check_num(p)) + n)
-#define peidx_bus_check_idx(p, n)      (peidx_tlb_check_idx(p, peidx_tlb_check_num(p)) + n)
-#define peidx_reg_file_check_idx(p, n) (peidx_bus_check_idx(p, peidx_bus_check_num(p)) + n)
-#define peidx_ms_check_idx(p, n)       (peidx_reg_file_check_idx(p, peidx_reg_file_check_num(p)) + n)
-
-#define peidx_mod_error_info(p, name, n) \
-({	int __idx = peidx_##name##_idx(p, n); \
-	sal_log_mod_error_info_t *__ret = NULL; \
-	if (peidx_##name##_num(p) > n) /*BUG*/ \
-		__ret = &(peidx_head(p)->info[__idx]); \
-	__ret; })
-
-#define peidx_cache_check(p, n)    peidx_mod_error_info(p, cache_check, n)
-#define peidx_tlb_check(p, n)      peidx_mod_error_info(p, tlb_check, n)
-#define peidx_bus_check(p, n)      peidx_mod_error_info(p, bus_check, n)
-#define peidx_reg_file_check(p, n) peidx_mod_error_info(p, reg_file_check, n)
-#define peidx_ms_check(p, n)       peidx_mod_error_info(p, ms_check, n)
-
-#define peidx_check_info(proc, name, n) \
-({ \
-	sal_log_mod_error_info_t *__info = peidx_mod_error_info(proc, name, n);\
-	u64 __temp = __info && __info->valid.check_info \
-		? __info->check_info : 0; \
-	__temp; })
-
-/* slidx: index of SAL log error record */
-
-typedef struct slidx_list {
-	struct list_head list;
-	sal_log_section_hdr_t *hdr;
-} slidx_list_t;
-
-typedef struct slidx_table {
-	sal_log_record_header_t *header;
-	int n_sections;			/* # of section headers */
-	struct list_head proc_err;
-	struct list_head mem_dev_err;
-	struct list_head sel_dev_err;
-	struct list_head pci_bus_err;
-	struct list_head smbios_dev_err;
-	struct list_head pci_comp_err;
-	struct list_head plat_specific_err;
-	struct list_head host_ctlr_err;
-	struct list_head plat_bus_err;
-	struct list_head unsupported;	/* list of unsupported sections */
-} slidx_table_t;
-
-#define slidx_foreach_entry(pos, head) \
-	list_for_each_entry(pos, head, list)
-#define slidx_first_entry(head) \
-	(((head)->next != (head)) ? list_entry((head)->next, typeof(slidx_list_t), list) : NULL)
-#define slidx_count(slidx, sec) \
-({	int __count = 0; \
-	slidx_list_t *__pos; \
-	slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
-	__count; })
-
-struct mca_table_entry {
-	int start_addr;	/* location-relative starting address of MCA recoverable range */
-	int end_addr;	/* location-relative ending address of MCA recoverable range */
-};
-
-extern const struct mca_table_entry *search_mca_tables (unsigned long addr);
-extern int mca_recover_range(unsigned long);
-extern void ia64_mlogbuf_dump(void);
-
diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S
deleted file mode 100644
index 4428f57bee73..000000000000
--- a/arch/ia64/kernel/mca_drv_asm.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * File:        mca_drv_asm.S
- * Purpose:     Assembly portion of Generic MCA handling
- *
- * Copyright (C) 2004 FUJITSU LIMITED
- * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
- */
-#include <linux/threads.h>
-
-#include <asm/asmmacro.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-
-GLOBAL_ENTRY(mca_handler_bhhook)
-	invala				// clear RSE ?
-	cover
-	;;
-	clrrrb
-	;;						
-	alloc	r16=ar.pfs,0,2,3,0	// make a new frame
-	mov	ar.rsc=0
-	mov	r13=IA64_KR(CURRENT)	// current task pointer
-	;;
-	mov	r2=r13
-	;;
-	addl	r22=IA64_RBS_OFFSET,r2
-	;;
-	mov	ar.bspstore=r22
-	addl	sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2
-	;;
-	adds	r2=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
-	;;
-	st1	[r2]=r0		// clear current->thread.on_ustack flag
-	mov	loc0=r16
-	movl	loc1=mca_handler_bh	// recovery C function
-	;;
-	mov	out0=r8			// poisoned address
-	mov	out1=r9			// iip
-	mov	out2=r10		// psr
-	mov	b6=loc1
-	;;
-	mov	loc1=rp
-	ssm	psr.ic
-	;;
-	srlz.i
-	;;
-	ssm	psr.i
-	br.call.sptk.many rp=b6		// does not return ...
-	;;
-	mov	ar.pfs=loc0
-	mov 	rp=loc1
-	;;
-	mov	r8=r0
-	br.ret.sptk.many rp
-END(mca_handler_bhhook)
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h
deleted file mode 100644
index d6eab2a1084d..000000000000
--- a/arch/ia64/kernel/minstate.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include <asm/cache.h>
-
-#include "entry.h"
-#include <asm/native/inst.h>
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-/* read ar.itc in advance, and use it before leaving bank 0 */
-#define ACCOUNT_GET_STAMP				\
-(pUStk) mov.m r20=ar.itc;
-#define ACCOUNT_SYS_ENTER				\
-(pUStk) br.call.spnt rp=account_sys_enter		\
-	;;
-#else
-#define ACCOUNT_GET_STAMP
-#define ACCOUNT_SYS_ENTER
-#endif
-
-.section ".data..patch.rse", "a"
-.previous
-
-/*
- * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
- * the minimum state necessary that allows us to turn psr.ic back
- * on.
- *
- * Assumed state upon entry:
- *	psr.ic: off
- *	r31:	contains saved predicates (pr)
- *
- * Upon exit, the state is as follows:
- *	psr.ic: off
- *	 r2 = points to &pt_regs.r16
- *	 r8 = contents of ar.ccv
- *	 r9 = contents of ar.csd
- *	r10 = contents of ar.ssd
- *	r11 = FPSR_DEFAULT
- *	r12 = kernel sp (kernel virtual address)
- *	r13 = points to current task_struct (kernel virtual address)
- *	p15 = TRUE if psr.i is set in cr.ipsr
- *	predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
- *		preserved
- *
- * Note that psr.ic is NOT turned on by this macro.  This is so that
- * we can pass interruption state as arguments to a handler.
- */
-#define IA64_NATIVE_DO_SAVE_MIN(__COVER,SAVE_IFS,EXTRA,WORKAROUND)				\
-	mov r16=IA64_KR(CURRENT);	/* M */							\
-	mov r27=ar.rsc;			/* M */							\
-	mov r20=r1;			/* A */							\
-	mov r25=ar.unat;		/* M */							\
-	MOV_FROM_IPSR(p0,r29);		/* M */							\
-	mov r26=ar.pfs;			/* I */							\
-	MOV_FROM_IIP(r28);			/* M */						\
-	mov r21=ar.fpsr;		/* M */							\
-	__COVER;				/* B;; (or nothing) */				\
-	;;											\
-	adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;						\
-	;;											\
-	ld1 r17=[r16];				/* load current->thread.on_ustack flag */	\
-	st1 [r16]=r0;				/* clear current->thread.on_ustack flag */	\
-	adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16						\
-	/* switch from user to kernel RBS: */							\
-	;;											\
-	invala;				/* M */							\
-	SAVE_IFS;										\
-	cmp.eq pKStk,pUStk=r0,r17;		/* are we in kernel mode already? */		\
-	;;											\
-(pUStk)	mov ar.rsc=0;		/* set enforced lazy mode, pl 0, little-endian, loadrs=0 */	\
-	;;											\
-(pUStk)	mov.m r24=ar.rnat;									\
-(pUStk)	addl r22=IA64_RBS_OFFSET,r1;			/* compute base of RBS */		\
-(pKStk) mov r1=sp;					/* get sp  */				\
-	;;											\
-(pUStk) lfetch.fault.excl.nt1 [r22];								\
-(pUStk)	addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;	/* compute base of memory stack */	\
-(pUStk)	mov r23=ar.bspstore;				/* save ar.bspstore */			\
-	;;											\
-(pUStk)	mov ar.bspstore=r22;				/* switch to kernel RBS */		\
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;			/* if in kernel mode, use sp (r12) */	\
-	;;											\
-(pUStk)	mov r18=ar.bsp;										\
-(pUStk)	mov ar.rsc=0x3;		/* set eager mode, pl 0, little-endian, loadrs=0 */		\
-	adds r17=2*L1_CACHE_BYTES,r1;		/* really: biggest cache-line size */		\
-	adds r16=PT(CR_IPSR),r1;								\
-	;;											\
-	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;						\
-	st8 [r16]=r29;		/* save cr.ipsr */						\
-	;;											\
-	lfetch.fault.excl.nt1 [r17];								\
-	tbit.nz p15,p0=r29,IA64_PSR_I_BIT;							\
-	mov r29=b0										\
-	;;											\
-	WORKAROUND;										\
-	adds r16=PT(R8),r1;	/* initialize first base pointer */				\
-	adds r17=PT(R9),r1;	/* initialize second base pointer */				\
-(pKStk)	mov r18=r0;		/* make sure r18 isn't NaT */					\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r8,16;								\
-.mem.offset 8,0; st8.spill [r17]=r9,16;								\
-        ;;											\
-.mem.offset 0,0; st8.spill [r16]=r10,24;							\
-.mem.offset 8,0; st8.spill [r17]=r11,24;							\
-        ;;											\
-	st8 [r16]=r28,16;	/* save cr.iip */						\
-	st8 [r17]=r30,16;	/* save cr.ifs */						\
-(pUStk)	sub r18=r18,r22;	/* r18=RSE.ndirty*8 */						\
-	mov r8=ar.ccv;										\
-	mov r9=ar.csd;										\
-	mov r10=ar.ssd;										\
-	movl r11=FPSR_DEFAULT;   /* L-unit */							\
-	;;											\
-	st8 [r16]=r25,16;	/* save ar.unat */						\
-	st8 [r17]=r26,16;	/* save ar.pfs */						\
-	shl r18=r18,16;		/* compute ar.rsc to be used for "loadrs" */			\
-	;;											\
-	st8 [r16]=r27,16;	/* save ar.rsc */						\
-(pUStk)	st8 [r17]=r24,16;	/* save ar.rnat */						\
-(pKStk)	adds r17=16,r17;	/* skip over ar_rnat field */					\
-	;;			/* avoid RAW on r16 & r17 */					\
-(pUStk)	st8 [r16]=r23,16;	/* save ar.bspstore */						\
-	st8 [r17]=r31,16;	/* save predicates */						\
-(pKStk)	adds r16=16,r16;	/* skip over ar_bspstore field */				\
-	;;											\
-	st8 [r16]=r29,16;	/* save b0 */							\
-	st8 [r17]=r18,16;	/* save ar.rsc value for "loadrs" */				\
-	cmp.eq pNonSys,pSys=r0,r0	/* initialize pSys=0, pNonSys=1 */			\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r20,16;	/* save original r1 */				\
-.mem.offset 8,0; st8.spill [r17]=r12,16;							\
-	adds r12=-16,r1;	/* switch to kernel memory stack (with 16 bytes of scratch) */	\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r13,16;							\
-.mem.offset 8,0; st8.spill [r17]=r21,16;	/* save ar.fpsr */				\
-	mov r13=IA64_KR(CURRENT);	/* establish `current' */				\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r15,16;							\
-.mem.offset 8,0; st8.spill [r17]=r14,16;							\
-	;;											\
-.mem.offset 0,0; st8.spill [r16]=r2,16;								\
-.mem.offset 8,0; st8.spill [r17]=r3,16;								\
-	ACCOUNT_GET_STAMP									\
-	adds r2=IA64_PT_REGS_R16_OFFSET,r1;							\
-	;;											\
-	EXTRA;											\
-	movl r1=__gp;		/* establish kernel global pointer */				\
-	;;											\
-	ACCOUNT_SYS_ENTER									\
-	bsw.1;			/* switch back to bank 1 (must be last in insn group) */	\
-	;;
-
-/*
- * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
- *
- * Assumed state upon entry:
- *	psr.ic: on
- *	r2:	points to &pt_regs.r16
- *	r3:	points to &pt_regs.r17
- *	r8:	contents of ar.ccv
- *	r9:	contents of ar.csd
- *	r10:	contents of ar.ssd
- *	r11:	FPSR_DEFAULT
- *
- * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
- */
-#define SAVE_REST				\
-.mem.offset 0,0; st8.spill [r2]=r16,16;		\
-.mem.offset 8,0; st8.spill [r3]=r17,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r18,16;		\
-.mem.offset 8,0; st8.spill [r3]=r19,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r20,16;		\
-.mem.offset 8,0; st8.spill [r3]=r21,16;		\
-	mov r18=b6;				\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r22,16;		\
-.mem.offset 8,0; st8.spill [r3]=r23,16;		\
-	mov r19=b7;				\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r24,16;		\
-.mem.offset 8,0; st8.spill [r3]=r25,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r26,16;		\
-.mem.offset 8,0; st8.spill [r3]=r27,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r28,16;		\
-.mem.offset 8,0; st8.spill [r3]=r29,16;		\
-	;;					\
-.mem.offset 0,0; st8.spill [r2]=r30,16;		\
-.mem.offset 8,0; st8.spill [r3]=r31,32;		\
-	;;					\
-	mov ar.fpsr=r11;	/* M-unit */	\
-	st8 [r2]=r8,8;		/* ar.ccv */	\
-	adds r24=PT(B6)-PT(F7),r3;		\
-	;;					\
-	stf.spill [r2]=f6,32;			\
-	stf.spill [r3]=f7,32;			\
-	;;					\
-	stf.spill [r2]=f8,32;			\
-	stf.spill [r3]=f9,32;			\
-	;;					\
-	stf.spill [r2]=f10;			\
-	stf.spill [r3]=f11;			\
-	adds r25=PT(B7)-PT(F11),r3;		\
-	;;					\
-	st8 [r24]=r18,16;       /* b6 */	\
-	st8 [r25]=r19,16;       /* b7 */	\
-	;;					\
-	st8 [r24]=r9;        	/* ar.csd */	\
-	st8 [r25]=r10;      	/* ar.ssd */	\
-	;;
-
-#define RSE_WORKAROUND				\
-(pUStk) extr.u r17=r18,3,6;			\
-(pUStk)	sub r16=r18,r22;			\
-[1:](pKStk)	br.cond.sptk.many 1f;		\
-	.xdata4 ".data..patch.rse",1b-.		\
-	;;					\
-	cmp.ge p6,p7 = 33,r17;			\
-	;;					\
-(p6)	mov r17=0x310;				\
-(p7)	mov r17=0x308;				\
-	;;					\
-	cmp.leu p1,p0=r16,r17;			\
-(p1)	br.cond.sptk.many 1f;			\
-	dep.z r17=r26,0,62;			\
-	movl r16=2f;				\
-	;;					\
-	mov ar.pfs=r17;				\
-	dep r27=r0,r27,16,14;			\
-	mov b0=r16;				\
-	;;					\
-	br.ret.sptk b0;				\
-	;;					\
-2:						\
-	mov ar.rsc=r0				\
-	;;					\
-	flushrs;				\
-	;;					\
-	mov ar.bspstore=r22			\
-	;;					\
-	mov r18=ar.bsp;				\
-	;;					\
-1:						\
-	.pred.rel "mutex", pKStk, pUStk
-
-#define SAVE_MIN_WITH_COVER	DO_SAVE_MIN(COVER, mov r30=cr.ifs, , RSE_WORKAROUND)
-#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(COVER, mov r30=cr.ifs, mov r15=r19, RSE_WORKAROUND)
-#define SAVE_MIN			DO_SAVE_MIN(     , mov r30=r0, , )
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
deleted file mode 100644
index 3661135da9d9..000000000000
--- a/arch/ia64/kernel/module.c
+++ /dev/null
@@ -1,959 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IA-64-specific support for kernel module loader.
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Loosely based on patch by Rusty Russell.
- */
-
-/* relocs tested so far:
-
-   DIR64LSB
-   FPTR64LSB
-   GPREL22
-   LDXMOV
-   LDXMOV
-   LTOFF22
-   LTOFF22X
-   LTOFF22X
-   LTOFF_FPTR22
-   PCREL21B	(for br.call only; br.cond is not supported out of modules!)
-   PCREL60B	(for brl.cond only; brl.call is not supported for modules!)
-   PCREL64LSB
-   SECREL32LSB
-   SEGREL64LSB
- */
-
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/elf.h>
-#include <linux/moduleloader.h>
-#include <linux/string.h>
-#include <linux/vmalloc.h>
-
-#include <asm/patch.h>
-#include <asm/unaligned.h>
-#include <asm/sections.h>
-
-#define ARCH_MODULE_DEBUG 0
-
-#if ARCH_MODULE_DEBUG
-# define DEBUGP printk
-# define inline
-#else
-# define DEBUGP(fmt , a...)
-#endif
-
-#ifdef CONFIG_ITANIUM
-# define USE_BRL	0
-#else
-# define USE_BRL	1
-#endif
-
-#define MAX_LTOFF	((uint64_t) (1 << 22))	/* max. allowable linkage-table offset */
-
-/* Define some relocation helper macros/types: */
-
-#define FORMAT_SHIFT	0
-#define FORMAT_BITS	3
-#define FORMAT_MASK	((1 << FORMAT_BITS) - 1)
-#define VALUE_SHIFT	3
-#define VALUE_BITS	5
-#define VALUE_MASK	((1 << VALUE_BITS) - 1)
-
-enum reloc_target_format {
-	/* direct encoded formats: */
-	RF_NONE = 0,
-	RF_INSN14 = 1,
-	RF_INSN22 = 2,
-	RF_INSN64 = 3,
-	RF_32MSB = 4,
-	RF_32LSB = 5,
-	RF_64MSB = 6,
-	RF_64LSB = 7,
-
-	/* formats that cannot be directly decoded: */
-	RF_INSN60,
-	RF_INSN21B,	/* imm21 form 1 */
-	RF_INSN21M,	/* imm21 form 2 */
-	RF_INSN21F	/* imm21 form 3 */
-};
-
-enum reloc_value_formula {
-	RV_DIRECT = 4,		/* S + A */
-	RV_GPREL = 5,		/* @gprel(S + A) */
-	RV_LTREL = 6,		/* @ltoff(S + A) */
-	RV_PLTREL = 7,		/* @pltoff(S + A) */
-	RV_FPTR = 8,		/* @fptr(S + A) */
-	RV_PCREL = 9,		/* S + A - P */
-	RV_LTREL_FPTR = 10,	/* @ltoff(@fptr(S + A)) */
-	RV_SEGREL = 11,		/* @segrel(S + A) */
-	RV_SECREL = 12,		/* @secrel(S + A) */
-	RV_BDREL = 13,		/* BD + A */
-	RV_LTV = 14,		/* S + A (like RV_DIRECT, except frozen at static link-time) */
-	RV_PCREL2 = 15,		/* S + A - P */
-	RV_SPECIAL = 16,	/* various (see below) */
-	RV_RSVD17 = 17,
-	RV_TPREL = 18,		/* @tprel(S + A) */
-	RV_LTREL_TPREL = 19,	/* @ltoff(@tprel(S + A)) */
-	RV_DTPMOD = 20,		/* @dtpmod(S + A) */
-	RV_LTREL_DTPMOD = 21,	/* @ltoff(@dtpmod(S + A)) */
-	RV_DTPREL = 22,		/* @dtprel(S + A) */
-	RV_LTREL_DTPREL = 23,	/* @ltoff(@dtprel(S + A)) */
-	RV_RSVD24 = 24,
-	RV_RSVD25 = 25,
-	RV_RSVD26 = 26,
-	RV_RSVD27 = 27
-	/* 28-31 reserved for implementation-specific purposes.  */
-};
-
-#define N(reloc)	[R_IA64_##reloc] = #reloc
-
-static const char *reloc_name[256] = {
-	N(NONE),		N(IMM14),		N(IMM22),		N(IMM64),
-	N(DIR32MSB),		N(DIR32LSB),		N(DIR64MSB),		N(DIR64LSB),
-	N(GPREL22),		N(GPREL64I),		N(GPREL32MSB),		N(GPREL32LSB),
-	N(GPREL64MSB),		N(GPREL64LSB),		N(LTOFF22),		N(LTOFF64I),
-	N(PLTOFF22),		N(PLTOFF64I),		N(PLTOFF64MSB),		N(PLTOFF64LSB),
-	N(FPTR64I),		N(FPTR32MSB),		N(FPTR32LSB),		N(FPTR64MSB),
-	N(FPTR64LSB),		N(PCREL60B),		N(PCREL21B),		N(PCREL21M),
-	N(PCREL21F),		N(PCREL32MSB),		N(PCREL32LSB),		N(PCREL64MSB),
-	N(PCREL64LSB),		N(LTOFF_FPTR22),	N(LTOFF_FPTR64I),	N(LTOFF_FPTR32MSB),
-	N(LTOFF_FPTR32LSB),	N(LTOFF_FPTR64MSB),	N(LTOFF_FPTR64LSB),	N(SEGREL32MSB),
-	N(SEGREL32LSB),		N(SEGREL64MSB),		N(SEGREL64LSB),		N(SECREL32MSB),
-	N(SECREL32LSB),		N(SECREL64MSB),		N(SECREL64LSB),		N(REL32MSB),
-	N(REL32LSB),		N(REL64MSB),		N(REL64LSB),		N(LTV32MSB),
-	N(LTV32LSB),		N(LTV64MSB),		N(LTV64LSB),		N(PCREL21BI),
-	N(PCREL22),		N(PCREL64I),		N(IPLTMSB),		N(IPLTLSB),
-	N(COPY),		N(LTOFF22X),		N(LDXMOV),		N(TPREL14),
-	N(TPREL22),		N(TPREL64I),		N(TPREL64MSB),		N(TPREL64LSB),
-	N(LTOFF_TPREL22),	N(DTPMOD64MSB),		N(DTPMOD64LSB),		N(LTOFF_DTPMOD22),
-	N(DTPREL14),		N(DTPREL22),		N(DTPREL64I),		N(DTPREL32MSB),
-	N(DTPREL32LSB),		N(DTPREL64MSB),		N(DTPREL64LSB),		N(LTOFF_DTPREL22)
-};
-
-#undef N
-
-/* Opaque struct for insns, to protect against derefs. */
-struct insn;
-
-static inline uint64_t
-bundle (const struct insn *insn)
-{
-	return (uint64_t) insn & ~0xfUL;
-}
-
-static inline int
-slot (const struct insn *insn)
-{
-	return (uint64_t) insn & 0x3;
-}
-
-static int
-apply_imm64 (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (slot(insn) != 1 && slot(insn) != 2) {
-		printk(KERN_ERR "%s: invalid slot number %d for IMM64\n",
-		       mod->name, slot(insn));
-		return 0;
-	}
-	ia64_patch_imm64((u64) insn, val);
-	return 1;
-}
-
-static int
-apply_imm60 (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (slot(insn) != 1 && slot(insn) != 2) {
-		printk(KERN_ERR "%s: invalid slot number %d for IMM60\n",
-		       mod->name, slot(insn));
-		return 0;
-	}
-	if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) {
-		printk(KERN_ERR "%s: value %ld out of IMM60 range\n",
-			mod->name, (long) val);
-		return 0;
-	}
-	ia64_patch_imm60((u64) insn, val);
-	return 1;
-}
-
-static int
-apply_imm22 (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (val + (1 << 21) >= (1 << 22)) {
-		printk(KERN_ERR "%s: value %li out of IMM22 range\n",
-			mod->name, (long)val);
-		return 0;
-	}
-	ia64_patch((u64) insn, 0x01fffcfe000UL, (  ((val & 0x200000UL) << 15) /* bit 21 -> 36 */
-					         | ((val & 0x1f0000UL) <<  6) /* bit 16 -> 22 */
-					         | ((val & 0x00ff80UL) << 20) /* bit  7 -> 27 */
-					         | ((val & 0x00007fUL) << 13) /* bit  0 -> 13 */));
-	return 1;
-}
-
-static int
-apply_imm21b (struct module *mod, struct insn *insn, uint64_t val)
-{
-	if (val + (1 << 20) >= (1 << 21)) {
-		printk(KERN_ERR "%s: value %li out of IMM21b range\n",
-			mod->name, (long)val);
-		return 0;
-	}
-	ia64_patch((u64) insn, 0x11ffffe000UL, (  ((val & 0x100000UL) << 16) /* bit 20 -> 36 */
-					        | ((val & 0x0fffffUL) << 13) /* bit  0 -> 13 */));
-	return 1;
-}
-
-#if USE_BRL
-
-struct plt_entry {
-	/* Three instruction bundles in PLT. */
- 	unsigned char bundle[2][16];
-};
-
-static const struct plt_entry ia64_plt_template = {
-	{
-		{
-			0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /*	     movl gp=TARGET_GP */
-			0x00, 0x00, 0x00, 0x60
-		},
-		{
-			0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*	     brl.many gp=TARGET_GP */
-			0x08, 0x00, 0x00, 0xc0
-		}
-	}
-};
-
-static int
-patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp)
-{
-	if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_gp)
-	    && apply_imm60(mod, (struct insn *) (plt->bundle[1] + 2),
-			   (target_ip - (int64_t) plt->bundle[1]) / 16))
-		return 1;
-	return 0;
-}
-
-unsigned long
-plt_target (struct plt_entry *plt)
-{
-	uint64_t b0, b1, *b = (uint64_t *) plt->bundle[1];
-	long off;
-
-	b0 = b[0]; b1 = b[1];
-	off = (  ((b1 & 0x00fffff000000000UL) >> 36)		/* imm20b -> bit 0 */
-	       | ((b0 >> 48) << 20) | ((b1 & 0x7fffffUL) << 36)	/* imm39 -> bit 20 */
-	       | ((b1 & 0x0800000000000000UL) << 0));		/* i -> bit 59 */
-	return (long) plt->bundle[1] + 16*off;
-}
-
-#else /* !USE_BRL */
-
-struct plt_entry {
-	/* Three instruction bundles in PLT. */
- 	unsigned char bundle[3][16];
-};
-
-static const struct plt_entry ia64_plt_template = {
-	{
-		{
-			0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /*	     movl r16=TARGET_IP */
-			0x02, 0x00, 0x00, 0x60
-		},
-		{
-			0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */
-			0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /*	     movl gp=TARGET_GP */
-			0x00, 0x00, 0x00, 0x60
-		},
-		{
-			0x11, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MIB] nop.m 0 */
-			0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /*	     mov b6=r16 */
-			0x60, 0x00, 0x80, 0x00		    /*	     br.few b6 */
-		}
-	}
-};
-
-static int
-patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp)
-{
-	if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_ip)
-	    && apply_imm64(mod, (struct insn *) (plt->bundle[1] + 2), target_gp))
-		return 1;
-	return 0;
-}
-
-unsigned long
-plt_target (struct plt_entry *plt)
-{
-	uint64_t b0, b1, *b = (uint64_t *) plt->bundle[0];
-
-	b0 = b[0]; b1 = b[1];
-	return (  ((b1 & 0x000007f000000000) >> 36)		/* imm7b -> bit 0 */
-		| ((b1 & 0x07fc000000000000) >> 43)		/* imm9d -> bit 7 */
-		| ((b1 & 0x0003e00000000000) >> 29)		/* imm5c -> bit 16 */
-		| ((b1 & 0x0000100000000000) >> 23)		/* ic -> bit 21 */
-		| ((b0 >> 46) << 22) | ((b1 & 0x7fffff) << 40)	/* imm41 -> bit 22 */
-		| ((b1 & 0x0800000000000000) <<  4));		/* i -> bit 63 */
-}
-
-#endif /* !USE_BRL */
-
-void
-module_arch_freeing_init (struct module *mod)
-{
-	if (mod->arch.init_unw_table) {
-		unw_remove_unwind_table(mod->arch.init_unw_table);
-		mod->arch.init_unw_table = NULL;
-	}
-}
-
-/* Have we already seen one of these relocations? */
-/* FIXME: we could look in other sections, too --RR */
-static int
-duplicate_reloc (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i;
-
-	for (i = 0; i < num; i++) {
-		if (rela[i].r_info == rela[num].r_info && rela[i].r_addend == rela[num].r_addend)
-			return 1;
-	}
-	return 0;
-}
-
-/* Count how many GOT entries we may need */
-static unsigned int
-count_gots (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not
-           time critical */
-	for (i = 0; i < num; i++) {
-		switch (ELF64_R_TYPE(rela[i].r_info)) {
-		      case R_IA64_LTOFF22:
-		      case R_IA64_LTOFF22X:
-		      case R_IA64_LTOFF64I:
-		      case R_IA64_LTOFF_FPTR22:
-		      case R_IA64_LTOFF_FPTR64I:
-		      case R_IA64_LTOFF_FPTR32MSB:
-		      case R_IA64_LTOFF_FPTR32LSB:
-		      case R_IA64_LTOFF_FPTR64MSB:
-		      case R_IA64_LTOFF_FPTR64LSB:
-			if (!duplicate_reloc(rela, i))
-				ret++;
-			break;
-		}
-	}
-	return ret;
-}
-
-/* Count how many PLT entries we may need */
-static unsigned int
-count_plts (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not
-           time critical */
-	for (i = 0; i < num; i++) {
-		switch (ELF64_R_TYPE(rela[i].r_info)) {
-		      case R_IA64_PCREL21B:
-		      case R_IA64_PLTOFF22:
-		      case R_IA64_PLTOFF64I:
-		      case R_IA64_PLTOFF64MSB:
-		      case R_IA64_PLTOFF64LSB:
-		      case R_IA64_IPLTMSB:
-		      case R_IA64_IPLTLSB:
-			if (!duplicate_reloc(rela, i))
-				ret++;
-			break;
-		}
-	}
-	return ret;
-}
-
-/* We need to create an function-descriptors for any internal function
-   which is referenced. */
-static unsigned int
-count_fdescs (const Elf64_Rela *rela, unsigned int num)
-{
-	unsigned int i, ret = 0;
-
-	/* Sure, this is order(n^2), but it's usually short, and not time critical.  */
-	for (i = 0; i < num; i++) {
-		switch (ELF64_R_TYPE(rela[i].r_info)) {
-		      case R_IA64_FPTR64I:
-		      case R_IA64_FPTR32LSB:
-		      case R_IA64_FPTR32MSB:
-		      case R_IA64_FPTR64LSB:
-		      case R_IA64_FPTR64MSB:
-		      case R_IA64_LTOFF_FPTR22:
-		      case R_IA64_LTOFF_FPTR32LSB:
-		      case R_IA64_LTOFF_FPTR32MSB:
-		      case R_IA64_LTOFF_FPTR64I:
-		      case R_IA64_LTOFF_FPTR64LSB:
-		      case R_IA64_LTOFF_FPTR64MSB:
-		      case R_IA64_IPLTMSB:
-		      case R_IA64_IPLTLSB:
-			/*
-			 * Jumps to static functions sometimes go straight to their
-			 * offset.  Of course, that may not be possible if the jump is
-			 * from init -> core or vice. versa, so we need to generate an
-			 * FDESC (and PLT etc) for that.
-			 */
-		      case R_IA64_PCREL21B:
-			if (!duplicate_reloc(rela, i))
-				ret++;
-			break;
-		}
-	}
-	return ret;
-}
-
-int
-module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
-			   struct module *mod)
-{
-	unsigned long core_plts = 0, init_plts = 0, gots = 0, fdescs = 0;
-	Elf64_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
-
-	/*
-	 * To store the PLTs and function-descriptors, we expand the .text section for
-	 * core module-code and the .init.text section for initialization code.
-	 */
-	for (s = sechdrs; s < sechdrs_end; ++s)
-		if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
-			mod->arch.core_plt = s;
-		else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
-			mod->arch.init_plt = s;
-		else if (strcmp(".got", secstrings + s->sh_name) == 0)
-			mod->arch.got = s;
-		else if (strcmp(".opd", secstrings + s->sh_name) == 0)
-			mod->arch.opd = s;
-		else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
-			mod->arch.unwind = s;
-
-	if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
-		printk(KERN_ERR "%s: sections missing\n", mod->name);
-		return -ENOEXEC;
-	}
-
-	/* GOT and PLTs can occur in any relocated section... */
-	for (s = sechdrs + 1; s < sechdrs_end; ++s) {
-		const Elf64_Rela *rels = (void *)ehdr + s->sh_offset;
-		unsigned long numrels = s->sh_size/sizeof(Elf64_Rela);
-
-		if (s->sh_type != SHT_RELA)
-			continue;
-
-		gots += count_gots(rels, numrels);
-		fdescs += count_fdescs(rels, numrels);
-		if (strstr(secstrings + s->sh_name, ".init"))
-			init_plts += count_plts(rels, numrels);
-		else
-			core_plts += count_plts(rels, numrels);
-	}
-
-	mod->arch.core_plt->sh_type = SHT_NOBITS;
-	mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.core_plt->sh_addralign = 16;
-	mod->arch.core_plt->sh_size = core_plts * sizeof(struct plt_entry);
-	mod->arch.init_plt->sh_type = SHT_NOBITS;
-	mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.init_plt->sh_addralign = 16;
-	mod->arch.init_plt->sh_size = init_plts * sizeof(struct plt_entry);
-	mod->arch.got->sh_type = SHT_NOBITS;
-	mod->arch.got->sh_flags = ARCH_SHF_SMALL | SHF_ALLOC;
-	mod->arch.got->sh_addralign = 8;
-	mod->arch.got->sh_size = gots * sizeof(struct got_entry);
-	mod->arch.opd->sh_type = SHT_NOBITS;
-	mod->arch.opd->sh_flags = SHF_ALLOC;
-	mod->arch.opd->sh_addralign = 8;
-	mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc);
-	DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n",
-	       __func__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size,
-	       mod->arch.got->sh_size, mod->arch.opd->sh_size);
-	return 0;
-}
-
-static inline bool
-in_init (const struct module *mod, uint64_t addr)
-{
-	return within_module_init(addr, mod);
-}
-
-static inline bool
-in_core (const struct module *mod, uint64_t addr)
-{
-	return within_module_core(addr, mod);
-}
-
-static inline bool
-is_internal (const struct module *mod, uint64_t value)
-{
-	return in_init(mod, value) || in_core(mod, value);
-}
-
-/*
- * Get gp-relative offset for the linkage-table entry of VALUE.
- */
-static uint64_t
-get_ltoff (struct module *mod, uint64_t value, int *okp)
-{
-	struct got_entry *got, *e;
-
-	if (!*okp)
-		return 0;
-
-	got = (void *) mod->arch.got->sh_addr;
-	for (e = got; e < got + mod->arch.next_got_entry; ++e)
-		if (e->val == value)
-			goto found;
-
-	/* Not enough GOT entries? */
-	BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size));
-
-	e->val = value;
-	++mod->arch.next_got_entry;
-  found:
-	return (uint64_t) e - mod->arch.gp;
-}
-
-static inline int
-gp_addressable (struct module *mod, uint64_t value)
-{
-	return value - mod->arch.gp + MAX_LTOFF/2 < MAX_LTOFF;
-}
-
-/* Get PC-relative PLT entry for this value.  Returns 0 on failure. */
-static uint64_t
-get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp)
-{
-	struct plt_entry *plt, *plt_end;
-	uint64_t target_ip, target_gp;
-
-	if (!*okp)
-		return 0;
-
-	if (in_init(mod, (uint64_t) insn)) {
-		plt = (void *) mod->arch.init_plt->sh_addr;
-		plt_end = (void *) plt + mod->arch.init_plt->sh_size;
-	} else {
-		plt = (void *) mod->arch.core_plt->sh_addr;
-		plt_end = (void *) plt + mod->arch.core_plt->sh_size;
-	}
-
-	/* "value" is a pointer to a function-descriptor; fetch the target ip/gp from it: */
-	target_ip = ((uint64_t *) value)[0];
-	target_gp = ((uint64_t *) value)[1];
-
-	/* Look for existing PLT entry. */
-	while (plt->bundle[0][0]) {
-		if (plt_target(plt) == target_ip)
-			goto found;
-		if (++plt >= plt_end)
-			BUG();
-	}
-	*plt = ia64_plt_template;
-	if (!patch_plt(mod, plt, target_ip, target_gp)) {
-		*okp = 0;
-		return 0;
-	}
-#if ARCH_MODULE_DEBUG
-	if (plt_target(plt) != target_ip) {
-		printk("%s: mistargeted PLT: wanted %lx, got %lx\n",
-		       __func__, target_ip, plt_target(plt));
-		*okp = 0;
-		return 0;
-	}
-#endif
-  found:
-	return (uint64_t) plt;
-}
-
-/* Get function descriptor for VALUE. */
-static uint64_t
-get_fdesc (struct module *mod, uint64_t value, int *okp)
-{
-	struct fdesc *fdesc = (void *) mod->arch.opd->sh_addr;
-
-	if (!*okp)
-		return 0;
-
-	if (!value) {
-		printk(KERN_ERR "%s: fdesc for zero requested!\n", mod->name);
-		return 0;
-	}
-
-	if (!is_internal(mod, value))
-		/*
-		 * If it's not a module-local entry-point, "value" already points to a
-		 * function-descriptor.
-		 */
-		return value;
-
-	/* Look for existing function descriptor. */
-	while (fdesc->addr) {
-		if (fdesc->addr == value)
-			return (uint64_t)fdesc;
-		if ((uint64_t) ++fdesc >= mod->arch.opd->sh_addr + mod->arch.opd->sh_size)
-			BUG();
-	}
-
-	/* Create new one */
-	fdesc->addr = value;
-	fdesc->gp = mod->arch.gp;
-	return (uint64_t) fdesc;
-}
-
-static inline int
-do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend,
-	  Elf64_Shdr *sec, void *location)
-{
-	enum reloc_target_format format = (r_type >> FORMAT_SHIFT) & FORMAT_MASK;
-	enum reloc_value_formula formula = (r_type >> VALUE_SHIFT) & VALUE_MASK;
-	uint64_t val;
-	int ok = 1;
-
-	val = sym->st_value + addend;
-
-	switch (formula) {
-	      case RV_SEGREL:	/* segment base is arbitrarily chosen to be 0 for kernel modules */
-	      case RV_DIRECT:
-		break;
-
-	      case RV_GPREL:	  val -= mod->arch.gp; break;
-	      case RV_LTREL:	  val = get_ltoff(mod, val, &ok); break;
-	      case RV_PLTREL:	  val = get_plt(mod, location, val, &ok); break;
-	      case RV_FPTR:	  val = get_fdesc(mod, val, &ok); break;
-	      case RV_SECREL:	  val -= sec->sh_addr; break;
-	      case RV_LTREL_FPTR: val = get_ltoff(mod, get_fdesc(mod, val, &ok), &ok); break;
-
-	      case RV_PCREL:
-		switch (r_type) {
-		      case R_IA64_PCREL21B:
-			if ((in_init(mod, val) && in_core(mod, (uint64_t)location)) ||
-			    (in_core(mod, val) && in_init(mod, (uint64_t)location))) {
-				/*
-				 * Init section may have been allocated far away from core,
-				 * if the branch won't reach, then allocate a plt for it.
-				 */
-				uint64_t delta = ((int64_t)val - (int64_t)location) / 16;
-				if (delta + (1 << 20) >= (1 << 21)) {
-					val = get_fdesc(mod, val, &ok);
-					val = get_plt(mod, location, val, &ok);
-				}
-			} else if (!is_internal(mod, val))
-				val = get_plt(mod, location, val, &ok);
-			fallthrough;
-		      default:
-			val -= bundle(location);
-			break;
-
-		      case R_IA64_PCREL32MSB:
-		      case R_IA64_PCREL32LSB:
-		      case R_IA64_PCREL64MSB:
-		      case R_IA64_PCREL64LSB:
-			val -= (uint64_t) location;
-			break;
-
-		}
-		switch (r_type) {
-		      case R_IA64_PCREL60B: format = RF_INSN60; break;
-		      case R_IA64_PCREL21B: format = RF_INSN21B; break;
-		      case R_IA64_PCREL21M: format = RF_INSN21M; break;
-		      case R_IA64_PCREL21F: format = RF_INSN21F; break;
-		      default: break;
-		}
-		break;
-
-	      case RV_BDREL:
-		val -= (uint64_t) (in_init(mod, val) ? mod->mem[MOD_INIT_TEXT].base :
-				   mod->mem[MOD_TEXT].base);
-		break;
-
-	      case RV_LTV:
-		/* can link-time value relocs happen here?  */
-		BUG();
-		break;
-
-	      case RV_PCREL2:
-		if (r_type == R_IA64_PCREL21BI) {
-			if (!is_internal(mod, val)) {
-				printk(KERN_ERR "%s: %s reloc against "
-					"non-local symbol (%lx)\n", __func__,
-					reloc_name[r_type], (unsigned long)val);
-				return -ENOEXEC;
-			}
-			format = RF_INSN21B;
-		}
-		val -= bundle(location);
-		break;
-
-	      case RV_SPECIAL:
-		switch (r_type) {
-		      case R_IA64_IPLTMSB:
-		      case R_IA64_IPLTLSB:
-			val = get_fdesc(mod, get_plt(mod, location, val, &ok), &ok);
-			format = RF_64LSB;
-			if (r_type == R_IA64_IPLTMSB)
-				format = RF_64MSB;
-			break;
-
-		      case R_IA64_SUB:
-			val = addend - sym->st_value;
-			format = RF_INSN64;
-			break;
-
-		      case R_IA64_LTOFF22X:
-			if (gp_addressable(mod, val))
-				val -= mod->arch.gp;
-			else
-				val = get_ltoff(mod, val, &ok);
-			format = RF_INSN22;
-			break;
-
-		      case R_IA64_LDXMOV:
-			if (gp_addressable(mod, val)) {
-				/* turn "ld8" into "mov": */
-				DEBUGP("%s: patching ld8 at %p to mov\n", __func__, location);
-				ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL);
-			}
-			return 0;
-
-		      default:
-			if (reloc_name[r_type])
-				printk(KERN_ERR "%s: special reloc %s not supported",
-				       mod->name, reloc_name[r_type]);
-			else
-				printk(KERN_ERR "%s: unknown special reloc %x\n",
-				       mod->name, r_type);
-			return -ENOEXEC;
-		}
-		break;
-
-	      case RV_TPREL:
-	      case RV_LTREL_TPREL:
-	      case RV_DTPMOD:
-	      case RV_LTREL_DTPMOD:
-	      case RV_DTPREL:
-	      case RV_LTREL_DTPREL:
-		printk(KERN_ERR "%s: %s reloc not supported\n",
-		       mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?");
-		return -ENOEXEC;
-
-	      default:
-		printk(KERN_ERR "%s: unknown reloc %x\n", mod->name, r_type);
-		return -ENOEXEC;
-	}
-
-	if (!ok)
-		return -ENOEXEC;
-
-	DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __func__, location, val,
-	       reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend);
-
-	switch (format) {
-	      case RF_INSN21B:	ok = apply_imm21b(mod, location, (int64_t) val / 16); break;
-	      case RF_INSN22:	ok = apply_imm22(mod, location, val); break;
-	      case RF_INSN64:	ok = apply_imm64(mod, location, val); break;
-	      case RF_INSN60:	ok = apply_imm60(mod, location, (int64_t) val / 16); break;
-	      case RF_32LSB:	put_unaligned(val, (uint32_t *) location); break;
-	      case RF_64LSB:	put_unaligned(val, (uint64_t *) location); break;
-	      case RF_32MSB:	/* ia64 Linux is little-endian... */
-	      case RF_64MSB:	/* ia64 Linux is little-endian... */
-	      case RF_INSN14:	/* must be within-module, i.e., resolved by "ld -r" */
-	      case RF_INSN21M:	/* must be within-module, i.e., resolved by "ld -r" */
-	      case RF_INSN21F:	/* must be within-module, i.e., resolved by "ld -r" */
-		printk(KERN_ERR "%s: format %u needed by %s reloc is not supported\n",
-		       mod->name, format, reloc_name[r_type] ? reloc_name[r_type] : "?");
-		return -ENOEXEC;
-
-	      default:
-		printk(KERN_ERR "%s: relocation %s resulted in unknown format %u\n",
-		       mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?", format);
-		return -ENOEXEC;
-	}
-	return ok ? 0 : -ENOEXEC;
-}
-
-int
-apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex,
-		    unsigned int relsec, struct module *mod)
-{
-	unsigned int i, n = sechdrs[relsec].sh_size / sizeof(Elf64_Rela);
-	Elf64_Rela *rela = (void *) sechdrs[relsec].sh_addr;
-	Elf64_Shdr *target_sec;
-	int ret;
-
-	DEBUGP("%s: applying section %u (%u relocs) to %u\n", __func__,
-	       relsec, n, sechdrs[relsec].sh_info);
-
-	target_sec = sechdrs + sechdrs[relsec].sh_info;
-
-	if (target_sec->sh_entsize == ~0UL)
-		/*
-		 * If target section wasn't allocated, we don't need to relocate it.
-		 * Happens, e.g., for debug sections.
-		 */
-		return 0;
-
-	if (!mod->arch.gp) {
-		/*
-		 * XXX Should have an arch-hook for running this after final section
-		 *     addresses have been selected...
-		 */
-		uint64_t gp;
-		struct module_memory *mod_mem;
-
-		mod_mem = &mod->mem[MOD_DATA];
-		if (mod_mem->size > MAX_LTOFF)
-			/*
-			 * This takes advantage of fact that SHF_ARCH_SMALL gets allocated
-			 * at the end of the module.
-			 */
-			gp = mod_mem->size - MAX_LTOFF / 2;
-		else
-			gp = mod_mem->size / 2;
-		gp = (uint64_t) mod_mem->base + ((gp + 7) & -8);
-		mod->arch.gp = gp;
-		DEBUGP("%s: placing gp at 0x%lx\n", __func__, gp);
-	}
-
-	for (i = 0; i < n; i++) {
-		ret = do_reloc(mod, ELF64_R_TYPE(rela[i].r_info),
-			       ((Elf64_Sym *) sechdrs[symindex].sh_addr
-				+ ELF64_R_SYM(rela[i].r_info)),
-			       rela[i].r_addend, target_sec,
-			       (void *) target_sec->sh_addr + rela[i].r_offset);
-		if (ret < 0)
-			return ret;
-	}
-	return 0;
-}
-
-/*
- * Modules contain a single unwind table which covers both the core and the init text
- * sections but since the two are not contiguous, we need to split this table up such that
- * we can register (and unregister) each "segment" separately.  Fortunately, this sounds
- * more complicated than it really is.
- */
-static void
-register_unwind_table (struct module *mod)
-{
-	struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr;
-	struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start);
-	struct unw_table_entry *e1, *e2, *core, *init;
-	unsigned long num_init = 0, num_core = 0;
-
-	/* First, count how many init and core unwind-table entries there are.  */
-	for (e1 = start; e1 < end; ++e1)
-		if (in_init(mod, e1->start_offset))
-			++num_init;
-		else
-			++num_core;
-	/*
-	 * Second, sort the table such that all unwind-table entries for the init and core
-	 * text sections are nicely separated.  We do this with a stupid bubble sort
-	 * (unwind tables don't get ridiculously huge).
-	 */
-	for (e1 = start; e1 < end; ++e1) {
-		for (e2 = e1 + 1; e2 < end; ++e2) {
-			if (e2->start_offset < e1->start_offset) {
-				swap(*e1, *e2);
-			}
-		}
-	}
-	/*
-	 * Third, locate the init and core segments in the unwind table:
-	 */
-	if (in_init(mod, start->start_offset)) {
-		init = start;
-		core = start + num_init;
-	} else {
-		core = start;
-		init = start + num_core;
-	}
-
-	DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __func__,
-	       mod->name, mod->arch.gp, num_init, num_core);
-
-	/*
-	 * Fourth, register both tables (if not empty).
-	 */
-	if (num_core > 0) {
-		mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
-								core, core + num_core);
-		DEBUGP("%s:  core: handle=%p [%p-%p)\n", __func__,
-		       mod->arch.core_unw_table, core, core + num_core);
-	}
-	if (num_init > 0) {
-		mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp,
-								init, init + num_init);
-		DEBUGP("%s:  init: handle=%p [%p-%p)\n", __func__,
-		       mod->arch.init_unw_table, init, init + num_init);
-	}
-}
-
-int
-module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod)
-{
-	struct mod_arch_specific *mas = &mod->arch;
-
-	DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
-	if (mas->unwind)
-		register_unwind_table(mod);
-
-	/*
-	 * ".opd" was already relocated to the final destination. Store
-	 * it's address for use in symbolizer.
-	 */
-	mas->opd_addr = (void *)mas->opd->sh_addr;
-	mas->opd_size = mas->opd->sh_size;
-
-	/*
-	 * Module relocation was already done at this point. Section
-	 * headers are about to be deleted. Wipe out load-time context.
-	 */
-	mas->core_plt = NULL;
-	mas->init_plt = NULL;
-	mas->got = NULL;
-	mas->opd = NULL;
-	mas->unwind = NULL;
-	mas->gp = 0;
-	mas->next_got_entry = 0;
-
-	return 0;
-}
-
-void
-module_arch_cleanup (struct module *mod)
-{
-	if (mod->arch.init_unw_table) {
-		unw_remove_unwind_table(mod->arch.init_unw_table);
-		mod->arch.init_unw_table = NULL;
-	}
-	if (mod->arch.core_unw_table) {
-		unw_remove_unwind_table(mod->arch.core_unw_table);
-		mod->arch.core_unw_table = NULL;
-	}
-}
-
-void *dereference_module_function_descriptor(struct module *mod, void *ptr)
-{
-	struct mod_arch_specific *mas = &mod->arch;
-
-	if (ptr < mas->opd_addr || ptr >= mas->opd_addr + mas->opd_size)
-		return ptr;
-
-	return dereference_function_descriptor(ptr);
-}
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
deleted file mode 100644
index 025e5133c860..000000000000
--- a/arch/ia64/kernel/msi_ia64.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MSI hooks for standard x86 apic
- */
-
-#include <linux/pci.h>
-#include <linux/irq.h>
-#include <linux/msi.h>
-#include <linux/dmar.h>
-#include <asm/smp.h>
-#include <asm/msidef.h>
-
-static struct irq_chip	ia64_msi_chip;
-
-#ifdef CONFIG_SMP
-static int ia64_set_msi_irq_affinity(struct irq_data *idata,
-				     const cpumask_t *cpu_mask, bool force)
-{
-	struct msi_msg msg;
-	u32 addr, data;
-	int cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
-	unsigned int irq = idata->irq;
-
-	if (irq_prepare_move(irq, cpu))
-		return -1;
-
-	__get_cached_msi_msg(irq_data_get_msi_desc(idata), &msg);
-
-	addr = msg.address_lo;
-	addr &= MSI_ADDR_DEST_ID_MASK;
-	addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
-	msg.address_lo = addr;
-
-	data = msg.data;
-	data &= MSI_DATA_VECTOR_MASK;
-	data |= MSI_DATA_VECTOR(irq_to_vector(irq));
-	msg.data = data;
-
-	pci_write_msi_msg(irq, &msg);
-	irq_data_update_affinity(idata, cpumask_of(cpu));
-
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
-int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
-{
-	struct msi_msg	msg;
-	unsigned long	dest_phys_id;
-	int	irq, vector;
-
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
-
-	irq_set_msi_desc(irq, desc);
-	dest_phys_id = cpu_physical_id(cpumask_any_and(&(irq_to_domain(irq)),
-						       cpu_online_mask));
-	vector = irq_to_vector(irq);
-
-	msg.address_hi = 0;
-	msg.address_lo =
-		MSI_ADDR_HEADER |
-		MSI_ADDR_DEST_MODE_PHYS |
-		MSI_ADDR_REDIRECTION_CPU |
-		MSI_ADDR_DEST_ID_CPU(dest_phys_id);
-
-	msg.data =
-		MSI_DATA_TRIGGER_EDGE |
-		MSI_DATA_LEVEL_ASSERT |
-		MSI_DATA_DELIVERY_FIXED |
-		MSI_DATA_VECTOR(vector);
-
-	pci_write_msi_msg(irq, &msg);
-	irq_set_chip_and_handler(irq, &ia64_msi_chip, handle_edge_irq);
-
-	return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
-	destroy_irq(irq);
-}
-
-static void ia64_ack_msi_irq(struct irq_data *data)
-{
-	irq_complete_move(data->irq);
-	irq_move_irq(data);
-	ia64_eoi();
-}
-
-static int ia64_msi_retrigger_irq(struct irq_data *data)
-{
-	unsigned int vector = irq_to_vector(data->irq);
-	ia64_resend_irq(vector);
-
-	return 1;
-}
-
-/*
- * Generic ops used on most IA64 platforms.
- */
-static struct irq_chip ia64_msi_chip = {
-	.name			= "PCI-MSI",
-	.irq_mask		= pci_msi_mask_irq,
-	.irq_unmask		= pci_msi_unmask_irq,
-	.irq_ack		= ia64_ack_msi_irq,
-#ifdef CONFIG_SMP
-	.irq_set_affinity	= ia64_set_msi_irq_affinity,
-#endif
-	.irq_retrigger		= ia64_msi_retrigger_irq,
-};
-
-#ifdef CONFIG_INTEL_IOMMU
-#ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(struct irq_data *data,
-				 const struct cpumask *mask, bool force)
-{
-	unsigned int irq = data->irq;
-	struct irq_cfg *cfg = irq_cfg + irq;
-	struct msi_msg msg;
-	int cpu = cpumask_first_and(mask, cpu_online_mask);
-
-	if (irq_prepare_move(irq, cpu))
-		return -1;
-
-	dmar_msi_read(irq, &msg);
-
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
-
-	dmar_msi_write(irq, &msg);
-	irq_data_update_affinity(data, mask);
-
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
-static struct irq_chip dmar_msi_type = {
-	.name = "DMAR_MSI",
-	.irq_unmask = dmar_msi_unmask,
-	.irq_mask = dmar_msi_mask,
-	.irq_ack = ia64_ack_msi_irq,
-#ifdef CONFIG_SMP
-	.irq_set_affinity = dmar_msi_set_affinity,
-#endif
-	.irq_retrigger = ia64_msi_retrigger_irq,
-};
-
-static void
-msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
-	struct irq_cfg *cfg = irq_cfg + irq;
-	unsigned dest;
-
-	dest = cpu_physical_id(cpumask_first_and(&(irq_to_domain(irq)),
-						 cpu_online_mask));
-
-	msg->address_hi = 0;
-	msg->address_lo =
-		MSI_ADDR_HEADER |
-		MSI_ADDR_DEST_MODE_PHYS |
-		MSI_ADDR_REDIRECTION_CPU |
-		MSI_ADDR_DEST_ID_CPU(dest);
-
-	msg->data =
-		MSI_DATA_TRIGGER_EDGE |
-		MSI_DATA_LEVEL_ASSERT |
-		MSI_DATA_DELIVERY_FIXED |
-		MSI_DATA_VECTOR(cfg->vector);
-}
-
-int dmar_alloc_hwirq(int id, int node, void *arg)
-{
-	int irq;
-	struct msi_msg msg;
-
-	irq = create_irq();
-	if (irq > 0) {
-		irq_set_handler_data(irq, arg);
-		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
-					      handle_edge_irq, "edge");
-		msi_compose_msg(NULL, irq, &msg);
-		dmar_msi_write(irq, &msg);
-	}
-
-	return irq;
-}
-
-void dmar_free_hwirq(int irq)
-{
-	irq_set_handler_data(irq, NULL);
-	destroy_irq(irq);
-}
-#endif /* CONFIG_INTEL_IOMMU */
-
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
deleted file mode 100644
index 8a959f20662d..000000000000
--- a/arch/ia64/kernel/numa.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *
- * ia64 kernel NUMA specific stuff
- *
- * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
- * Copyright (C) 2004 Silicon Graphics, Inc.
- *   Jesse Barnes <jbarnes@sgi.com>
- */
-#include <linux/topology.h>
-#include <linux/module.h>
-#include <asm/processor.h>
-#include <asm/smp.h>
-
-u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
-EXPORT_SYMBOL(cpu_to_node_map);
-
-cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
-EXPORT_SYMBOL(node_to_cpu_mask);
-
-void map_cpu_to_node(int cpu, int nid)
-{
-	int oldnid;
-	if (nid < 0) { /* just initialize by zero */
-		cpu_to_node_map[cpu] = 0;
-		return;
-	}
-	/* sanity check first */
-	oldnid = cpu_to_node_map[cpu];
-	if (cpumask_test_cpu(cpu, &node_to_cpu_mask[oldnid])) {
-		return; /* nothing to do */
-	}
-	/* we don't have cpu-driven node hot add yet...
-	   In usual case, node is created from SRAT at boot time. */
-	if (!node_online(nid))
-		nid = first_online_node;
-	cpu_to_node_map[cpu] = nid;
-	cpumask_set_cpu(cpu, &node_to_cpu_mask[nid]);
-	return;
-}
-
-void unmap_cpu_from_node(int cpu, int nid)
-{
-	WARN_ON(!cpumask_test_cpu(cpu, &node_to_cpu_mask[nid]));
-	WARN_ON(cpu_to_node_map[cpu] != nid);
-	cpu_to_node_map[cpu] = 0;
-	cpumask_clear_cpu(cpu, &node_to_cpu_mask[nid]);
-}
-
-
-/**
- * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays
- *
- * Build cpu to node mapping and initialize the per node cpu masks using
- * info from the node_cpuid array handed to us by ACPI.
- */
-void __init build_cpu_to_node_map(void)
-{
-	int cpu, i, node;
-
-	for(node=0; node < MAX_NUMNODES; node++)
-		cpumask_clear(&node_to_cpu_mask[node]);
-
-	for_each_possible_early_cpu(cpu) {
-		node = NUMA_NO_NODE;
-		for (i = 0; i < NR_CPUS; ++i)
-			if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
-				node = node_cpuid[i].nid;
-				break;
-			}
-		map_cpu_to_node(cpu, node);
-	}
-}
diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
deleted file mode 100644
index fb6db6966f70..000000000000
--- a/arch/ia64/kernel/pal.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PAL Firmware support
- * IA-64 Processor Programmers Reference Vol 2
- *
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2001, 2003 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 05/22/2000 eranian Added support for stacked register calls
- * 05/24/2000 eranian Added support for physical mode static calls
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/processor.h>
-
-	.data
-pal_entry_point:
-	data8 ia64_pal_default_handler
-	.text
-
-/*
- * Set the PAL entry point address.  This could be written in C code, but we
- * do it here to keep it all in one module (besides, it's so trivial that it's
- * not a big deal).
- *
- * in0		Address of the PAL entry point (text address, NOT a function
- *		descriptor).
- */
-GLOBAL_ENTRY(ia64_pal_handler_init)
-	alloc r3=ar.pfs,1,0,0,0
-	movl r2=pal_entry_point
-	;;
-	st8 [r2]=in0
-	br.ret.sptk.many rp
-END(ia64_pal_handler_init)
-
-/*
- * Default PAL call handler.  This needs to be coded in assembly because it
- * uses the static calling convention, i.e., the RSE may not be used and
- * calls are done via "br.cond" (not "br.call").
- */
-GLOBAL_ENTRY(ia64_pal_default_handler)
-	mov r8=-1
-	br.cond.sptk.many rp
-END(ia64_pal_default_handler)
-
-/*
- * Make a PAL call using the static calling convention.
- *
- * in0         Index of PAL service
- * in1 - in3   Remaining PAL arguments
- */
-GLOBAL_ENTRY(ia64_pal_call_static)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
-	alloc loc1 = ar.pfs,4,5,0,0
-	movl loc2 = pal_entry_point
-1:	{
-	  mov r28 = in0
-	  mov r29 = in1
-	  mov r8 = ip
-	}
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	adds r8 = 1f-1b,r8
-	mov loc4=ar.rsc			// save RSE configuration
-	;;
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov loc3 = psr
-	mov loc0 = rp
-	.body
-	mov r30 = in2
-
-	mov r31 = in3
-	mov b7 = loc2
-
-	rsm psr.i
-	;;
-	mov rp = r8
-	br.cond.sptk.many b7
-1:	mov psr.l = loc3
-	mov ar.rsc = loc4		// restore RSE configuration
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_static)
-EXPORT_SYMBOL(ia64_pal_call_static)
-
-/*
- * Make a PAL call using the stacked registers calling convention.
- *
- * Inputs:
- *	in0         Index of PAL service
- *	in2 - in3   Remaining PAL arguments
- */
-GLOBAL_ENTRY(ia64_pal_call_stacked)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
-	alloc loc1 = ar.pfs,4,4,4,0
-	movl loc2 = pal_entry_point
-
-	mov r28  = in0			// Index MUST be copied to r28
-	mov out0 = in0			// AND in0 of PAL function
-	mov loc0 = rp
-	.body
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov out1 = in1
-	mov out2 = in2
-	mov out3 = in3
-	mov loc3 = psr
-	;;
-	rsm psr.i
-	mov b7 = loc2
-	;;
-	br.call.sptk.many rp=b7		// now make the call
-.ret0:	mov psr.l  = loc3
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_stacked)
-EXPORT_SYMBOL(ia64_pal_call_stacked)
-
-/*
- * Make a physical mode PAL call using the static registers calling convention.
- *
- * Inputs:
- *	in0         Index of PAL service
- *	in2 - in3   Remaining PAL arguments
- *
- * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel.
- * So we don't need to clear them.
- */
-#define PAL_PSR_BITS_TO_CLEAR						      \
-	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT  | IA64_PSR_DB | IA64_PSR_RT |\
-	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |	      \
-	 IA64_PSR_DFL | IA64_PSR_DFH)
-
-#define PAL_PSR_BITS_TO_SET						      \
-	(IA64_PSR_BN)
-
-
-GLOBAL_ENTRY(ia64_pal_call_phys_static)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
-	alloc loc1 = ar.pfs,4,7,0,0
-	movl loc2 = pal_entry_point
-1:	{
-	  mov r28  = in0		// copy procedure index
-	  mov r8   = ip			// save ip to compute branch
-	  mov loc0 = rp			// save rp
-	}
-	.body
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov r29  = in1			// first argument
-	mov r30  = in2			// copy arg2
-	mov r31  = in3			// copy arg3
-	;;
-	mov loc3 = psr			// save psr
-	adds r8  = 1f-1b,r8		// calculate return address for call
-	;;
-	mov loc4=ar.rsc			// save RSE configuration
-	dep.z loc2=loc2,0,61		// convert pal entry point to physical
-	tpa r8=r8			// convert rp to physical
-	;;
-	mov b7 = loc2			// install target to branch reg
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	movl r16=PAL_PSR_BITS_TO_CLEAR
-	movl r17=PAL_PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17		// add in psr the bits to set
-	;;
-	andcm r16=loc3,r16		// removes bits to clear from psr
-	br.call.sptk.many rp=ia64_switch_mode_phys
-	mov rp = r8			// install return address (physical)
-	mov loc5 = r19
-	mov loc6 = r20
-	br.cond.sptk.many b7
-1:
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3			// r16= original psr
-	mov r19=loc5
-	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-	mov psr.l = loc3		// restore init PSR
-
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	mov ar.rsc=loc4			// restore RSE configuration
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_phys_static)
-EXPORT_SYMBOL(ia64_pal_call_phys_static)
-
-/*
- * Make a PAL call using the stacked registers in physical mode.
- *
- * Inputs:
- *	in0         Index of PAL service
- *	in2 - in3   Remaining PAL arguments
- */
-GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
-	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
-	alloc	loc1 = ar.pfs,5,7,4,0
-	movl	loc2 = pal_entry_point
-1:	{
-	  mov r28  = in0		// copy procedure index
-	  mov loc0 = rp			// save rp
-	}
-	.body
-	;;
-	ld8 loc2 = [loc2]		// loc2 <- entry point
-	mov loc3 = psr			// save psr
-	;;
-	mov loc4=ar.rsc			// save RSE configuration
-	dep.z loc2=loc2,0,61		// convert pal entry point to physical
-	;;
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	movl r16=PAL_PSR_BITS_TO_CLEAR
-	movl r17=PAL_PSR_BITS_TO_SET
-	;;
-	or loc3=loc3,r17		// add in psr the bits to set
-	mov b7 = loc2			// install target to branch reg
-	;;
-	andcm r16=loc3,r16		// removes bits to clear from psr
-	br.call.sptk.many rp=ia64_switch_mode_phys
-
-	mov out0 = in0			// first argument
-	mov out1 = in1			// copy arg2
-	mov out2 = in2			// copy arg3
-	mov out3 = in3			// copy arg3
-	mov loc5 = r19
-	mov loc6 = r20
-
-	br.call.sptk.many rp=b7		// now make the call
-
-	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
-	mov r16=loc3			// r16= original psr
-	mov r19=loc5
-	mov r20=loc6
-	br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
-
-	mov psr.l  = loc3		// restore init PSR
-	mov ar.pfs = loc1
-	mov rp = loc0
-	;;
-	mov ar.rsc=loc4			// restore RSE configuration
-	srlz.d				// serialize restoration of psr.l
-	br.ret.sptk.many b0
-END(ia64_pal_call_phys_stacked)
-EXPORT_SYMBOL(ia64_pal_call_phys_stacked)
-
-/*
- * Save scratch fp scratch regs which aren't saved in pt_regs already
- * (fp10-fp15).
- *
- * NOTE: We need to do this since firmware (SAL and PAL) may use any of the
- * scratch regs fp-low partition.
- *
- * Inputs:
- *      in0	Address of stack storage for fp regs
- */
-GLOBAL_ENTRY(ia64_save_scratch_fpregs)
-	alloc r3=ar.pfs,1,0,0,0
-	add r2=16,in0
-	;;
-	stf.spill [in0] = f10,32
-	stf.spill [r2]  = f11,32
-	;;
-	stf.spill [in0] = f12,32
-	stf.spill [r2]  = f13,32
-	;;
-	stf.spill [in0] = f14,32
-	stf.spill [r2]  = f15,32
-	br.ret.sptk.many rp
-END(ia64_save_scratch_fpregs)
-EXPORT_SYMBOL(ia64_save_scratch_fpregs)
-
-/*
- * Load scratch fp scratch regs (fp10-fp15)
- *
- * Inputs:
- *      in0	Address of stack storage for fp regs
- */
-GLOBAL_ENTRY(ia64_load_scratch_fpregs)
-	alloc r3=ar.pfs,1,0,0,0
-	add r2=16,in0
-	;;
-	ldf.fill  f10 = [in0],32
-	ldf.fill  f11 = [r2],32
-	;;
-	ldf.fill  f12 = [in0],32
-	ldf.fill  f13 = [r2],32
-	;;
-	ldf.fill  f14 = [in0],32
-	ldf.fill  f15 = [r2],32
-	br.ret.sptk.many rp
-END(ia64_load_scratch_fpregs)
-EXPORT_SYMBOL(ia64_load_scratch_fpregs)
diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c
deleted file mode 100644
index b9ae093bfe37..000000000000
--- a/arch/ia64/kernel/palinfo.c
+++ /dev/null
@@ -1,942 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * palinfo.c
- *
- * Prints processor specific information reported by PAL.
- * This code is based on specification of PAL as of the
- * Intel IA-64 Architecture Software Developer's Manual v1.0.
- *
- *
- * Copyright (C) 2000-2001, 2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2004 Intel Corporation
- *  Ashok Raj <ashok.raj@intel.com>
- *
- * 05/26/2000	S.Eranian	initial release
- * 08/21/2000	S.Eranian	updated to July 2000 PAL specs
- * 02/05/2001   S.Eranian	fixed module support
- * 10/23/2001	S.Eranian	updated pal_perf_mon_info bug fixes
- * 03/24/2004	Ashok Raj	updated to work with CPU Hotplug
- * 10/26/2006   Russ Anderson	updated processor features to rev 2.2 spec
- */
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-
-#include <asm/pal.h>
-#include <asm/sal.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <linux/smp.h>
-
-MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
-MODULE_DESCRIPTION("/proc interface to IA-64 PAL");
-MODULE_LICENSE("GPL");
-
-#define PALINFO_VERSION "0.5"
-
-typedef int (*palinfo_func_t)(struct seq_file *);
-
-typedef struct {
-	const char		*name;		/* name of the proc entry */
-	palinfo_func_t		proc_read;	/* function to call for reading */
-	struct proc_dir_entry	*entry;		/* registered entry (removal) */
-} palinfo_entry_t;
-
-
-/*
- *  A bunch of string array to get pretty printing
- */
-
-static const char *cache_types[] = {
-	"",			/* not used */
-	"Instruction",
-	"Data",
-	"Data/Instruction"	/* unified */
-};
-
-static const char *cache_mattrib[]={
-	"WriteThrough",
-	"WriteBack",
-	"",		/* reserved */
-	""		/* reserved */
-};
-
-static const char *cache_st_hints[]={
-	"Temporal, level 1",
-	"Reserved",
-	"Reserved",
-	"Non-temporal, all levels",
-	"Reserved",
-	"Reserved",
-	"Reserved",
-	"Reserved"
-};
-
-static const char *cache_ld_hints[]={
-	"Temporal, level 1",
-	"Non-temporal, level 1",
-	"Reserved",
-	"Non-temporal, all levels",
-	"Reserved",
-	"Reserved",
-	"Reserved",
-	"Reserved"
-};
-
-static const char *rse_hints[]={
-	"enforced lazy",
-	"eager stores",
-	"eager loads",
-	"eager loads and stores"
-};
-
-#define RSE_HINTS_COUNT ARRAY_SIZE(rse_hints)
-
-static const char *mem_attrib[]={
-	"WB",		/* 000 */
-	"SW",		/* 001 */
-	"010",		/* 010 */
-	"011",		/* 011 */
-	"UC",		/* 100 */
-	"UCE",		/* 101 */
-	"WC",		/* 110 */
-	"NaTPage"	/* 111 */
-};
-
-/*
- * Take a 64bit vector and produces a string such that
- * if bit n is set then 2^n in clear text is generated. The adjustment
- * to the right unit is also done.
- *
- * Input:
- *	- a pointer to a buffer to hold the string
- *	- a 64-bit vector
- * Output:
- *	- a pointer to the end of the buffer
- *
- */
-static void bitvector_process(struct seq_file *m, u64 vector)
-{
-	int i,j;
-	static const char *units[]={ "", "K", "M", "G", "T" };
-
-	for (i=0, j=0; i < 64; i++ , j=i/10) {
-		if (vector & 0x1)
-			seq_printf(m, "%d%s ", 1 << (i-j*10), units[j]);
-		vector >>= 1;
-	}
-}
-
-/*
- * Take a 64bit vector and produces a string such that
- * if bit n is set then register n is present. The function
- * takes into account consecutive registers and prints out ranges.
- *
- * Input:
- *	- a pointer to a buffer to hold the string
- *	- a 64-bit vector
- * Ouput:
- *	- a pointer to the end of the buffer
- *
- */
-static void bitregister_process(struct seq_file *m, u64 *reg_info, int max)
-{
-	int i, begin, skip = 0;
-	u64 value = reg_info[0];
-
-	value >>= i = begin = ffs(value) - 1;
-
-	for(; i < max; i++ ) {
-
-		if (i != 0 && (i%64) == 0) value = *++reg_info;
-
-		if ((value & 0x1) == 0 && skip == 0) {
-			if (begin  <= i - 2)
-				seq_printf(m, "%d-%d ", begin, i-1);
-			else
-				seq_printf(m, "%d ", i-1);
-			skip  = 1;
-			begin = -1;
-		} else if ((value & 0x1) && skip == 1) {
-			skip = 0;
-			begin = i;
-		}
-		value >>=1;
-	}
-	if (begin > -1) {
-		if (begin < 127)
-			seq_printf(m, "%d-127", begin);
-		else
-			seq_puts(m, "127");
-	}
-}
-
-static int power_info(struct seq_file *m)
-{
-	s64 status;
-	u64 halt_info_buffer[8];
-	pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer;
-	int i;
-
-	status = ia64_pal_halt_info(halt_info);
-	if (status != 0) return 0;
-
-	for (i=0; i < 8 ; i++ ) {
-		if (halt_info[i].pal_power_mgmt_info_s.im == 1) {
-			seq_printf(m,
-				   "Power level %d:\n"
-				   "\tentry_latency       : %d cycles\n"
-				   "\texit_latency        : %d cycles\n"
-				   "\tpower consumption   : %d mW\n"
-				   "\tCache+TLB coherency : %s\n", i,
-				   halt_info[i].pal_power_mgmt_info_s.entry_latency,
-				   halt_info[i].pal_power_mgmt_info_s.exit_latency,
-				   halt_info[i].pal_power_mgmt_info_s.power_consumption,
-				   halt_info[i].pal_power_mgmt_info_s.co ? "Yes" : "No");
-		} else {
-			seq_printf(m,"Power level %d: not implemented\n", i);
-		}
-	}
-	return 0;
-}
-
-static int cache_info(struct seq_file *m)
-{
-	unsigned long i, levels, unique_caches;
-	pal_cache_config_info_t cci;
-	int j, k;
-	long status;
-
-	if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
-		printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
-		return 0;
-	}
-
-	seq_printf(m, "Cache levels  : %ld\nUnique caches : %ld\n\n",
-		   levels, unique_caches);
-
-	for (i=0; i < levels; i++) {
-		for (j=2; j >0 ; j--) {
-			/* even without unification some level may not be present */
-			if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0)
-				continue;
-
-			seq_printf(m,
-				   "%s Cache level %lu:\n"
-				   "\tSize           : %u bytes\n"
-				   "\tAttributes     : ",
-				   cache_types[j+cci.pcci_unified], i+1,
-				   cci.pcci_cache_size);
-
-			if (cci.pcci_unified)
-				seq_puts(m, "Unified ");
-
-			seq_printf(m, "%s\n", cache_mattrib[cci.pcci_cache_attr]);
-
-			seq_printf(m,
-				   "\tAssociativity  : %d\n"
-				   "\tLine size      : %d bytes\n"
-				   "\tStride         : %d bytes\n",
-				   cci.pcci_assoc,
-				   1<<cci.pcci_line_size,
-				   1<<cci.pcci_stride);
-			if (j == 1)
-				seq_puts(m, "\tStore latency  : N/A\n");
-			else
-				seq_printf(m, "\tStore latency  : %d cycle(s)\n",
-					   cci.pcci_st_latency);
-
-			seq_printf(m,
-				   "\tLoad latency   : %d cycle(s)\n"
-				   "\tStore hints    : ", cci.pcci_ld_latency);
-
-			for(k=0; k < 8; k++ ) {
-				if ( cci.pcci_st_hints & 0x1)
-					seq_printf(m, "[%s]", cache_st_hints[k]);
-				cci.pcci_st_hints >>=1;
-			}
-			seq_puts(m, "\n\tLoad hints     : ");
-
-			for(k=0; k < 8; k++ ) {
-				if (cci.pcci_ld_hints & 0x1)
-					seq_printf(m, "[%s]", cache_ld_hints[k]);
-				cci.pcci_ld_hints >>=1;
-			}
-			seq_printf(m,
-				   "\n\tAlias boundary : %d byte(s)\n"
-				   "\tTag LSB        : %d\n"
-				   "\tTag MSB        : %d\n",
-				   1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb,
-				   cci.pcci_tag_msb);
-
-			/* when unified, data(j=2) is enough */
-			if (cci.pcci_unified)
-				break;
-		}
-	}
-	return 0;
-}
-
-
-static int vm_info(struct seq_file *m)
-{
-	u64 tr_pages =0, vw_pages=0, tc_pages;
-	u64 attrib;
-	pal_vm_info_1_u_t vm_info_1;
-	pal_vm_info_2_u_t vm_info_2;
-	pal_tc_info_u_t	tc_info;
-	ia64_ptce_info_t ptce;
-	const char *sep;
-	int i, j;
-	long status;
-
-	if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
-		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
-	} else {
-
-		seq_printf(m,
-		     "Physical Address Space         : %d bits\n"
-		     "Virtual Address Space          : %d bits\n"
-		     "Protection Key Registers(PKR)  : %d\n"
-		     "Implemented bits in PKR.key    : %d\n"
-		     "Hash Tag ID                    : 0x%x\n"
-		     "Size of RR.rid                 : %d\n"
-		     "Max Purges                     : ",
-		     vm_info_1.pal_vm_info_1_s.phys_add_size,
-		     vm_info_2.pal_vm_info_2_s.impl_va_msb+1,
-		     vm_info_1.pal_vm_info_1_s.max_pkr+1,
-		     vm_info_1.pal_vm_info_1_s.key_size,
-		     vm_info_1.pal_vm_info_1_s.hash_tag_id,
-		     vm_info_2.pal_vm_info_2_s.rid_size);
-		if (vm_info_2.pal_vm_info_2_s.max_purges == PAL_MAX_PURGES)
-			seq_puts(m, "unlimited\n");
-		else
-			seq_printf(m, "%d\n",
-		     		vm_info_2.pal_vm_info_2_s.max_purges ?
-				vm_info_2.pal_vm_info_2_s.max_purges : 1);
-	}
-
-	if (ia64_pal_mem_attrib(&attrib) == 0) {
-		seq_puts(m, "Supported memory attributes    : ");
-		sep = "";
-		for (i = 0; i < 8; i++) {
-			if (attrib & (1 << i)) {
-				seq_printf(m, "%s%s", sep, mem_attrib[i]);
-				sep = ", ";
-			}
-		}
-		seq_putc(m, '\n');
-	}
-
-	if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) {
-		printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status);
-	} else {
-
-		seq_printf(m,
-			   "\nTLB walker                     : %simplemented\n"
-			   "Number of DTR                  : %d\n"
-			   "Number of ITR                  : %d\n"
-			   "TLB insertable page sizes      : ",
-			   vm_info_1.pal_vm_info_1_s.vw ? "" : "not ",
-			   vm_info_1.pal_vm_info_1_s.max_dtr_entry+1,
-			   vm_info_1.pal_vm_info_1_s.max_itr_entry+1);
-
-		bitvector_process(m, tr_pages);
-
-		seq_puts(m, "\nTLB purgeable page sizes       : ");
-
-		bitvector_process(m, vw_pages);
-	}
-
-	if ((status = ia64_get_ptce(&ptce)) != 0) {
-		printk(KERN_ERR "ia64_get_ptce=%ld\n", status);
-	} else {
-		seq_printf(m,
-		     "\nPurge base address             : 0x%016lx\n"
-		     "Purge outer loop count         : %d\n"
-		     "Purge inner loop count         : %d\n"
-		     "Purge outer loop stride        : %d\n"
-		     "Purge inner loop stride        : %d\n",
-		     ptce.base, ptce.count[0], ptce.count[1],
-		     ptce.stride[0], ptce.stride[1]);
-
-		seq_printf(m,
-		     "TC Levels                      : %d\n"
-		     "Unique TC(s)                   : %d\n",
-		     vm_info_1.pal_vm_info_1_s.num_tc_levels,
-		     vm_info_1.pal_vm_info_1_s.max_unique_tcs);
-
-		for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) {
-			for (j=2; j>0 ; j--) {
-				tc_pages = 0; /* just in case */
-
-				/* even without unification, some levels may not be present */
-				if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0)
-					continue;
-
-				seq_printf(m,
-				     "\n%s Translation Cache Level %d:\n"
-				     "\tHash sets           : %d\n"
-				     "\tAssociativity       : %d\n"
-				     "\tNumber of entries   : %d\n"
-				     "\tFlags               : ",
-				     cache_types[j+tc_info.tc_unified], i+1,
-				     tc_info.tc_num_sets,
-				     tc_info.tc_associativity,
-				     tc_info.tc_num_entries);
-
-				if (tc_info.tc_pf)
-					seq_puts(m, "PreferredPageSizeOptimized ");
-				if (tc_info.tc_unified)
-					seq_puts(m, "Unified ");
-				if (tc_info.tc_reduce_tr)
-					seq_puts(m, "TCReduction");
-
-				seq_puts(m, "\n\tSupported page sizes: ");
-
-				bitvector_process(m, tc_pages);
-
-				/* when unified date (j=2) is enough */
-				if (tc_info.tc_unified)
-					break;
-			}
-		}
-	}
-
-	seq_putc(m, '\n');
-	return 0;
-}
-
-
-static int register_info(struct seq_file *m)
-{
-	u64 reg_info[2];
-	u64 info;
-	unsigned long phys_stacked;
-	pal_hints_u_t hints;
-	unsigned long iregs, dregs;
-	static const char * const info_type[] = {
-		"Implemented AR(s)",
-		"AR(s) with read side-effects",
-		"Implemented CR(s)",
-		"CR(s) with read side-effects",
-	};
-
-	for(info=0; info < 4; info++) {
-		if (ia64_pal_register_info(info, &reg_info[0], &reg_info[1]) != 0)
-			return 0;
-		seq_printf(m, "%-32s : ", info_type[info]);
-		bitregister_process(m, reg_info, 128);
-		seq_putc(m, '\n');
-	}
-
-	if (ia64_pal_rse_info(&phys_stacked, &hints) == 0)
-		seq_printf(m,
-			   "RSE stacked physical registers   : %ld\n"
-			   "RSE load/store hints             : %ld (%s)\n",
-			   phys_stacked, hints.ph_data,
-			   hints.ph_data < RSE_HINTS_COUNT ? rse_hints[hints.ph_data]: "(??)");
-
-	if (ia64_pal_debug_info(&iregs, &dregs))
-		return 0;
-
-	seq_printf(m,
-		   "Instruction debug register pairs : %ld\n"
-		   "Data debug register pairs        : %ld\n", iregs, dregs);
-
-	return 0;
-}
-
-static const char *const proc_features_0[]={		/* Feature set 0 */
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL,
-	"Unimplemented instruction address fault",
-	"INIT, PMI, and LINT pins",
-	"Simple unimplemented instr addresses",
-	"Variable P-state performance",
-	"Virtual machine features implemented",
-	"XIP,XPSR,XFS implemented",
-	"XR1-XR3 implemented",
-	"Disable dynamic predicate prediction",
-	"Disable processor physical number",
-	"Disable dynamic data cache prefetch",
-	"Disable dynamic inst cache prefetch",
-	"Disable dynamic branch prediction",
-	NULL, NULL, NULL, NULL,
-	"Disable P-states",
-	"Enable MCA on Data Poisoning",
-	"Enable vmsw instruction",
-	"Enable extern environmental notification",
-	"Disable BINIT on processor time-out",
-	"Disable dynamic power management (DPM)",
-	"Disable coherency",
-	"Disable cache",
-	"Enable CMCI promotion",
-	"Enable MCA to BINIT promotion",
-	"Enable MCA promotion",
-	"Enable BERR promotion"
-};
-
-static const char *const proc_features_16[]={		/* Feature set 16 */
-	"Disable ETM",
-	"Enable ETM",
-	"Enable MCA on half-way timer",
-	"Enable snoop WC",
-	NULL,
-	"Enable Fast Deferral",
-	"Disable MCA on memory aliasing",
-	"Enable RSB",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	"DP system processor",
-	"Low Voltage",
-	"HT supported",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL
-};
-
-static const char *const *const proc_features[]={
-	proc_features_0,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL,
-	proc_features_16,
-	NULL, NULL, NULL, NULL,
-};
-
-static void feature_set_info(struct seq_file *m, u64 avail, u64 status, u64 control,
-			     unsigned long set)
-{
-	const char *const *vf, *const *v;
-	int i;
-
-	vf = v = proc_features[set];
-	for(i=0; i < 64; i++, avail >>=1, status >>=1, control >>=1) {
-
-		if (!(control))		/* No remaining bits set */
-			break;
-		if (!(avail & 0x1))	/* Print only bits that are available */
-			continue;
-		if (vf)
-			v = vf + i;
-		if ( v && *v ) {
-			seq_printf(m, "%-40s : %s %s\n", *v,
-				avail & 0x1 ? (status & 0x1 ?
-					      "On " : "Off"): "",
-				avail & 0x1 ? (control & 0x1 ?
-						"Ctrl" : "NoCtrl"): "");
-		} else {
-			seq_printf(m, "Feature set %2ld bit %2d\t\t\t"
-					" : %s %s\n",
-				set, i,
-				avail & 0x1 ? (status & 0x1 ?
-						"On " : "Off"): "",
-				avail & 0x1 ? (control & 0x1 ?
-						"Ctrl" : "NoCtrl"): "");
-		}
-	}
-}
-
-static int processor_info(struct seq_file *m)
-{
-	u64 avail=1, status=1, control=1, feature_set=0;
-	s64 ret;
-
-	do {
-		ret = ia64_pal_proc_get_features(&avail, &status, &control,
-						feature_set);
-		if (ret < 0)
-			return 0;
-
-		if (ret == 1) {
-			feature_set++;
-			continue;
-		}
-
-		feature_set_info(m, avail, status, control, feature_set);
-		feature_set++;
-	} while(1);
-
-	return 0;
-}
-
-static const char *const bus_features[]={
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL,
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-	NULL,NULL,
-	"Request  Bus Parking",
-	"Bus Lock Mask",
-	"Enable Half Transfer",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL,
-	"Enable Cache Line Repl. Shared",
-	"Enable Cache Line Repl. Exclusive",
-	"Disable Transaction Queuing",
-	"Disable Response Error Checking",
-	"Disable Bus Error Checking",
-	"Disable Bus Requester Internal Error Signalling",
-	"Disable Bus Requester Error Signalling",
-	"Disable Bus Initialization Event Checking",
-	"Disable Bus Initialization Event Signalling",
-	"Disable Bus Address Error Checking",
-	"Disable Bus Address Error Signalling",
-	"Disable Bus Data Error Checking"
-};
-
-
-static int bus_info(struct seq_file *m)
-{
-	const char *const *v = bus_features;
-	pal_bus_features_u_t av, st, ct;
-	u64 avail, status, control;
-	int i;
-	s64 ret;
-
-	if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0)
-		return 0;
-
-	avail   = av.pal_bus_features_val;
-	status  = st.pal_bus_features_val;
-	control = ct.pal_bus_features_val;
-
-	for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) {
-		if ( ! *v )
-			continue;
-		seq_printf(m, "%-48s : %s%s %s\n", *v,
-			   avail & 0x1 ? "" : "NotImpl",
-			   avail & 0x1 ? (status  & 0x1 ? "On" : "Off"): "",
-			   avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): "");
-	}
-	return 0;
-}
-
-static int version_info(struct seq_file *m)
-{
-	pal_version_u_t min_ver, cur_ver;
-
-	if (ia64_pal_version(&min_ver, &cur_ver) != 0)
-		return 0;
-
-	seq_printf(m,
-		   "PAL_vendor : 0x%02x (min=0x%02x)\n"
-		   "PAL_A      : %02x.%02x (min=%02x.%02x)\n"
-		   "PAL_B      : %02x.%02x (min=%02x.%02x)\n",
-		   cur_ver.pal_version_s.pv_pal_vendor,
-		   min_ver.pal_version_s.pv_pal_vendor,
-		   cur_ver.pal_version_s.pv_pal_a_model,
-		   cur_ver.pal_version_s.pv_pal_a_rev,
-		   min_ver.pal_version_s.pv_pal_a_model,
-		   min_ver.pal_version_s.pv_pal_a_rev,
-		   cur_ver.pal_version_s.pv_pal_b_model,
-		   cur_ver.pal_version_s.pv_pal_b_rev,
-		   min_ver.pal_version_s.pv_pal_b_model,
-		   min_ver.pal_version_s.pv_pal_b_rev);
-	return 0;
-}
-
-static int frequency_info(struct seq_file *m)
-{
-	struct pal_freq_ratio proc, itc, bus;
-	unsigned long base;
-
-	if (ia64_pal_freq_base(&base) == -1)
-		seq_puts(m, "Output clock            : not implemented\n");
-	else
-		seq_printf(m, "Output clock            : %ld ticks/s\n", base);
-
-	if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0;
-
-	seq_printf(m,
-		     "Processor/Clock ratio   : %d/%d\n"
-		     "Bus/Clock ratio         : %d/%d\n"
-		     "ITC/Clock ratio         : %d/%d\n",
-		     proc.num, proc.den, bus.num, bus.den, itc.num, itc.den);
-	return 0;
-}
-
-static int tr_info(struct seq_file *m)
-{
-	long status;
-	pal_tr_valid_u_t tr_valid;
-	u64 tr_buffer[4];
-	pal_vm_info_1_u_t vm_info_1;
-	pal_vm_info_2_u_t vm_info_2;
-	unsigned long i, j;
-	unsigned long max[3], pgm;
-	struct ifa_reg {
-		unsigned long valid:1;
-		unsigned long ig:11;
-		unsigned long vpn:52;
-	} *ifa_reg;
-	struct itir_reg {
-		unsigned long rv1:2;
-		unsigned long ps:6;
-		unsigned long key:24;
-		unsigned long rv2:32;
-	} *itir_reg;
-	struct gr_reg {
-		unsigned long p:1;
-		unsigned long rv1:1;
-		unsigned long ma:3;
-		unsigned long a:1;
-		unsigned long d:1;
-		unsigned long pl:2;
-		unsigned long ar:3;
-		unsigned long ppn:38;
-		unsigned long rv2:2;
-		unsigned long ed:1;
-		unsigned long ig:11;
-	} *gr_reg;
-	struct rid_reg {
-		unsigned long ig1:1;
-		unsigned long rv1:1;
-		unsigned long ig2:6;
-		unsigned long rid:24;
-		unsigned long rv2:32;
-	} *rid_reg;
-
-	if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) {
-		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
-		return 0;
-	}
-	max[0] = vm_info_1.pal_vm_info_1_s.max_itr_entry+1;
-	max[1] = vm_info_1.pal_vm_info_1_s.max_dtr_entry+1;
-
-	for (i=0; i < 2; i++ ) {
-		for (j=0; j < max[i]; j++) {
-
-		status = ia64_pal_tr_read(j, i, tr_buffer, &tr_valid);
-		if (status != 0) {
-			printk(KERN_ERR "palinfo: pal call failed on tr[%lu:%lu]=%ld\n",
-			       i, j, status);
-			continue;
-		}
-
-		ifa_reg  = (struct ifa_reg *)&tr_buffer[2];
-
-		if (ifa_reg->valid == 0)
-			continue;
-
-		gr_reg   = (struct gr_reg *)tr_buffer;
-		itir_reg = (struct itir_reg *)&tr_buffer[1];
-		rid_reg  = (struct rid_reg *)&tr_buffer[3];
-
-		pgm	 = -1 << (itir_reg->ps - 12);
-		seq_printf(m,
-			   "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n"
-			   "\tppn  : 0x%lx\n"
-			   "\tvpn  : 0x%lx\n"
-			   "\tps   : ",
-			   "ID"[i], j,
-			   tr_valid.pal_tr_valid_s.access_rights_valid,
-			   tr_valid.pal_tr_valid_s.priv_level_valid,
-			   tr_valid.pal_tr_valid_s.dirty_bit_valid,
-			   tr_valid.pal_tr_valid_s.mem_attr_valid,
-			   (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12);
-
-		bitvector_process(m, 1<< itir_reg->ps);
-
-		seq_printf(m,
-			   "\n\tpl   : %d\n"
-			   "\tar   : %d\n"
-			   "\trid  : %x\n"
-			   "\tp    : %d\n"
-			   "\tma   : %d\n"
-			   "\td    : %d\n",
-			   gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma,
-			   gr_reg->d);
-		}
-	}
-	return 0;
-}
-
-
-
-/*
- * List {name,function} pairs for every entry in /proc/palinfo/cpu*
- */
-static const palinfo_entry_t palinfo_entries[]={
-	{ "version_info",	version_info, },
-	{ "vm_info",		vm_info, },
-	{ "cache_info",		cache_info, },
-	{ "power_info",		power_info, },
-	{ "register_info",	register_info, },
-	{ "processor_info",	processor_info, },
-	{ "frequency_info",	frequency_info, },
-	{ "bus_info",		bus_info },
-	{ "tr_info",		tr_info, }
-};
-
-#define NR_PALINFO_ENTRIES	(int) ARRAY_SIZE(palinfo_entries)
-
-static struct proc_dir_entry *palinfo_dir;
-
-/*
- * This data structure is used to pass which cpu,function is being requested
- * It must fit in a 64bit quantity to be passed to the proc callback routine
- *
- * In SMP mode, when we get a request for another CPU, we must call that
- * other CPU using IPI and wait for the result before returning.
- */
-typedef union {
-	u64 value;
-	struct {
-		unsigned	req_cpu: 32;	/* for which CPU this info is */
-		unsigned	func_id: 32;	/* which function is requested */
-	} pal_func_cpu;
-} pal_func_cpu_u_t;
-
-#define req_cpu	pal_func_cpu.req_cpu
-#define func_id pal_func_cpu.func_id
-
-#ifdef CONFIG_SMP
-
-/*
- * used to hold information about final function to call
- */
-typedef struct {
-	palinfo_func_t	func;	/* pointer to function to call */
-	struct seq_file *m;	/* buffer to store results */
-	int		ret;	/* return value from call */
-} palinfo_smp_data_t;
-
-
-/*
- * this function does the actual final call and he called
- * from the smp code, i.e., this is the palinfo callback routine
- */
-static void
-palinfo_smp_call(void *info)
-{
-	palinfo_smp_data_t *data = (palinfo_smp_data_t *)info;
-	data->ret = (*data->func)(data->m);
-}
-
-/*
- * function called to trigger the IPI, we need to access a remote CPU
- * Return:
- *	0 : error or nothing to output
- *	otherwise how many bytes in the "page" buffer were written
- */
-static
-int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f)
-{
-	palinfo_smp_data_t ptr;
-	int ret;
-
-	ptr.func = palinfo_entries[f->func_id].proc_read;
-	ptr.m = m;
-	ptr.ret  = 0; /* just in case */
-
-
-	/* will send IPI to other CPU and wait for completion of remote call */
-	if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 1))) {
-		printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: "
-		       "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret);
-		return 0;
-	}
-	return ptr.ret;
-}
-#else /* ! CONFIG_SMP */
-static
-int palinfo_handle_smp(struct seq_file *m, pal_func_cpu_u_t *f)
-{
-	printk(KERN_ERR "palinfo: should not be called with non SMP kernel\n");
-	return 0;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * Entry point routine: all calls go through this function
- */
-static int proc_palinfo_show(struct seq_file *m, void *v)
-{
-	pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&m->private;
-
-	/*
-	 * in SMP mode, we may need to call another CPU to get correct
-	 * information. PAL, by definition, is processor specific
-	 */
-	if (f->req_cpu == get_cpu())
-		(*palinfo_entries[f->func_id].proc_read)(m);
-	else
-		palinfo_handle_smp(m, f);
-
-	put_cpu();
-	return 0;
-}
-
-static int palinfo_add_proc(unsigned int cpu)
-{
-	pal_func_cpu_u_t f;
-	struct proc_dir_entry *cpu_dir;
-	int j;
-	char cpustr[3+4+1];	/* cpu numbers are up to 4095 on itanic */
-	sprintf(cpustr, "cpu%d", cpu);
-
-	cpu_dir = proc_mkdir(cpustr, palinfo_dir);
-	if (!cpu_dir)
-		return -EINVAL;
-
-	f.req_cpu = cpu;
-
-	for (j=0; j < NR_PALINFO_ENTRIES; j++) {
-		f.func_id = j;
-		proc_create_single_data(palinfo_entries[j].name, 0, cpu_dir,
-				proc_palinfo_show, (void *)f.value);
-	}
-	return 0;
-}
-
-static int palinfo_del_proc(unsigned int hcpu)
-{
-	char cpustr[3+4+1];	/* cpu numbers are up to 4095 on itanic */
-
-	sprintf(cpustr, "cpu%d", hcpu);
-	remove_proc_subtree(cpustr, palinfo_dir);
-	return 0;
-}
-
-static enum cpuhp_state hp_online;
-
-static int __init palinfo_init(void)
-{
-	int i = 0;
-
-	printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION);
-	palinfo_dir = proc_mkdir("pal", NULL);
-	if (!palinfo_dir)
-		return -ENOMEM;
-
-	i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/palinfo:online",
-			      palinfo_add_proc, palinfo_del_proc);
-	if (i < 0) {
-		remove_proc_subtree("pal", NULL);
-		return i;
-	}
-	hp_online = i;
-	return 0;
-}
-
-static void __exit palinfo_exit(void)
-{
-	cpuhp_remove_state(hp_online);
-	remove_proc_subtree("pal", NULL);
-}
-
-module_init(palinfo_init);
-module_exit(palinfo_exit);
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
deleted file mode 100644
index 7f21a8c57ed7..000000000000
--- a/arch/ia64/kernel/patch.c
+++ /dev/null
@@ -1,237 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Instruction-patching support.
- *
- * Copyright (C) 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/init.h>
-#include <linux/string.h>
-
-#include <asm/patch.h>
-#include <asm/processor.h>
-#include <asm/sections.h>
-#include <asm/unistd.h>
-
-/*
- * This was adapted from code written by Tony Luck:
- *
- * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle
- * like this:
- *
- * 6  6         5         4         3         2         1
- * 3210987654321098765432109876543210987654321098765432109876543210
- * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG
- *
- * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
- * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB
- */
-static u64
-get_imm64 (u64 insn_addr)
-{
-	u64 *p = (u64 *) (insn_addr & -16);	/* mask out slot number */
-
-	return ( (p[1] & 0x0800000000000000UL) << 4)  | /*A*/
-		((p[1] & 0x00000000007fffffUL) << 40) | /*B*/
-		((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/
-		((p[1] & 0x0000100000000000UL) >> 23) | /*D*/
-		((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/
-		((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/
-		((p[1] & 0x000007f000000000UL) >> 36);  /*G*/
-}
-
-/* Patch instruction with "val" where "mask" has 1 bits. */
-void
-ia64_patch (u64 insn_addr, u64 mask, u64 val)
-{
-	u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16);
-#	define insn_mask ((1UL << 41) - 1)
-	unsigned long shift;
-
-	b0 = b[0]; b1 = b[1];
-	shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */
-	if (shift >= 64) {
-		m1 = mask << (shift - 64);
-		v1 = val << (shift - 64);
-	} else {
-		m0 = mask << shift; m1 = mask >> (64 - shift);
-		v0 = val  << shift; v1 = val >> (64 - shift);
-		b[0] = (b0 & ~m0) | (v0 & m0);
-	}
-	b[1] = (b1 & ~m1) | (v1 & m1);
-}
-
-void
-ia64_patch_imm64 (u64 insn_addr, u64 val)
-{
-	/* The assembler may generate offset pointing to either slot 1
-	   or slot 2 for a long (2-slot) instruction, occupying slots 1
-	   and 2.  */
-  	insn_addr &= -16UL;
-	ia64_patch(insn_addr + 2,
-		   0x01fffefe000UL, (  ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */
-				     | ((val & 0x0000000000200000UL) <<  0) /* bit 21 -> 21 */
-				     | ((val & 0x00000000001f0000UL) <<  6) /* bit 16 -> 22 */
-				     | ((val & 0x000000000000ff80UL) << 20) /* bit  7 -> 27 */
-				     | ((val & 0x000000000000007fUL) << 13) /* bit  0 -> 13 */));
-	ia64_patch(insn_addr + 1, 0x1ffffffffffUL, val >> 22);
-}
-
-void
-ia64_patch_imm60 (u64 insn_addr, u64 val)
-{
-	/* The assembler may generate offset pointing to either slot 1
-	   or slot 2 for a long (2-slot) instruction, occupying slots 1
-	   and 2.  */
-  	insn_addr &= -16UL;
-	ia64_patch(insn_addr + 2,
-		   0x011ffffe000UL, (  ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */
-				     | ((val & 0x00000000000fffffUL) << 13) /* bit  0 -> 13 */));
-	ia64_patch(insn_addr + 1, 0x1fffffffffcUL, val >> 18);
-}
-
-/*
- * We need sometimes to load the physical address of a kernel
- * object.  Often we can convert the virtual address to physical
- * at execution time, but sometimes (either for performance reasons
- * or during error recovery) we cannot to this.  Patch the marked
- * bundles to load the physical address.
- */
-void __init
-ia64_patch_vtop (unsigned long start, unsigned long end)
-{
-	s32 *offp = (s32 *) start;
-	u64 ip;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) offp + *offp;
-
-		/* replace virtual address with corresponding physical address: */
-		ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip)));
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-/*
- * Disable the RSE workaround by turning the conditional branch
- * that we tagged in each place the workaround was used into an
- * unconditional branch.
- */
-void __init
-ia64_patch_rse (unsigned long start, unsigned long end)
-{
-	s32 *offp = (s32 *) start;
-	u64 ip, *b;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) offp + *offp;
-
-		b = (u64 *)(ip & -16);
-		b[1] &= ~0xf800000L;
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-void __init
-ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
-{
-	static int first_time = 1;
-	int need_workaround;
-	s32 *offp = (s32 *) start;
-	u64 *wp;
-
-	need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0);
-
-	if (first_time) {
-		first_time = 0;
-		if (need_workaround)
-			printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n");
-	}
-	if (need_workaround)
-		return;
-
-	while (offp < (s32 *) end) {
-		wp = (u64 *) ia64_imva((char *) offp + *offp);
-		wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */
-		wp[1] = 0x0084006880000200UL;
-		wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */
-		wp[3] = 0x0004000000000200UL;
-		ia64_fc(wp); ia64_fc(wp + 2);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-static void __init
-patch_fsyscall_table (unsigned long start, unsigned long end)
-{
-	extern unsigned long fsyscall_table[NR_syscalls];
-	s32 *offp = (s32 *) start;
-	u64 ip;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) ia64_imva((char *) offp + *offp);
-		ia64_patch_imm64(ip, (u64) fsyscall_table);
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-static void __init
-patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
-{
-	extern char fsys_bubble_down[];
-	s32 *offp = (s32 *) start;
-	u64 ip;
-
-	while (offp < (s32 *) end) {
-		ip = (u64) offp + *offp;
-		ia64_patch_imm60((u64) ia64_imva((void *) ip),
-				 (u64) (fsys_bubble_down - (ip & -16)) / 16);
-		ia64_fc((void *) ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-void __init
-ia64_patch_gate (void)
-{
-#	define START(name)	((unsigned long) __start_gate_##name##_patchlist)
-#	define END(name)	((unsigned long)__end_gate_##name##_patchlist)
-
-	patch_fsyscall_table(START(fsyscall), END(fsyscall));
-	patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
-	ia64_patch_vtop(START(vtop), END(vtop));
-	ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
-}
-
-void ia64_patch_phys_stack_reg(unsigned long val)
-{
-	s32 * offp = (s32 *) __start___phys_stack_reg_patchlist;
-	s32 * end = (s32 *) __end___phys_stack_reg_patchlist;
-	u64 ip, mask, imm;
-
-	/* see instruction format A4: adds r1 = imm13, r3 */
-	mask = (0x3fUL << 27) | (0x7f << 13);
-	imm = (((val >> 7) & 0x3f) << 27) | (val & 0x7f) << 13;
-
-	while (offp < end) {
-		ip = (u64) offp + *offp;
-		ia64_patch(ip, mask, imm);
-		ia64_fc((void *)ip);
-		++offp;
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c
deleted file mode 100644
index c90221733c6b..000000000000
--- a/arch/ia64/kernel/pci-dma.c
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Dynamic DMA mapping support.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-#include <linux/dmar.h>
-#include <asm/iommu.h>
-#include <linux/dma-mapping.h>
-#include <linux/kernel.h>
-#include <asm/page.h>
-
-int no_iommu __read_mostly;
-#ifdef CONFIG_IOMMU_DEBUG
-int force_iommu __read_mostly = 1;
-#else
-int force_iommu __read_mostly;
-#endif
-
-static int __init pci_iommu_init(void)
-{
-	if (iommu_detected)
-		intel_iommu_init();
-
-	return 0;
-}
-
-/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h
deleted file mode 100644
index dbd04028aafa..000000000000
--- a/arch/ia64/kernel/perfmon_itanium.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This file contains the Itanium PMU register description tables
- * and pmc checker.
- *
- * Copyright (C) 2002-2003  Hewlett Packard Co
- *               Stephane Eranian <eranian@hpl.hp.com>
- */
-static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
-
-static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={
-/* pmc0  */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc1  */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc2  */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc3  */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc4  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc5  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc6  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc7  */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc8  */ { PFM_REG_CONFIG  , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc9  */ { PFM_REG_CONFIG  , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-/* pmc13 */ { PFM_REG_CONFIG  , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
-	    { PFM_REG_END     , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
-};
-
-static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={
-/* pmd0  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
-/* pmd1  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
-/* pmd2  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
-/* pmd3  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
-/* pmd4  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
-/* pmd5  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
-/* pmd6  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
-/* pmd7  */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
-/* pmd8  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd9  */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd10 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd11 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd12 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd13 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd14 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd15 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd16 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
-/* pmd17 */ { PFM_REG_BUFFER  , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
-	    { PFM_REG_END     , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
-};
-
-static int
-pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
-{
-	int ret;
-	int is_loaded;
-
-	/* sanitfy check */
-	if (ctx == NULL) return -EINVAL;
-
-	is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
-
-	/*
-	 * we must clear the (instruction) debug registers if pmc13.ta bit is cleared
-	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
-	 */
-	if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) {
-
-		DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val));
-
-		/* don't mix debug with perfmon */
-		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
-
-		/*
-		 * a count of 0 will mark the debug registers as in use and also
-		 * ensure that they are properly cleared.
-		 */
-		ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs);
-		if (ret) return ret;
-	}
-
-	/*
-	 * we must clear the (data) debug registers if pmc11.pt bit is cleared
-	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
-	 */
-	if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) {
-
-		DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val));
-
-		/* don't mix debug with perfmon */
-		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
-
-		/*
-		 * a count of 0 will mark the debug registers as in use and also
-		 * ensure that they are properly cleared.
-		 */
-		ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs);
-		if (ret) return ret;
-	}
-	return 0;
-}
-
-/*
- * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
- */
-static pmu_config_t pmu_conf_ita={
-	.pmu_name      = "Itanium",
-	.pmu_family    = 0x7,
-	.ovfl_val      = (1UL << 32) - 1,
-	.pmd_desc      = pfm_ita_pmd_desc,
-	.pmc_desc      = pfm_ita_pmc_desc,
-	.num_ibrs      = 8,
-	.num_dbrs      = 8,
-	.use_rr_dbregs = 1, /* debug register are use for range retrictions */
-};
-
-
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
deleted file mode 100644
index 9a5cd9fad3a9..000000000000
--- a/arch/ia64/kernel/process.c
+++ /dev/null
@@ -1,611 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific setup.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * 04/11/17 Ashok Raj	<ashok.raj@intel.com> Added CPU Hotplug Support
- *
- * 2005-10-07 Keith Owens <kaos@sgi.com>
- *	      Add notify_die() hooks.
- */
-#include <linux/cpu.h>
-#include <linux/pm.h>
-#include <linux/elf.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/notifier.h>
-#include <linux/personality.h>
-#include <linux/reboot.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/stddef.h>
-#include <linux/thread_info.h>
-#include <linux/unistd.h>
-#include <linux/efi.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/kdebug.h>
-#include <linux/utsname.h>
-#include <linux/resume_user_mode.h>
-#include <linux/rcupdate.h>
-
-#include <asm/cpu.h>
-#include <asm/delay.h>
-#include <asm/elf.h>
-#include <asm/irq.h>
-#include <asm/kexec.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/switch_to.h>
-#include <asm/tlbflush.h>
-#include <linux/uaccess.h>
-#include <asm/unwind.h>
-#include <asm/user.h>
-#include <asm/xtp.h>
-
-#include "entry.h"
-
-#include "sigframe.h"
-
-void (*ia64_mark_idle)(int);
-
-unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
-EXPORT_SYMBOL(boot_option_idle_override);
-void (*pm_power_off) (void);
-EXPORT_SYMBOL(pm_power_off);
-
-static void
-ia64_do_show_stack (struct unw_frame_info *info, void *arg)
-{
-	unsigned long ip, sp, bsp;
-	const char *loglvl = arg;
-
-	printk("%s\nCall Trace:\n", loglvl);
-	do {
-		unw_get_ip(info, &ip);
-		if (ip == 0)
-			break;
-
-		unw_get_sp(info, &sp);
-		unw_get_bsp(info, &bsp);
-		printk("%s [<%016lx>] %pS\n"
-			 "                                sp=%016lx bsp=%016lx\n",
-			 loglvl, ip, (void *)ip, sp, bsp);
-	} while (unw_unwind(info) >= 0);
-}
-
-void
-show_stack (struct task_struct *task, unsigned long *sp, const char *loglvl)
-{
-	if (!task)
-		unw_init_running(ia64_do_show_stack, (void *)loglvl);
-	else {
-		struct unw_frame_info info;
-
-		unw_init_from_blocked_task(&info, task);
-		ia64_do_show_stack(&info, (void *)loglvl);
-	}
-}
-
-void
-show_regs (struct pt_regs *regs)
-{
-	unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
-
-	print_modules();
-	printk("\n");
-	show_regs_print_info(KERN_DEFAULT);
-	printk("psr : %016lx ifs : %016lx ip  : [<%016lx>]    %s (%s)\n",
-	       regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(),
-	       init_utsname()->release);
-	printk("ip is at %pS\n", (void *)ip);
-	printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
-	       regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
-	printk("rnat: %016lx bsps: %016lx pr  : %016lx\n",
-	       regs->ar_rnat, regs->ar_bspstore, regs->pr);
-	printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
-	       regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
-	printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
-	printk("b0  : %016lx b6  : %016lx b7  : %016lx\n", regs->b0, regs->b6, regs->b7);
-	printk("f6  : %05lx%016lx f7  : %05lx%016lx\n",
-	       regs->f6.u.bits[1], regs->f6.u.bits[0],
-	       regs->f7.u.bits[1], regs->f7.u.bits[0]);
-	printk("f8  : %05lx%016lx f9  : %05lx%016lx\n",
-	       regs->f8.u.bits[1], regs->f8.u.bits[0],
-	       regs->f9.u.bits[1], regs->f9.u.bits[0]);
-	printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
-	       regs->f10.u.bits[1], regs->f10.u.bits[0],
-	       regs->f11.u.bits[1], regs->f11.u.bits[0]);
-
-	printk("r1  : %016lx r2  : %016lx r3  : %016lx\n", regs->r1, regs->r2, regs->r3);
-	printk("r8  : %016lx r9  : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10);
-	printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13);
-	printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16);
-	printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19);
-	printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22);
-	printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25);
-	printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28);
-	printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31);
-
-	if (user_mode(regs)) {
-		/* print the stacked registers */
-		unsigned long val, *bsp, ndirty;
-		int i, sof, is_nat = 0;
-
-		sof = regs->cr_ifs & 0x7f;	/* size of frame */
-		ndirty = (regs->loadrs >> 19);
-		bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty);
-		for (i = 0; i < sof; ++i) {
-			get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i));
-			printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val,
-			       ((i == sof - 1) || (i % 3) == 2) ? "\n" : " ");
-		}
-	} else
-		show_stack(NULL, NULL, KERN_DEFAULT);
-}
-
-/* local support for deprecated console_print */
-void
-console_print(const char *s)
-{
-	printk(KERN_EMERG "%s", s);
-}
-
-void
-do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
-{
-	if (fsys_mode(current, &scr->pt)) {
-		/*
-		 * defer signal-handling etc. until we return to
-		 * privilege-level 0.
-		 */
-		if (!ia64_psr(&scr->pt)->lp)
-			ia64_psr(&scr->pt)->lp = 1;
-		return;
-	}
-
-	/* deal with pending signal delivery */
-	if (test_thread_flag(TIF_SIGPENDING) ||
-	    test_thread_flag(TIF_NOTIFY_SIGNAL)) {
-		local_irq_enable();	/* force interrupt enable */
-		ia64_do_signal(scr, in_syscall);
-	}
-
-	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
-		local_irq_enable();	/* force interrupt enable */
-		resume_user_mode_work(&scr->pt);
-	}
-
-	/* copy user rbs to kernel rbs */
-	if (unlikely(test_thread_flag(TIF_RESTORE_RSE))) {
-		local_irq_enable();	/* force interrupt enable */
-		ia64_sync_krbs();
-	}
-
-	local_irq_disable();	/* force interrupt disable */
-}
-
-static int __init nohalt_setup(char * str)
-{
-	cpu_idle_poll_ctrl(true);
-	return 1;
-}
-__setup("nohalt", nohalt_setup);
-
-#ifdef CONFIG_HOTPLUG_CPU
-/* We don't actually take CPU down, just spin without interrupts. */
-static inline void __noreturn play_dead(void)
-{
-	unsigned int this_cpu = smp_processor_id();
-
-	/* Ack it */
-	__this_cpu_write(cpu_state, CPU_DEAD);
-
-	max_xtp();
-	local_irq_disable();
-	idle_task_exit();
-	ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]);
-	/*
-	 * The above is a point of no-return, the processor is
-	 * expected to be in SAL loop now.
-	 */
-	BUG();
-}
-#else
-static inline void __noreturn play_dead(void)
-{
-	BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-void __noreturn arch_cpu_idle_dead(void)
-{
-	play_dead();
-}
-
-void arch_cpu_idle(void)
-{
-	void (*mark_idle)(int) = ia64_mark_idle;
-
-#ifdef CONFIG_SMP
-	min_xtp();
-#endif
-	rmb();
-	if (mark_idle)
-		(*mark_idle)(1);
-
-	raw_safe_halt();
-	raw_local_irq_disable();
-
-	if (mark_idle)
-		(*mark_idle)(0);
-#ifdef CONFIG_SMP
-	normal_xtp();
-#endif
-}
-
-void
-ia64_save_extra (struct task_struct *task)
-{
-	if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
-		ia64_save_debug_regs(&task->thread.dbr[0]);
-}
-
-void
-ia64_load_extra (struct task_struct *task)
-{
-	if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
-		ia64_load_debug_regs(&task->thread.dbr[0]);
-}
-
-/*
- * Copy the state of an ia-64 thread.
- *
- * We get here through the following  call chain:
- *
- *	from user-level:	from kernel:
- *
- *	<clone syscall>	        <some kernel call frames>
- *	sys_clone		   :
- *	kernel_clone		kernel_clone
- *	copy_thread		copy_thread
- *
- * This means that the stack layout is as follows:
- *
- *	+---------------------+ (highest addr)
- *	|   struct pt_regs    |
- *	+---------------------+
- *	| struct switch_stack |
- *	+---------------------+
- *	|                     |
- *	|    memory stack     |
- *	|                     | <-- sp (lowest addr)
- *	+---------------------+
- *
- * Observe that we copy the unat values that are in pt_regs and switch_stack.  Spilling an
- * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register,
- * with N=(X & 0x1ff)/8.  Thus, copying the unat value preserves the NaT bits ONLY if the
- * pt_regs structure in the parent is congruent to that of the child, modulo 512.  Since
- * the stack is page aligned and the page size is at least 4KB, this is always the case,
- * so there is nothing to worry about.
- */
-int
-copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
-{
-	unsigned long clone_flags = args->flags;
-	unsigned long user_stack_base = args->stack;
-	unsigned long user_stack_size = args->stack_size;
-	unsigned long tls = args->tls;
-	extern char ia64_ret_from_clone;
-	struct switch_stack *child_stack, *stack;
-	unsigned long rbs, child_rbs, rbs_size;
-	struct pt_regs *child_ptregs;
-	struct pt_regs *regs = current_pt_regs();
-	int retval = 0;
-
-	child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1;
-	child_stack = (struct switch_stack *) child_ptregs - 1;
-
-	rbs = (unsigned long) current + IA64_RBS_OFFSET;
-	child_rbs = (unsigned long) p + IA64_RBS_OFFSET;
-
-	/* copy parts of thread_struct: */
-	p->thread.ksp = (unsigned long) child_stack - 16;
-
-	/*
-	 * NOTE: The calling convention considers all floating point
-	 * registers in the high partition (fph) to be scratch.  Since
-	 * the only way to get to this point is through a system call,
-	 * we know that the values in fph are all dead.  Hence, there
-	 * is no need to inherit the fph state from the parent to the
-	 * child and all we have to do is to make sure that
-	 * IA64_THREAD_FPH_VALID is cleared in the child.
-	 *
-	 * XXX We could push this optimization a bit further by
-	 * clearing IA64_THREAD_FPH_VALID on ANY system call.
-	 * However, it's not clear this is worth doing.  Also, it
-	 * would be a slight deviation from the normal Linux system
-	 * call behavior where scratch registers are preserved across
-	 * system calls (unless used by the system call itself).
-	 */
-#	define THREAD_FLAGS_TO_CLEAR	(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \
-					 | IA64_THREAD_PM_VALID)
-#	define THREAD_FLAGS_TO_SET	0
-	p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
-			   | THREAD_FLAGS_TO_SET);
-
-	ia64_drop_fpu(p);	/* don't pick up stale state from a CPU's fph */
-
-	if (unlikely(args->fn)) {
-		if (unlikely(args->idle)) {
-			/* fork_idle() called us */
-			return 0;
-		}
-		memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack));
-		child_stack->r4 = (unsigned long) args->fn;
-		child_stack->r5 = (unsigned long) args->fn_arg;
-		/*
-		 * Preserve PSR bits, except for bits 32-34 and 37-45,
-		 * which we can't read.
-		 */
-		child_ptregs->cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
-		/* mark as valid, empty frame */
-		child_ptregs->cr_ifs = 1UL << 63;
-		child_stack->ar_fpsr = child_ptregs->ar_fpsr
-			= ia64_getreg(_IA64_REG_AR_FPSR);
-		child_stack->pr = (1 << PRED_KERNEL_STACK);
-		child_stack->ar_bspstore = child_rbs;
-		child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
-
-		/* stop some PSR bits from being inherited.
-		 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
-		 * therefore we must specify them explicitly here and not include them in
-		 * IA64_PSR_BITS_TO_CLEAR.
-		 */
-		child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
-				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
-
-		return 0;
-	}
-	stack = ((struct switch_stack *) regs) - 1;
-	/* copy parent's switch_stack & pt_regs to child: */
-	memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack));
-
-	/* copy the parent's register backing store to the child: */
-	rbs_size = stack->ar_bspstore - rbs;
-	memcpy((void *) child_rbs, (void *) rbs, rbs_size);
-	if (clone_flags & CLONE_SETTLS)
-		child_ptregs->r13 = tls;
-	if (user_stack_base) {
-		child_ptregs->r12 = user_stack_base + user_stack_size - 16;
-		child_ptregs->ar_bspstore = user_stack_base;
-		child_ptregs->ar_rnat = 0;
-		child_ptregs->loadrs = 0;
-	}
-	child_stack->ar_bspstore = child_rbs + rbs_size;
-	child_stack->b0 = (unsigned long) &ia64_ret_from_clone;
-
-	/* stop some PSR bits from being inherited.
-	 * the psr.up/psr.pp bits must be cleared on fork but inherited on execve()
-	 * therefore we must specify them explicitly here and not include them in
-	 * IA64_PSR_BITS_TO_CLEAR.
-	 */
-	child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
-				 & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
-	return retval;
-}
-
-asmlinkage long ia64_clone(unsigned long clone_flags, unsigned long stack_start,
-			   unsigned long stack_size, unsigned long parent_tidptr,
-			   unsigned long child_tidptr, unsigned long tls)
-{
-	struct kernel_clone_args args = {
-		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
-		.pidfd		= (int __user *)parent_tidptr,
-		.child_tid	= (int __user *)child_tidptr,
-		.parent_tid	= (int __user *)parent_tidptr,
-		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
-		.stack		= stack_start,
-		.stack_size	= stack_size,
-		.tls		= tls,
-	};
-
-	return kernel_clone(&args);
-}
-
-static void
-do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
-{
-	unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm;
-	unsigned long ip;
-	elf_greg_t *dst = arg;
-	struct pt_regs *pt;
-	char nat;
-	int i;
-
-	memset(dst, 0, sizeof(elf_gregset_t));	/* don't leak any kernel bits to user-level */
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	unw_get_sp(info, &sp);
-	pt = (struct pt_regs *) (sp + 16);
-
-	urbs_end = ia64_get_user_rbs_end(task, pt, &cfm);
-
-	if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0)
-		return;
-
-	ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end),
-		  &ar_rnat);
-
-	/*
-	 * coredump format:
-	 *	r0-r31
-	 *	NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
-	 *	predicate registers (p0-p63)
-	 *	b0-b7
-	 *	ip cfm user-mask
-	 *	ar.rsc ar.bsp ar.bspstore ar.rnat
-	 *	ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
-	 */
-
-	/* r0 is zero */
-	for (i = 1, mask = (1UL << i); i < 32; ++i) {
-		unw_get_gr(info, i, &dst[i], &nat);
-		if (nat)
-			nat_bits |= mask;
-		mask <<= 1;
-	}
-	dst[32] = nat_bits;
-	unw_get_pr(info, &dst[33]);
-
-	for (i = 0; i < 8; ++i)
-		unw_get_br(info, i, &dst[34 + i]);
-
-	unw_get_rp(info, &ip);
-	dst[42] = ip + ia64_psr(pt)->ri;
-	dst[43] = cfm;
-	dst[44] = pt->cr_ipsr & IA64_PSR_UM;
-
-	unw_get_ar(info, UNW_AR_RSC, &dst[45]);
-	/*
-	 * For bsp and bspstore, unw_get_ar() would return the kernel
-	 * addresses, but we need the user-level addresses instead:
-	 */
-	dst[46] = urbs_end;	/* note: by convention PT_AR_BSP points to the end of the urbs! */
-	dst[47] = pt->ar_bspstore;
-	dst[48] = ar_rnat;
-	unw_get_ar(info, UNW_AR_CCV, &dst[49]);
-	unw_get_ar(info, UNW_AR_UNAT, &dst[50]);
-	unw_get_ar(info, UNW_AR_FPSR, &dst[51]);
-	dst[52] = pt->ar_pfs;	/* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */
-	unw_get_ar(info, UNW_AR_LC, &dst[53]);
-	unw_get_ar(info, UNW_AR_EC, &dst[54]);
-	unw_get_ar(info, UNW_AR_CSD, &dst[55]);
-	unw_get_ar(info, UNW_AR_SSD, &dst[56]);
-}
-
-static void
-do_copy_regs (struct unw_frame_info *info, void *arg)
-{
-	do_copy_task_regs(current, info, arg);
-}
-
-void
-ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
-{
-	unw_init_running(do_copy_regs, dst);
-}
-
-/*
- * Flush thread state.  This is called when a thread does an execve().
- */
-void
-flush_thread (void)
-{
-	/* drop floating-point and debug-register state if it exists: */
-	current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
-	ia64_drop_fpu(current);
-}
-
-/*
- * Clean up state associated with a thread.  This is called when
- * the thread calls exit().
- */
-void
-exit_thread (struct task_struct *tsk)
-{
-
-	ia64_drop_fpu(tsk);
-}
-
-unsigned long
-__get_wchan (struct task_struct *p)
-{
-	struct unw_frame_info info;
-	unsigned long ip;
-	int count = 0;
-
-	/*
-	 * Note: p may not be a blocked task (it could be current or
-	 * another process running on some other CPU.  Rather than
-	 * trying to determine if p is really blocked, we just assume
-	 * it's blocked and rely on the unwind routines to fail
-	 * gracefully if the process wasn't really blocked after all.
-	 * --davidm 99/12/15
-	 */
-	unw_init_from_blocked_task(&info, p);
-	do {
-		if (task_is_running(p))
-			return 0;
-		if (unw_unwind(&info) < 0)
-			return 0;
-		unw_get_ip(&info, &ip);
-		if (!in_sched_functions(ip))
-			return ip;
-	} while (count++ < 16);
-	return 0;
-}
-
-void
-cpu_halt (void)
-{
-	pal_power_mgmt_info_u_t power_info[8];
-	unsigned long min_power;
-	int i, min_power_state;
-
-	if (ia64_pal_halt_info(power_info) != 0)
-		return;
-
-	min_power_state = 0;
-	min_power = power_info[0].pal_power_mgmt_info_s.power_consumption;
-	for (i = 1; i < 8; ++i)
-		if (power_info[i].pal_power_mgmt_info_s.im
-		    && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) {
-			min_power = power_info[i].pal_power_mgmt_info_s.power_consumption;
-			min_power_state = i;
-		}
-
-	while (1)
-		ia64_pal_halt(min_power_state);
-}
-
-void machine_shutdown(void)
-{
-	smp_shutdown_nonboot_cpus(reboot_cpu);
-
-#ifdef CONFIG_KEXEC
-	kexec_disable_iosapic();
-#endif
-}
-
-void
-machine_restart (char *restart_cmd)
-{
-	(void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0);
-	efi_reboot(REBOOT_WARM, NULL);
-}
-
-void
-machine_halt (void)
-{
-	(void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0);
-	cpu_halt();
-}
-
-void
-machine_power_off (void)
-{
-	do_kernel_power_off();
-	machine_halt();
-}
-
-EXPORT_SYMBOL(ia64_delay_loop);
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
deleted file mode 100644
index 4c41912c550f..000000000000
--- a/arch/ia64/kernel/ptrace.c
+++ /dev/null
@@ -1,2012 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Kernel support for the ptrace() and syscall tracing interfaces.
- *
- * Copyright (C) 1999-2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2006 Intel Co
- *  2006-08-12	- IA64 Native Utrace implementation support added by
- *	Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
- *
- * Derived from the x86 and Alpha versions.
- */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/security.h>
-#include <linux/audit.h>
-#include <linux/signal.h>
-#include <linux/regset.h>
-#include <linux/elf.h>
-#include <linux/resume_user_mode.h>
-
-#include <asm/processor.h>
-#include <asm/ptrace_offsets.h>
-#include <asm/rse.h>
-#include <linux/uaccess.h>
-#include <asm/unwind.h>
-
-#include "entry.h"
-
-/*
- * Bits in the PSR that we allow ptrace() to change:
- *	be, up, ac, mfl, mfh (the user mask; five bits total)
- *	db (debug breakpoint fault; one bit)
- *	id (instruction debug fault disable; one bit)
- *	dd (data debug fault disable; one bit)
- *	ri (restart instruction; two bits)
- *	is (instruction set; one bit)
- */
-#define IPSR_MASK (IA64_PSR_UM | IA64_PSR_DB | IA64_PSR_IS	\
-		   | IA64_PSR_ID | IA64_PSR_DD | IA64_PSR_RI)
-
-#define MASK(nbits)	((1UL << (nbits)) - 1)	/* mask with NBITS bits set */
-#define PFM_MASK	MASK(38)
-
-#define PTRACE_DEBUG	0
-
-#if PTRACE_DEBUG
-# define dprintk(format...)	printk(format)
-# define inline
-#else
-# define dprintk(format...)
-#endif
-
-/* Return TRUE if PT was created due to kernel-entry via a system-call.  */
-
-static inline int
-in_syscall (struct pt_regs *pt)
-{
-	return (long) pt->cr_ifs >= 0;
-}
-
-/*
- * Collect the NaT bits for r1-r31 from scratch_unat and return a NaT
- * bitset where bit i is set iff the NaT bit of register i is set.
- */
-unsigned long
-ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat)
-{
-#	define GET_BITS(first, last, unat)				\
-	({								\
-		unsigned long bit = ia64_unat_pos(&pt->r##first);	\
-		unsigned long nbits = (last - first + 1);		\
-		unsigned long mask = MASK(nbits) << first;		\
-		unsigned long dist;					\
-		if (bit < first)					\
-			dist = 64 + bit - first;			\
-		else							\
-			dist = bit - first;				\
-		ia64_rotr(unat, dist) & mask;				\
-	})
-	unsigned long val;
-
-	/*
-	 * Registers that are stored consecutively in struct pt_regs
-	 * can be handled in parallel.  If the register order in
-	 * struct_pt_regs changes, this code MUST be updated.
-	 */
-	val  = GET_BITS( 1,  1, scratch_unat);
-	val |= GET_BITS( 2,  3, scratch_unat);
-	val |= GET_BITS(12, 13, scratch_unat);
-	val |= GET_BITS(14, 14, scratch_unat);
-	val |= GET_BITS(15, 15, scratch_unat);
-	val |= GET_BITS( 8, 11, scratch_unat);
-	val |= GET_BITS(16, 31, scratch_unat);
-	return val;
-
-#	undef GET_BITS
-}
-
-/*
- * Set the NaT bits for the scratch registers according to NAT and
- * return the resulting unat (assuming the scratch registers are
- * stored in PT).
- */
-unsigned long
-ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat)
-{
-#	define PUT_BITS(first, last, nat)				\
-	({								\
-		unsigned long bit = ia64_unat_pos(&pt->r##first);	\
-		unsigned long nbits = (last - first + 1);		\
-		unsigned long mask = MASK(nbits) << first;		\
-		long dist;						\
-		if (bit < first)					\
-			dist = 64 + bit - first;			\
-		else							\
-			dist = bit - first;				\
-		ia64_rotl(nat & mask, dist);				\
-	})
-	unsigned long scratch_unat;
-
-	/*
-	 * Registers that are stored consecutively in struct pt_regs
-	 * can be handled in parallel.  If the register order in
-	 * struct_pt_regs changes, this code MUST be updated.
-	 */
-	scratch_unat  = PUT_BITS( 1,  1, nat);
-	scratch_unat |= PUT_BITS( 2,  3, nat);
-	scratch_unat |= PUT_BITS(12, 13, nat);
-	scratch_unat |= PUT_BITS(14, 14, nat);
-	scratch_unat |= PUT_BITS(15, 15, nat);
-	scratch_unat |= PUT_BITS( 8, 11, nat);
-	scratch_unat |= PUT_BITS(16, 31, nat);
-
-	return scratch_unat;
-
-#	undef PUT_BITS
-}
-
-#define IA64_MLX_TEMPLATE	0x2
-#define IA64_MOVL_OPCODE	6
-
-void
-ia64_increment_ip (struct pt_regs *regs)
-{
-	unsigned long w0, ri = ia64_psr(regs)->ri + 1;
-
-	if (ri > 2) {
-		ri = 0;
-		regs->cr_iip += 16;
-	} else if (ri == 2) {
-		get_user(w0, (char __user *) regs->cr_iip + 0);
-		if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) {
-			/*
-			 * rfi'ing to slot 2 of an MLX bundle causes
-			 * an illegal operation fault.  We don't want
-			 * that to happen...
-			 */
-			ri = 0;
-			regs->cr_iip += 16;
-		}
-	}
-	ia64_psr(regs)->ri = ri;
-}
-
-void
-ia64_decrement_ip (struct pt_regs *regs)
-{
-	unsigned long w0, ri = ia64_psr(regs)->ri - 1;
-
-	if (ia64_psr(regs)->ri == 0) {
-		regs->cr_iip -= 16;
-		ri = 2;
-		get_user(w0, (char __user *) regs->cr_iip + 0);
-		if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) {
-			/*
-			 * rfi'ing to slot 2 of an MLX bundle causes
-			 * an illegal operation fault.  We don't want
-			 * that to happen...
-			 */
-			ri = 1;
-		}
-	}
-	ia64_psr(regs)->ri = ri;
-}
-
-/*
- * This routine is used to read an rnat bits that are stored on the
- * kernel backing store.  Since, in general, the alignment of the user
- * and kernel are different, this is not completely trivial.  In
- * essence, we need to construct the user RNAT based on up to two
- * kernel RNAT values and/or the RNAT value saved in the child's
- * pt_regs.
- *
- * user rbs
- *
- * +--------+ <-- lowest address
- * | slot62 |
- * +--------+
- * |  rnat  | 0x....1f8
- * +--------+
- * | slot00 | \
- * +--------+ |
- * | slot01 | > child_regs->ar_rnat
- * +--------+ |
- * | slot02 | /				kernel rbs
- * +--------+				+--------+
- *	    <- child_regs->ar_bspstore	| slot61 | <-- krbs
- * +- - - - +				+--------+
- *					| slot62 |
- * +- - - - +				+--------+
- *					|  rnat	 |
- * +- - - - +				+--------+
- *   vrnat				| slot00 |
- * +- - - - +				+--------+
- *					=	 =
- *					+--------+
- *					| slot00 | \
- *					+--------+ |
- *					| slot01 | > child_stack->ar_rnat
- *					+--------+ |
- *					| slot02 | /
- *					+--------+
- *						  <--- child_stack->ar_bspstore
- *
- * The way to think of this code is as follows: bit 0 in the user rnat
- * corresponds to some bit N (0 <= N <= 62) in one of the kernel rnat
- * value.  The kernel rnat value holding this bit is stored in
- * variable rnat0.  rnat1 is loaded with the kernel rnat value that
- * form the upper bits of the user rnat value.
- *
- * Boundary cases:
- *
- * o when reading the rnat "below" the first rnat slot on the kernel
- *   backing store, rnat0/rnat1 are set to 0 and the low order bits are
- *   merged in from pt->ar_rnat.
- *
- * o when reading the rnat "above" the last rnat slot on the kernel
- *   backing store, rnat0/rnat1 gets its value from sw->ar_rnat.
- */
-static unsigned long
-get_rnat (struct task_struct *task, struct switch_stack *sw,
-	  unsigned long *krbs, unsigned long *urnat_addr,
-	  unsigned long *urbs_end)
-{
-	unsigned long rnat0 = 0, rnat1 = 0, urnat = 0, *slot0_kaddr;
-	unsigned long umask = 0, mask, m;
-	unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
-	long num_regs, nbits;
-	struct pt_regs *pt;
-
-	pt = task_pt_regs(task);
-	kbsp = (unsigned long *) sw->ar_bspstore;
-	ubspstore = (unsigned long *) pt->ar_bspstore;
-
-	if (urbs_end < urnat_addr)
-		nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_end);
-	else
-		nbits = 63;
-	mask = MASK(nbits);
-	/*
-	 * First, figure out which bit number slot 0 in user-land maps
-	 * to in the kernel rnat.  Do this by figuring out how many
-	 * register slots we're beyond the user's backingstore and
-	 * then computing the equivalent address in kernel space.
-	 */
-	num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
-	slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
-	shift = ia64_rse_slot_num(slot0_kaddr);
-	rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
-	rnat0_kaddr = rnat1_kaddr - 64;
-
-	if (ubspstore + 63 > urnat_addr) {
-		/* some bits need to be merged in from pt->ar_rnat */
-		umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
-		urnat = (pt->ar_rnat & umask);
-		mask &= ~umask;
-		if (!mask)
-			return urnat;
-	}
-
-	m = mask << shift;
-	if (rnat0_kaddr >= kbsp)
-		rnat0 = sw->ar_rnat;
-	else if (rnat0_kaddr > krbs)
-		rnat0 = *rnat0_kaddr;
-	urnat |= (rnat0 & m) >> shift;
-
-	m = mask >> (63 - shift);
-	if (rnat1_kaddr >= kbsp)
-		rnat1 = sw->ar_rnat;
-	else if (rnat1_kaddr > krbs)
-		rnat1 = *rnat1_kaddr;
-	urnat |= (rnat1 & m) << (63 - shift);
-	return urnat;
-}
-
-/*
- * The reverse of get_rnat.
- */
-static void
-put_rnat (struct task_struct *task, struct switch_stack *sw,
-	  unsigned long *krbs, unsigned long *urnat_addr, unsigned long urnat,
-	  unsigned long *urbs_end)
-{
-	unsigned long rnat0 = 0, rnat1 = 0, *slot0_kaddr, umask = 0, mask, m;
-	unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
-	long num_regs, nbits;
-	struct pt_regs *pt;
-	unsigned long cfm, *urbs_kargs;
-
-	pt = task_pt_regs(task);
-	kbsp = (unsigned long *) sw->ar_bspstore;
-	ubspstore = (unsigned long *) pt->ar_bspstore;
-
-	urbs_kargs = urbs_end;
-	if (in_syscall(pt)) {
-		/*
-		 * If entered via syscall, don't allow user to set rnat bits
-		 * for syscall args.
-		 */
-		cfm = pt->cr_ifs;
-		urbs_kargs = ia64_rse_skip_regs(urbs_end, -(cfm & 0x7f));
-	}
-
-	if (urbs_kargs >= urnat_addr)
-		nbits = 63;
-	else {
-		if ((urnat_addr - 63) >= urbs_kargs)
-			return;
-		nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_kargs);
-	}
-	mask = MASK(nbits);
-
-	/*
-	 * First, figure out which bit number slot 0 in user-land maps
-	 * to in the kernel rnat.  Do this by figuring out how many
-	 * register slots we're beyond the user's backingstore and
-	 * then computing the equivalent address in kernel space.
-	 */
-	num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
-	slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
-	shift = ia64_rse_slot_num(slot0_kaddr);
-	rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
-	rnat0_kaddr = rnat1_kaddr - 64;
-
-	if (ubspstore + 63 > urnat_addr) {
-		/* some bits need to be place in pt->ar_rnat: */
-		umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
-		pt->ar_rnat = (pt->ar_rnat & ~umask) | (urnat & umask);
-		mask &= ~umask;
-		if (!mask)
-			return;
-	}
-	/*
-	 * Note: Section 11.1 of the EAS guarantees that bit 63 of an
-	 * rnat slot is ignored. so we don't have to clear it here.
-	 */
-	rnat0 = (urnat << shift);
-	m = mask << shift;
-	if (rnat0_kaddr >= kbsp)
-		sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat0 & m);
-	else if (rnat0_kaddr > krbs)
-		*rnat0_kaddr = ((*rnat0_kaddr & ~m) | (rnat0 & m));
-
-	rnat1 = (urnat >> (63 - shift));
-	m = mask >> (63 - shift);
-	if (rnat1_kaddr >= kbsp)
-		sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat1 & m);
-	else if (rnat1_kaddr > krbs)
-		*rnat1_kaddr = ((*rnat1_kaddr & ~m) | (rnat1 & m));
-}
-
-static inline int
-on_kernel_rbs (unsigned long addr, unsigned long bspstore,
-	       unsigned long urbs_end)
-{
-	unsigned long *rnat_addr = ia64_rse_rnat_addr((unsigned long *)
-						      urbs_end);
-	return (addr >= bspstore && addr <= (unsigned long) rnat_addr);
-}
-
-/*
- * Read a word from the user-level backing store of task CHILD.  ADDR
- * is the user-level address to read the word from, VAL a pointer to
- * the return value, and USER_BSP gives the end of the user-level
- * backing store (i.e., it's the address that would be in ar.bsp after
- * the user executed a "cover" instruction).
- *
- * This routine takes care of accessing the kernel register backing
- * store for those registers that got spilled there.  It also takes
- * care of calculating the appropriate RNaT collection words.
- */
-long
-ia64_peek (struct task_struct *child, struct switch_stack *child_stack,
-	   unsigned long user_rbs_end, unsigned long addr, long *val)
-{
-	unsigned long *bspstore, *krbs, regnum, *laddr, *urbs_end, *rnat_addr;
-	struct pt_regs *child_regs;
-	size_t copied;
-	long ret;
-
-	urbs_end = (long *) user_rbs_end;
-	laddr = (unsigned long *) addr;
-	child_regs = task_pt_regs(child);
-	bspstore = (unsigned long *) child_regs->ar_bspstore;
-	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
-	if (on_kernel_rbs(addr, (unsigned long) bspstore,
-			  (unsigned long) urbs_end))
-	{
-		/*
-		 * Attempt to read the RBS in an area that's actually
-		 * on the kernel RBS => read the corresponding bits in
-		 * the kernel RBS.
-		 */
-		rnat_addr = ia64_rse_rnat_addr(laddr);
-		ret = get_rnat(child, child_stack, krbs, rnat_addr, urbs_end);
-
-		if (laddr == rnat_addr) {
-			/* return NaT collection word itself */
-			*val = ret;
-			return 0;
-		}
-
-		if (((1UL << ia64_rse_slot_num(laddr)) & ret) != 0) {
-			/*
-			 * It is implementation dependent whether the
-			 * data portion of a NaT value gets saved on a
-			 * st8.spill or RSE spill (e.g., see EAS 2.6,
-			 * 4.4.4.6 Register Spill and Fill).  To get
-			 * consistent behavior across all possible
-			 * IA-64 implementations, we return zero in
-			 * this case.
-			 */
-			*val = 0;
-			return 0;
-		}
-
-		if (laddr < urbs_end) {
-			/*
-			 * The desired word is on the kernel RBS and
-			 * is not a NaT.
-			 */
-			regnum = ia64_rse_num_regs(bspstore, laddr);
-			*val = *ia64_rse_skip_regs(krbs, regnum);
-			return 0;
-		}
-	}
-	copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE);
-	if (copied != sizeof(ret))
-		return -EIO;
-	*val = ret;
-	return 0;
-}
-
-long
-ia64_poke (struct task_struct *child, struct switch_stack *child_stack,
-	   unsigned long user_rbs_end, unsigned long addr, long val)
-{
-	unsigned long *bspstore, *krbs, regnum, *laddr;
-	unsigned long *urbs_end = (long *) user_rbs_end;
-	struct pt_regs *child_regs;
-
-	laddr = (unsigned long *) addr;
-	child_regs = task_pt_regs(child);
-	bspstore = (unsigned long *) child_regs->ar_bspstore;
-	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
-	if (on_kernel_rbs(addr, (unsigned long) bspstore,
-			  (unsigned long) urbs_end))
-	{
-		/*
-		 * Attempt to write the RBS in an area that's actually
-		 * on the kernel RBS => write the corresponding bits
-		 * in the kernel RBS.
-		 */
-		if (ia64_rse_is_rnat_slot(laddr))
-			put_rnat(child, child_stack, krbs, laddr, val,
-				 urbs_end);
-		else {
-			if (laddr < urbs_end) {
-				regnum = ia64_rse_num_regs(bspstore, laddr);
-				*ia64_rse_skip_regs(krbs, regnum) = val;
-			}
-		}
-	} else if (access_process_vm(child, addr, &val, sizeof(val),
-				FOLL_FORCE | FOLL_WRITE)
-		   != sizeof(val))
-		return -EIO;
-	return 0;
-}
-
-/*
- * Calculate the address of the end of the user-level register backing
- * store.  This is the address that would have been stored in ar.bsp
- * if the user had executed a "cover" instruction right before
- * entering the kernel.  If CFMP is not NULL, it is used to return the
- * "current frame mask" that was active at the time the kernel was
- * entered.
- */
-unsigned long
-ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt,
-		       unsigned long *cfmp)
-{
-	unsigned long *krbs, *bspstore, cfm = pt->cr_ifs;
-	long ndirty;
-
-	krbs = (unsigned long *) child + IA64_RBS_OFFSET/8;
-	bspstore = (unsigned long *) pt->ar_bspstore;
-	ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
-
-	if (in_syscall(pt))
-		ndirty += (cfm & 0x7f);
-	else
-		cfm &= ~(1UL << 63);	/* clear valid bit */
-
-	if (cfmp)
-		*cfmp = cfm;
-	return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty);
-}
-
-/*
- * Synchronize (i.e, write) the RSE backing store living in kernel
- * space to the VM of the CHILD task.  SW and PT are the pointers to
- * the switch_stack and pt_regs structures, respectively.
- * USER_RBS_END is the user-level address at which the backing store
- * ends.
- */
-long
-ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw,
-		    unsigned long user_rbs_start, unsigned long user_rbs_end)
-{
-	unsigned long addr, val;
-	long ret;
-
-	/* now copy word for word from kernel rbs to user rbs: */
-	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
-		ret = ia64_peek(child, sw, user_rbs_end, addr, &val);
-		if (ret < 0)
-			return ret;
-		if (access_process_vm(child, addr, &val, sizeof(val),
-				FOLL_FORCE | FOLL_WRITE)
-		    != sizeof(val))
-			return -EIO;
-	}
-	return 0;
-}
-
-static long
-ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw,
-		unsigned long user_rbs_start, unsigned long user_rbs_end)
-{
-	unsigned long addr, val;
-	long ret;
-
-	/* now copy word for word from user rbs to kernel rbs: */
-	for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) {
-		if (access_process_vm(child, addr, &val, sizeof(val),
-				FOLL_FORCE)
-				!= sizeof(val))
-			return -EIO;
-
-		ret = ia64_poke(child, sw, user_rbs_end, addr, val);
-		if (ret < 0)
-			return ret;
-	}
-	return 0;
-}
-
-typedef long (*syncfunc_t)(struct task_struct *, struct switch_stack *,
-			    unsigned long, unsigned long);
-
-static void do_sync_rbs(struct unw_frame_info *info, void *arg)
-{
-	struct pt_regs *pt;
-	unsigned long urbs_end;
-	syncfunc_t fn = arg;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-	pt = task_pt_regs(info->task);
-	urbs_end = ia64_get_user_rbs_end(info->task, pt, NULL);
-
-	fn(info->task, info->sw, pt->ar_bspstore, urbs_end);
-}
-
-/*
- * when a thread is stopped (ptraced), debugger might change thread's user
- * stack (change memory directly), and we must avoid the RSE stored in kernel
- * to override user stack (user space's RSE is newer than kernel's in the
- * case). To workaround the issue, we copy kernel RSE to user RSE before the
- * task is stopped, so user RSE has updated data.  we then copy user RSE to
- * kernel after the task is resummed from traced stop and kernel will use the
- * newer RSE to return to user. TIF_RESTORE_RSE is the flag to indicate we need
- * synchronize user RSE to kernel.
- */
-void ia64_ptrace_stop(void)
-{
-	if (test_and_set_tsk_thread_flag(current, TIF_RESTORE_RSE))
-		return;
-	set_notify_resume(current);
-	unw_init_running(do_sync_rbs, ia64_sync_user_rbs);
-}
-
-/*
- * This is called to read back the register backing store.
- */
-void ia64_sync_krbs(void)
-{
-	clear_tsk_thread_flag(current, TIF_RESTORE_RSE);
-
-	unw_init_running(do_sync_rbs, ia64_sync_kernel_rbs);
-}
-
-/*
- * Write f32-f127 back to task->thread.fph if it has been modified.
- */
-inline void
-ia64_flush_fph (struct task_struct *task)
-{
-	struct ia64_psr *psr = ia64_psr(task_pt_regs(task));
-
-	/*
-	 * Prevent migrating this task while
-	 * we're fiddling with the FPU state
-	 */
-	preempt_disable();
-	if (ia64_is_local_fpu_owner(task) && psr->mfh) {
-		psr->mfh = 0;
-		task->thread.flags |= IA64_THREAD_FPH_VALID;
-		ia64_save_fpu(&task->thread.fph[0]);
-	}
-	preempt_enable();
-}
-
-/*
- * Sync the fph state of the task so that it can be manipulated
- * through thread.fph.  If necessary, f32-f127 are written back to
- * thread.fph or, if the fph state hasn't been used before, thread.fph
- * is cleared to zeroes.  Also, access to f32-f127 is disabled to
- * ensure that the task picks up the state from thread.fph when it
- * executes again.
- */
-void
-ia64_sync_fph (struct task_struct *task)
-{
-	struct ia64_psr *psr = ia64_psr(task_pt_regs(task));
-
-	ia64_flush_fph(task);
-	if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) {
-		task->thread.flags |= IA64_THREAD_FPH_VALID;
-		memset(&task->thread.fph, 0, sizeof(task->thread.fph));
-	}
-	ia64_drop_fpu(task);
-	psr->dfh = 1;
-}
-
-/*
- * Change the machine-state of CHILD such that it will return via the normal
- * kernel exit-path, rather than the syscall-exit path.
- */
-static void
-convert_to_non_syscall (struct task_struct *child, struct pt_regs  *pt,
-			unsigned long cfm)
-{
-	struct unw_frame_info info, prev_info;
-	unsigned long ip, sp, pr;
-
-	unw_init_from_blocked_task(&info, child);
-	while (1) {
-		prev_info = info;
-		if (unw_unwind(&info) < 0)
-			return;
-
-		unw_get_sp(&info, &sp);
-		if ((long)((unsigned long)child + IA64_STK_OFFSET - sp)
-		    < IA64_PT_REGS_SIZE) {
-			dprintk("ptrace.%s: ran off the top of the kernel "
-				"stack\n", __func__);
-			return;
-		}
-		if (unw_get_pr (&prev_info, &pr) < 0) {
-			unw_get_rp(&prev_info, &ip);
-			dprintk("ptrace.%s: failed to read "
-				"predicate register (ip=0x%lx)\n",
-				__func__, ip);
-			return;
-		}
-		if (unw_is_intr_frame(&info)
-		    && (pr & (1UL << PRED_USER_STACK)))
-			break;
-	}
-
-	/*
-	 * Note: at the time of this call, the target task is blocked
-	 * in notify_resume_user() and by clearling PRED_LEAVE_SYSCALL
-	 * (aka, "pLvSys") we redirect execution from
-	 * .work_pending_syscall_end to .work_processed_kernel.
-	 */
-	unw_get_pr(&prev_info, &pr);
-	pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL));
-	pr |=  (1UL << PRED_NON_SYSCALL);
-	unw_set_pr(&prev_info, pr);
-
-	pt->cr_ifs = (1UL << 63) | cfm;
-	/*
-	 * Clear the memory that is NOT written on syscall-entry to
-	 * ensure we do not leak kernel-state to user when execution
-	 * resumes.
-	 */
-	pt->r2 = 0;
-	pt->r3 = 0;
-	pt->r14 = 0;
-	memset(&pt->r16, 0, 16*8);	/* clear r16-r31 */
-	memset(&pt->f6, 0, 6*16);	/* clear f6-f11 */
-	pt->b7 = 0;
-	pt->ar_ccv = 0;
-	pt->ar_csd = 0;
-	pt->ar_ssd = 0;
-}
-
-static int
-access_nat_bits (struct task_struct *child, struct pt_regs *pt,
-		 struct unw_frame_info *info,
-		 unsigned long *data, int write_access)
-{
-	unsigned long regnum, nat_bits, scratch_unat, dummy = 0;
-	char nat = 0;
-
-	if (write_access) {
-		nat_bits = *data;
-		scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits);
-		if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) {
-			dprintk("ptrace: failed to set ar.unat\n");
-			return -1;
-		}
-		for (regnum = 4; regnum <= 7; ++regnum) {
-			unw_get_gr(info, regnum, &dummy, &nat);
-			unw_set_gr(info, regnum, dummy,
-				   (nat_bits >> regnum) & 1);
-		}
-	} else {
-		if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) {
-			dprintk("ptrace: failed to read ar.unat\n");
-			return -1;
-		}
-		nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat);
-		for (regnum = 4; regnum <= 7; ++regnum) {
-			unw_get_gr(info, regnum, &dummy, &nat);
-			nat_bits |= (nat != 0) << regnum;
-		}
-		*data = nat_bits;
-	}
-	return 0;
-}
-
-static int
-access_elf_reg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access);
-
-static long
-ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
-{
-	unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val;
-	struct unw_frame_info info;
-	struct ia64_fpreg fpval;
-	struct switch_stack *sw;
-	struct pt_regs *pt;
-	long ret, retval = 0;
-	char nat = 0;
-	int i;
-
-	if (!access_ok(ppr, sizeof(struct pt_all_user_regs)))
-		return -EIO;
-
-	pt = task_pt_regs(child);
-	sw = (struct switch_stack *) (child->thread.ksp + 16);
-	unw_init_from_blocked_task(&info, child);
-	if (unw_unwind_to_user(&info) < 0) {
-		return -EIO;
-	}
-
-	if (((unsigned long) ppr & 0x7) != 0) {
-		dprintk("ptrace:unaligned register address %p\n", ppr);
-		return -EIO;
-	}
-
-	if (access_elf_reg(child, &info, ELF_CR_IPSR_OFFSET, &psr, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_EC_OFFSET, &ec, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_LC_OFFSET, &lc, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_RNAT_OFFSET, &rnat, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_AR_BSP_OFFSET, &bsp, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_CFM_OFFSET, &cfm, 0) < 0 ||
-	    access_elf_reg(child, &info, ELF_NAT_OFFSET, &nat_bits, 0) < 0)
-		return -EIO;
-
-	/* control regs */
-
-	retval |= __put_user(pt->cr_iip, &ppr->cr_iip);
-	retval |= __put_user(psr, &ppr->cr_ipsr);
-
-	/* app regs */
-
-	retval |= __put_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
-	retval |= __put_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]);
-	retval |= __put_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
-	retval |= __put_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
-	retval |= __put_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
-	retval |= __put_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
-
-	retval |= __put_user(ec, &ppr->ar[PT_AUR_EC]);
-	retval |= __put_user(lc, &ppr->ar[PT_AUR_LC]);
-	retval |= __put_user(rnat, &ppr->ar[PT_AUR_RNAT]);
-	retval |= __put_user(bsp, &ppr->ar[PT_AUR_BSP]);
-	retval |= __put_user(cfm, &ppr->cfm);
-
-	/* gr1-gr3 */
-
-	retval |= __copy_to_user(&ppr->gr[1], &pt->r1, sizeof(long));
-	retval |= __copy_to_user(&ppr->gr[2], &pt->r2, sizeof(long) *2);
-
-	/* gr4-gr7 */
-
-	for (i = 4; i < 8; i++) {
-		if (unw_access_gr(&info, i, &val, &nat, 0) < 0)
-			return -EIO;
-		retval |= __put_user(val, &ppr->gr[i]);
-	}
-
-	/* gr8-gr11 */
-
-	retval |= __copy_to_user(&ppr->gr[8], &pt->r8, sizeof(long) * 4);
-
-	/* gr12-gr15 */
-
-	retval |= __copy_to_user(&ppr->gr[12], &pt->r12, sizeof(long) * 2);
-	retval |= __copy_to_user(&ppr->gr[14], &pt->r14, sizeof(long));
-	retval |= __copy_to_user(&ppr->gr[15], &pt->r15, sizeof(long));
-
-	/* gr16-gr31 */
-
-	retval |= __copy_to_user(&ppr->gr[16], &pt->r16, sizeof(long) * 16);
-
-	/* b0 */
-
-	retval |= __put_user(pt->b0, &ppr->br[0]);
-
-	/* b1-b5 */
-
-	for (i = 1; i < 6; i++) {
-		if (unw_access_br(&info, i, &val, 0) < 0)
-			return -EIO;
-		__put_user(val, &ppr->br[i]);
-	}
-
-	/* b6-b7 */
-
-	retval |= __put_user(pt->b6, &ppr->br[6]);
-	retval |= __put_user(pt->b7, &ppr->br[7]);
-
-	/* fr2-fr5 */
-
-	for (i = 2; i < 6; i++) {
-		if (unw_get_fr(&info, i, &fpval) < 0)
-			return -EIO;
-		retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
-	}
-
-	/* fr6-fr11 */
-
-	retval |= __copy_to_user(&ppr->fr[6], &pt->f6,
-				 sizeof(struct ia64_fpreg) * 6);
-
-	/* fp scratch regs(12-15) */
-
-	retval |= __copy_to_user(&ppr->fr[12], &sw->f12,
-				 sizeof(struct ia64_fpreg) * 4);
-
-	/* fr16-fr31 */
-
-	for (i = 16; i < 32; i++) {
-		if (unw_get_fr(&info, i, &fpval) < 0)
-			return -EIO;
-		retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
-	}
-
-	/* fph */
-
-	ia64_flush_fph(child);
-	retval |= __copy_to_user(&ppr->fr[32], &child->thread.fph,
-				 sizeof(ppr->fr[32]) * 96);
-
-	/*  preds */
-
-	retval |= __put_user(pt->pr, &ppr->pr);
-
-	/* nat bits */
-
-	retval |= __put_user(nat_bits, &ppr->nat);
-
-	ret = retval ? -EIO : 0;
-	return ret;
-}
-
-static long
-ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
-{
-	unsigned long psr, rsc, ec, lc, rnat, bsp, cfm, nat_bits, val = 0;
-	struct unw_frame_info info;
-	struct switch_stack *sw;
-	struct ia64_fpreg fpval;
-	struct pt_regs *pt;
-	long retval = 0;
-	int i;
-
-	memset(&fpval, 0, sizeof(fpval));
-
-	if (!access_ok(ppr, sizeof(struct pt_all_user_regs)))
-		return -EIO;
-
-	pt = task_pt_regs(child);
-	sw = (struct switch_stack *) (child->thread.ksp + 16);
-	unw_init_from_blocked_task(&info, child);
-	if (unw_unwind_to_user(&info) < 0) {
-		return -EIO;
-	}
-
-	if (((unsigned long) ppr & 0x7) != 0) {
-		dprintk("ptrace:unaligned register address %p\n", ppr);
-		return -EIO;
-	}
-
-	/* control regs */
-
-	retval |= __get_user(pt->cr_iip, &ppr->cr_iip);
-	retval |= __get_user(psr, &ppr->cr_ipsr);
-
-	/* app regs */
-
-	retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
-	retval |= __get_user(rsc, &ppr->ar[PT_AUR_RSC]);
-	retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
-	retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
-	retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
-	retval |= __get_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
-
-	retval |= __get_user(ec, &ppr->ar[PT_AUR_EC]);
-	retval |= __get_user(lc, &ppr->ar[PT_AUR_LC]);
-	retval |= __get_user(rnat, &ppr->ar[PT_AUR_RNAT]);
-	retval |= __get_user(bsp, &ppr->ar[PT_AUR_BSP]);
-	retval |= __get_user(cfm, &ppr->cfm);
-
-	/* gr1-gr3 */
-
-	retval |= __copy_from_user(&pt->r1, &ppr->gr[1], sizeof(long));
-	retval |= __copy_from_user(&pt->r2, &ppr->gr[2], sizeof(long) * 2);
-
-	/* gr4-gr7 */
-
-	for (i = 4; i < 8; i++) {
-		retval |= __get_user(val, &ppr->gr[i]);
-		/* NaT bit will be set via PT_NAT_BITS: */
-		if (unw_set_gr(&info, i, val, 0) < 0)
-			return -EIO;
-	}
-
-	/* gr8-gr11 */
-
-	retval |= __copy_from_user(&pt->r8, &ppr->gr[8], sizeof(long) * 4);
-
-	/* gr12-gr15 */
-
-	retval |= __copy_from_user(&pt->r12, &ppr->gr[12], sizeof(long) * 2);
-	retval |= __copy_from_user(&pt->r14, &ppr->gr[14], sizeof(long));
-	retval |= __copy_from_user(&pt->r15, &ppr->gr[15], sizeof(long));
-
-	/* gr16-gr31 */
-
-	retval |= __copy_from_user(&pt->r16, &ppr->gr[16], sizeof(long) * 16);
-
-	/* b0 */
-
-	retval |= __get_user(pt->b0, &ppr->br[0]);
-
-	/* b1-b5 */
-
-	for (i = 1; i < 6; i++) {
-		retval |= __get_user(val, &ppr->br[i]);
-		unw_set_br(&info, i, val);
-	}
-
-	/* b6-b7 */
-
-	retval |= __get_user(pt->b6, &ppr->br[6]);
-	retval |= __get_user(pt->b7, &ppr->br[7]);
-
-	/* fr2-fr5 */
-
-	for (i = 2; i < 6; i++) {
-		retval |= __copy_from_user(&fpval, &ppr->fr[i], sizeof(fpval));
-		if (unw_set_fr(&info, i, fpval) < 0)
-			return -EIO;
-	}
-
-	/* fr6-fr11 */
-
-	retval |= __copy_from_user(&pt->f6, &ppr->fr[6],
-				   sizeof(ppr->fr[6]) * 6);
-
-	/* fp scratch regs(12-15) */
-
-	retval |= __copy_from_user(&sw->f12, &ppr->fr[12],
-				   sizeof(ppr->fr[12]) * 4);
-
-	/* fr16-fr31 */
-
-	for (i = 16; i < 32; i++) {
-		retval |= __copy_from_user(&fpval, &ppr->fr[i],
-					   sizeof(fpval));
-		if (unw_set_fr(&info, i, fpval) < 0)
-			return -EIO;
-	}
-
-	/* fph */
-
-	ia64_sync_fph(child);
-	retval |= __copy_from_user(&child->thread.fph, &ppr->fr[32],
-				   sizeof(ppr->fr[32]) * 96);
-
-	/* preds */
-
-	retval |= __get_user(pt->pr, &ppr->pr);
-
-	/* nat bits */
-
-	retval |= __get_user(nat_bits, &ppr->nat);
-
-	retval |= access_elf_reg(child, &info, ELF_CR_IPSR_OFFSET, &psr, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_RSC_OFFSET, &rsc, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_EC_OFFSET, &ec, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_LC_OFFSET, &lc, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_RNAT_OFFSET, &rnat, 1);
-	retval |= access_elf_reg(child, &info, ELF_AR_BSP_OFFSET, &bsp, 1);
-	retval |= access_elf_reg(child, &info, ELF_CFM_OFFSET, &cfm, 1);
-	retval |= access_elf_reg(child, &info, ELF_NAT_OFFSET, &nat_bits, 1);
-
-	return retval ? -EIO : 0;
-}
-
-void
-user_enable_single_step (struct task_struct *child)
-{
-	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
-
-	set_tsk_thread_flag(child, TIF_SINGLESTEP);
-	child_psr->ss = 1;
-}
-
-void
-user_enable_block_step (struct task_struct *child)
-{
-	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
-
-	set_tsk_thread_flag(child, TIF_SINGLESTEP);
-	child_psr->tb = 1;
-}
-
-void
-user_disable_single_step (struct task_struct *child)
-{
-	struct ia64_psr *child_psr = ia64_psr(task_pt_regs(child));
-
-	/* make sure the single step/taken-branch trap bits are not set: */
-	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-	child_psr->ss = 0;
-	child_psr->tb = 0;
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-void
-ptrace_disable (struct task_struct *child)
-{
-	user_disable_single_step(child);
-}
-
-static int
-access_uarea (struct task_struct *child, unsigned long addr,
-	      unsigned long *data, int write_access);
-
-long
-arch_ptrace (struct task_struct *child, long request,
-	     unsigned long addr, unsigned long data)
-{
-	switch (request) {
-	case PTRACE_PEEKTEXT:
-	case PTRACE_PEEKDATA:
-		/* read word at location addr */
-		if (ptrace_access_vm(child, addr, &data, sizeof(data),
-				FOLL_FORCE)
-		    != sizeof(data))
-			return -EIO;
-		/* ensure return value is not mistaken for error code */
-		force_successful_syscall_return();
-		return data;
-
-	/* PTRACE_POKETEXT and PTRACE_POKEDATA is handled
-	 * by the generic ptrace_request().
-	 */
-
-	case PTRACE_PEEKUSR:
-		/* read the word at addr in the USER area */
-		if (access_uarea(child, addr, &data, 0) < 0)
-			return -EIO;
-		/* ensure return value is not mistaken for error code */
-		force_successful_syscall_return();
-		return data;
-
-	case PTRACE_POKEUSR:
-		/* write the word at addr in the USER area */
-		if (access_uarea(child, addr, &data, 1) < 0)
-			return -EIO;
-		return 0;
-
-	case PTRACE_OLD_GETSIGINFO:
-		/* for backwards-compatibility */
-		return ptrace_request(child, PTRACE_GETSIGINFO, addr, data);
-
-	case PTRACE_OLD_SETSIGINFO:
-		/* for backwards-compatibility */
-		return ptrace_request(child, PTRACE_SETSIGINFO, addr, data);
-
-	case PTRACE_GETREGS:
-		return ptrace_getregs(child,
-				      (struct pt_all_user_regs __user *) data);
-
-	case PTRACE_SETREGS:
-		return ptrace_setregs(child,
-				      (struct pt_all_user_regs __user *) data);
-
-	default:
-		return ptrace_request(child, request, addr, data);
-	}
-}
-
-
-/* "asmlinkage" so the input arguments are preserved... */
-
-asmlinkage long
-syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
-		     long arg4, long arg5, long arg6, long arg7,
-		     struct pt_regs regs)
-{
-	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		if (ptrace_report_syscall_entry(&regs))
-			return -ENOSYS;
-
-	/* copy user rbs to kernel rbs */
-	if (test_thread_flag(TIF_RESTORE_RSE))
-		ia64_sync_krbs();
-
-
-	audit_syscall_entry(regs.r15, arg0, arg1, arg2, arg3);
-
-	return 0;
-}
-
-/* "asmlinkage" so the input arguments are preserved... */
-
-asmlinkage void
-syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
-		     long arg4, long arg5, long arg6, long arg7,
-		     struct pt_regs regs)
-{
-	int step;
-
-	audit_syscall_exit(&regs);
-
-	step = test_thread_flag(TIF_SINGLESTEP);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		ptrace_report_syscall_exit(&regs, step);
-
-	/* copy user rbs to kernel rbs */
-	if (test_thread_flag(TIF_RESTORE_RSE))
-		ia64_sync_krbs();
-}
-
-/* Utrace implementation starts here */
-struct regset_get {
-	void *kbuf;
-	void __user *ubuf;
-};
-
-struct regset_set {
-	const void *kbuf;
-	const void __user *ubuf;
-};
-
-struct regset_getset {
-	struct task_struct *target;
-	const struct user_regset *regset;
-	union {
-		struct regset_get get;
-		struct regset_set set;
-	} u;
-	unsigned int pos;
-	unsigned int count;
-	int ret;
-};
-
-static const ptrdiff_t pt_offsets[32] =
-{
-#define R(n) offsetof(struct pt_regs, r##n)
-	[0] = -1, R(1), R(2), R(3),
-	[4] = -1, [5] = -1, [6] = -1, [7] = -1,
-	R(8), R(9), R(10), R(11), R(12), R(13), R(14), R(15),
-	R(16), R(17), R(18), R(19), R(20), R(21), R(22), R(23),
-	R(24), R(25), R(26), R(27), R(28), R(29), R(30), R(31),
-#undef R
-};
-
-static int
-access_elf_gpreg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	struct pt_regs *pt = task_pt_regs(target);
-	unsigned reg = addr / sizeof(unsigned long);
-	ptrdiff_t d = pt_offsets[reg];
-
-	if (d >= 0) {
-		unsigned long *ptr = (void *)pt + d;
-		if (write_access)
-			*ptr = *data;
-		else
-			*data = *ptr;
-		return 0;
-	} else {
-		char nat = 0;
-		if (write_access) {
-			/* read NaT bit first: */
-			unsigned long dummy;
-			int ret = unw_get_gr(info, reg, &dummy, &nat);
-			if (ret < 0)
-				return ret;
-		}
-		return unw_access_gr(info, reg, data, &nat, write_access);
-	}
-}
-
-static int
-access_elf_breg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	struct pt_regs *pt;
-	unsigned long *ptr = NULL;
-
-	pt = task_pt_regs(target);
-	switch (addr) {
-	case ELF_BR_OFFSET(0):
-		ptr = &pt->b0;
-		break;
-	case ELF_BR_OFFSET(1) ... ELF_BR_OFFSET(5):
-		return unw_access_br(info, (addr - ELF_BR_OFFSET(0))/8,
-				     data, write_access);
-	case ELF_BR_OFFSET(6):
-		ptr = &pt->b6;
-		break;
-	case ELF_BR_OFFSET(7):
-		ptr = &pt->b7;
-	}
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-	return 0;
-}
-
-static int
-access_elf_areg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	struct pt_regs *pt;
-	unsigned long cfm, urbs_end;
-	unsigned long *ptr = NULL;
-
-	pt = task_pt_regs(target);
-	if (addr >= ELF_AR_RSC_OFFSET && addr <= ELF_AR_SSD_OFFSET) {
-		switch (addr) {
-		case ELF_AR_RSC_OFFSET:
-			/* force PL3 */
-			if (write_access)
-				pt->ar_rsc = *data | (3 << 2);
-			else
-				*data = pt->ar_rsc;
-			return 0;
-		case ELF_AR_BSP_OFFSET:
-			/*
-			 * By convention, we use PT_AR_BSP to refer to
-			 * the end of the user-level backing store.
-			 * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof)
-			 * to get the real value of ar.bsp at the time
-			 * the kernel was entered.
-			 *
-			 * Furthermore, when changing the contents of
-			 * PT_AR_BSP (or PT_CFM) while the task is
-			 * blocked in a system call, convert the state
-			 * so that the non-system-call exit
-			 * path is used.  This ensures that the proper
-			 * state will be picked up when resuming
-			 * execution.  However, it *also* means that
-			 * once we write PT_AR_BSP/PT_CFM, it won't be
-			 * possible to modify the syscall arguments of
-			 * the pending system call any longer.  This
-			 * shouldn't be an issue because modifying
-			 * PT_AR_BSP/PT_CFM generally implies that
-			 * we're either abandoning the pending system
-			 * call or that we defer it's re-execution
-			 * (e.g., due to GDB doing an inferior
-			 * function call).
-			 */
-			urbs_end = ia64_get_user_rbs_end(target, pt, &cfm);
-			if (write_access) {
-				if (*data != urbs_end) {
-					if (in_syscall(pt))
-						convert_to_non_syscall(target,
-								       pt,
-								       cfm);
-					/*
-					 * Simulate user-level write
-					 * of ar.bsp:
-					 */
-					pt->loadrs = 0;
-					pt->ar_bspstore = *data;
-				}
-			} else
-				*data = urbs_end;
-			return 0;
-		case ELF_AR_BSPSTORE_OFFSET:
-			ptr = &pt->ar_bspstore;
-			break;
-		case ELF_AR_RNAT_OFFSET:
-			ptr = &pt->ar_rnat;
-			break;
-		case ELF_AR_CCV_OFFSET:
-			ptr = &pt->ar_ccv;
-			break;
-		case ELF_AR_UNAT_OFFSET:
-			ptr = &pt->ar_unat;
-			break;
-		case ELF_AR_FPSR_OFFSET:
-			ptr = &pt->ar_fpsr;
-			break;
-		case ELF_AR_PFS_OFFSET:
-			ptr = &pt->ar_pfs;
-			break;
-		case ELF_AR_LC_OFFSET:
-			return unw_access_ar(info, UNW_AR_LC, data,
-					     write_access);
-		case ELF_AR_EC_OFFSET:
-			return unw_access_ar(info, UNW_AR_EC, data,
-					     write_access);
-		case ELF_AR_CSD_OFFSET:
-			ptr = &pt->ar_csd;
-			break;
-		case ELF_AR_SSD_OFFSET:
-			ptr = &pt->ar_ssd;
-		}
-	} else if (addr >= ELF_CR_IIP_OFFSET && addr <= ELF_CR_IPSR_OFFSET) {
-		switch (addr) {
-		case ELF_CR_IIP_OFFSET:
-			ptr = &pt->cr_iip;
-			break;
-		case ELF_CFM_OFFSET:
-			urbs_end = ia64_get_user_rbs_end(target, pt, &cfm);
-			if (write_access) {
-				if (((cfm ^ *data) & PFM_MASK) != 0) {
-					if (in_syscall(pt))
-						convert_to_non_syscall(target,
-								       pt,
-								       cfm);
-					pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK)
-						      | (*data & PFM_MASK));
-				}
-			} else
-				*data = cfm;
-			return 0;
-		case ELF_CR_IPSR_OFFSET:
-			if (write_access) {
-				unsigned long tmp = *data;
-				/* psr.ri==3 is a reserved value: SDM 2:25 */
-				if ((tmp & IA64_PSR_RI) == IA64_PSR_RI)
-					tmp &= ~IA64_PSR_RI;
-				pt->cr_ipsr = ((tmp & IPSR_MASK)
-					       | (pt->cr_ipsr & ~IPSR_MASK));
-			} else
-				*data = (pt->cr_ipsr & IPSR_MASK);
-			return 0;
-		}
-	} else if (addr == ELF_NAT_OFFSET)
-		return access_nat_bits(target, pt, info,
-				       data, write_access);
-	else if (addr == ELF_PR_OFFSET)
-		ptr = &pt->pr;
-	else
-		return -1;
-
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-
-	return 0;
-}
-
-static int
-access_elf_reg(struct task_struct *target, struct unw_frame_info *info,
-		unsigned long addr, unsigned long *data, int write_access)
-{
-	if (addr >= ELF_GR_OFFSET(1) && addr <= ELF_GR_OFFSET(31))
-		return access_elf_gpreg(target, info, addr, data, write_access);
-	else if (addr >= ELF_BR_OFFSET(0) && addr <= ELF_BR_OFFSET(7))
-		return access_elf_breg(target, info, addr, data, write_access);
-	else
-		return access_elf_areg(target, info, addr, data, write_access);
-}
-
-struct regset_membuf {
-	struct membuf to;
-	int ret;
-};
-
-static void do_gpregs_get(struct unw_frame_info *info, void *arg)
-{
-	struct regset_membuf *dst = arg;
-	struct membuf to = dst->to;
-	unsigned int n;
-	elf_greg_t reg;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/*
-	 * coredump format:
-	 *      r0-r31
-	 *      NaT bits (for r0-r31; bit N == 1 iff rN is a NaT)
-	 *      predicate registers (p0-p63)
-	 *      b0-b7
-	 *      ip cfm user-mask
-	 *      ar.rsc ar.bsp ar.bspstore ar.rnat
-	 *      ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec
-	 */
-
-
-	/* Skip r0 */
-	membuf_zero(&to, 8);
-	for (n = 8; to.left && n < ELF_AR_END_OFFSET; n += 8) {
-		if (access_elf_reg(info->task, info, n, &reg, 0) < 0) {
-			dst->ret = -EIO;
-			return;
-		}
-		membuf_store(&to, reg);
-	}
-}
-
-static void do_gpregs_set(struct unw_frame_info *info, void *arg)
-{
-	struct regset_getset *dst = arg;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	if (!dst->count)
-		return;
-	/* Skip r0 */
-	if (dst->pos < ELF_GR_OFFSET(1)) {
-		user_regset_copyin_ignore(&dst->pos, &dst->count,
-					  &dst->u.set.kbuf, &dst->u.set.ubuf,
-					  0, ELF_GR_OFFSET(1));
-		dst->ret = 0;
-	}
-
-	while (dst->count && dst->pos < ELF_AR_END_OFFSET) {
-		unsigned int n, from, to;
-		elf_greg_t tmp[16];
-
-		from = dst->pos;
-		to = from + sizeof(tmp);
-		if (to > ELF_AR_END_OFFSET)
-			to = ELF_AR_END_OFFSET;
-		/* get up to 16 values */
-		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
-				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
-				from, to);
-		if (dst->ret)
-			return;
-		/* now copy them into registers */
-		for (n = 0; from < dst->pos; from += sizeof(elf_greg_t), n++)
-			if (access_elf_reg(dst->target, info, from,
-						&tmp[n], 1) < 0) {
-				dst->ret = -EIO;
-				return;
-			}
-	}
-}
-
-#define ELF_FP_OFFSET(i)	(i * sizeof(elf_fpreg_t))
-
-static void do_fpregs_get(struct unw_frame_info *info, void *arg)
-{
-	struct task_struct *task = info->task;
-	struct regset_membuf *dst = arg;
-	struct membuf to = dst->to;
-	elf_fpreg_t reg;
-	unsigned int n;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/* Skip pos 0 and 1 */
-	membuf_zero(&to, 2 * sizeof(elf_fpreg_t));
-
-	/* fr2-fr31 */
-	for (n = 2; to.left && n < 32; n++) {
-		if (unw_get_fr(info, n, &reg)) {
-			dst->ret = -EIO;
-			return;
-		}
-		membuf_write(&to, &reg, sizeof(reg));
-	}
-
-	/* fph */
-	if (!to.left)
-		return;
-
-	ia64_flush_fph(task);
-	if (task->thread.flags & IA64_THREAD_FPH_VALID)
-		membuf_write(&to, &task->thread.fph, 96 * sizeof(reg));
-	else
-		membuf_zero(&to, 96 * sizeof(reg));
-}
-
-static void do_fpregs_set(struct unw_frame_info *info, void *arg)
-{
-	struct regset_getset *dst = arg;
-	elf_fpreg_t fpreg, tmp[30];
-	int index, start, end;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/* Skip pos 0 and 1 */
-	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) {
-		user_regset_copyin_ignore(&dst->pos, &dst->count,
-					  &dst->u.set.kbuf, &dst->u.set.ubuf,
-					  0, ELF_FP_OFFSET(2));
-		dst->ret = 0;
-		if (dst->count == 0)
-			return;
-	}
-
-	/* fr2-fr31 */
-	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) {
-		start = dst->pos;
-		end = min(((unsigned int)ELF_FP_OFFSET(32)),
-			 dst->pos + dst->count);
-		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
-				&dst->u.set.kbuf, &dst->u.set.ubuf, tmp,
-				ELF_FP_OFFSET(2), ELF_FP_OFFSET(32));
-		if (dst->ret)
-			return;
-
-		if (start & 0xF) { /* only write high part */
-			if (unw_get_fr(info, start / sizeof(elf_fpreg_t),
-					 &fpreg)) {
-				dst->ret = -EIO;
-				return;
-			}
-			tmp[start / sizeof(elf_fpreg_t) - 2].u.bits[0]
-				= fpreg.u.bits[0];
-			start &= ~0xFUL;
-		}
-		if (end & 0xF) { /* only write low part */
-			if (unw_get_fr(info, end / sizeof(elf_fpreg_t),
-					&fpreg)) {
-				dst->ret = -EIO;
-				return;
-			}
-			tmp[end / sizeof(elf_fpreg_t) - 2].u.bits[1]
-				= fpreg.u.bits[1];
-			end = (end + 0xF) & ~0xFUL;
-		}
-
-		for ( ;	start < end ; start += sizeof(elf_fpreg_t)) {
-			index = start / sizeof(elf_fpreg_t);
-			if (unw_set_fr(info, index, tmp[index - 2])) {
-				dst->ret = -EIO;
-				return;
-			}
-		}
-		if (dst->ret || dst->count == 0)
-			return;
-	}
-
-	/* fph */
-	if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(128)) {
-		ia64_sync_fph(dst->target);
-		dst->ret = user_regset_copyin(&dst->pos, &dst->count,
-						&dst->u.set.kbuf,
-						&dst->u.set.ubuf,
-						&dst->target->thread.fph,
-						ELF_FP_OFFSET(32), -1);
-	}
-}
-
-static void
-unwind_and_call(void (*call)(struct unw_frame_info *, void *),
-	       struct task_struct *target, void *data)
-{
-	if (target == current)
-		unw_init_running(call, data);
-	else {
-		struct unw_frame_info info;
-		memset(&info, 0, sizeof(info));
-		unw_init_from_blocked_task(&info, target);
-		(*call)(&info, data);
-	}
-}
-
-static int
-do_regset_call(void (*call)(struct unw_frame_info *, void *),
-	       struct task_struct *target,
-	       const struct user_regset *regset,
-	       unsigned int pos, unsigned int count,
-	       const void *kbuf, const void __user *ubuf)
-{
-	struct regset_getset info = { .target = target, .regset = regset,
-				 .pos = pos, .count = count,
-				 .u.set = { .kbuf = kbuf, .ubuf = ubuf },
-				 .ret = 0 };
-	unwind_and_call(call, target, &info);
-	return info.ret;
-}
-
-static int
-gpregs_get(struct task_struct *target,
-	   const struct user_regset *regset,
-	   struct membuf to)
-{
-	struct regset_membuf info = {.to = to};
-	unwind_and_call(do_gpregs_get, target, &info);
-	return info.ret;
-}
-
-static int gpregs_set(struct task_struct *target,
-		const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		const void *kbuf, const void __user *ubuf)
-{
-	return do_regset_call(do_gpregs_set, target, regset, pos, count,
-		kbuf, ubuf);
-}
-
-static void do_gpregs_writeback(struct unw_frame_info *info, void *arg)
-{
-	do_sync_rbs(info, ia64_sync_user_rbs);
-}
-
-/*
- * This is called to write back the register backing store.
- * ptrace does this before it stops, so that a tracer reading the user
- * memory after the thread stops will get the current register data.
- */
-static int
-gpregs_writeback(struct task_struct *target,
-		 const struct user_regset *regset,
-		 int now)
-{
-	if (test_and_set_tsk_thread_flag(target, TIF_RESTORE_RSE))
-		return 0;
-	set_notify_resume(target);
-	return do_regset_call(do_gpregs_writeback, target, regset, 0, 0,
-		NULL, NULL);
-}
-
-static int
-fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-	return (target->thread.flags & IA64_THREAD_FPH_VALID) ? 128 : 32;
-}
-
-static int fpregs_get(struct task_struct *target,
-		const struct user_regset *regset,
-		struct membuf to)
-{
-	struct regset_membuf info = {.to = to};
-	unwind_and_call(do_fpregs_get, target, &info);
-	return info.ret;
-}
-
-static int fpregs_set(struct task_struct *target,
-		const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		const void *kbuf, const void __user *ubuf)
-{
-	return do_regset_call(do_fpregs_set, target, regset, pos, count,
-		kbuf, ubuf);
-}
-
-static int
-access_uarea(struct task_struct *child, unsigned long addr,
-	      unsigned long *data, int write_access)
-{
-	unsigned int pos = -1; /* an invalid value */
-	unsigned long *ptr, regnum;
-
-	if ((addr & 0x7) != 0) {
-		dprintk("ptrace: unaligned register address 0x%lx\n", addr);
-		return -1;
-	}
-	if ((addr >= PT_NAT_BITS + 8 && addr < PT_F2) ||
-		(addr >= PT_R7 + 8 && addr < PT_B1) ||
-		(addr >= PT_AR_LC + 8 && addr < PT_CR_IPSR) ||
-		(addr >= PT_AR_SSD + 8 && addr < PT_DBR)) {
-		dprintk("ptrace: rejecting access to register "
-					"address 0x%lx\n", addr);
-		return -1;
-	}
-
-	switch (addr) {
-	case PT_F32 ... (PT_F127 + 15):
-		pos = addr - PT_F32 + ELF_FP_OFFSET(32);
-		break;
-	case PT_F2 ... (PT_F5 + 15):
-		pos = addr - PT_F2 + ELF_FP_OFFSET(2);
-		break;
-	case PT_F10 ... (PT_F31 + 15):
-		pos = addr - PT_F10 + ELF_FP_OFFSET(10);
-		break;
-	case PT_F6 ... (PT_F9 + 15):
-		pos = addr - PT_F6 + ELF_FP_OFFSET(6);
-		break;
-	}
-
-	if (pos != -1) {
-		unsigned reg = pos / sizeof(elf_fpreg_t);
-		int which_half = (pos / sizeof(unsigned long)) & 1;
-
-		if (reg < 32) { /* fr2-fr31 */
-			struct unw_frame_info info;
-			elf_fpreg_t fpreg;
-
-			memset(&info, 0, sizeof(info));
-			unw_init_from_blocked_task(&info, child);
-			if (unw_unwind_to_user(&info) < 0)
-				return 0;
-
-			if (unw_get_fr(&info, reg, &fpreg))
-				return -1;
-			if (write_access) {
-				fpreg.u.bits[which_half] = *data;
-				if (unw_set_fr(&info, reg, fpreg))
-					return -1;
-			} else {
-				*data = fpreg.u.bits[which_half];
-			}
-		} else { /* fph */
-			elf_fpreg_t *p = &child->thread.fph[reg - 32];
-			unsigned long *bits = &p->u.bits[which_half];
-
-			ia64_sync_fph(child);
-			if (write_access)
-				*bits = *data;
-			else if (child->thread.flags & IA64_THREAD_FPH_VALID)
-				*data = *bits;
-			else
-				*data = 0;
-		}
-		return 0;
-	}
-
-	switch (addr) {
-	case PT_NAT_BITS:
-		pos = ELF_NAT_OFFSET;
-		break;
-	case PT_R4 ... PT_R7:
-		pos = addr - PT_R4 + ELF_GR_OFFSET(4);
-		break;
-	case PT_B1 ... PT_B5:
-		pos = addr - PT_B1 + ELF_BR_OFFSET(1);
-		break;
-	case PT_AR_EC:
-		pos = ELF_AR_EC_OFFSET;
-		break;
-	case PT_AR_LC:
-		pos = ELF_AR_LC_OFFSET;
-		break;
-	case PT_CR_IPSR:
-		pos = ELF_CR_IPSR_OFFSET;
-		break;
-	case PT_CR_IIP:
-		pos = ELF_CR_IIP_OFFSET;
-		break;
-	case PT_CFM:
-		pos = ELF_CFM_OFFSET;
-		break;
-	case PT_AR_UNAT:
-		pos = ELF_AR_UNAT_OFFSET;
-		break;
-	case PT_AR_PFS:
-		pos = ELF_AR_PFS_OFFSET;
-		break;
-	case PT_AR_RSC:
-		pos = ELF_AR_RSC_OFFSET;
-		break;
-	case PT_AR_RNAT:
-		pos = ELF_AR_RNAT_OFFSET;
-		break;
-	case PT_AR_BSPSTORE:
-		pos = ELF_AR_BSPSTORE_OFFSET;
-		break;
-	case PT_PR:
-		pos = ELF_PR_OFFSET;
-		break;
-	case PT_B6:
-		pos = ELF_BR_OFFSET(6);
-		break;
-	case PT_AR_BSP:
-		pos = ELF_AR_BSP_OFFSET;
-		break;
-	case PT_R1 ... PT_R3:
-		pos = addr - PT_R1 + ELF_GR_OFFSET(1);
-		break;
-	case PT_R12 ... PT_R15:
-		pos = addr - PT_R12 + ELF_GR_OFFSET(12);
-		break;
-	case PT_R8 ... PT_R11:
-		pos = addr - PT_R8 + ELF_GR_OFFSET(8);
-		break;
-	case PT_R16 ... PT_R31:
-		pos = addr - PT_R16 + ELF_GR_OFFSET(16);
-		break;
-	case PT_AR_CCV:
-		pos = ELF_AR_CCV_OFFSET;
-		break;
-	case PT_AR_FPSR:
-		pos = ELF_AR_FPSR_OFFSET;
-		break;
-	case PT_B0:
-		pos = ELF_BR_OFFSET(0);
-		break;
-	case PT_B7:
-		pos = ELF_BR_OFFSET(7);
-		break;
-	case PT_AR_CSD:
-		pos = ELF_AR_CSD_OFFSET;
-		break;
-	case PT_AR_SSD:
-		pos = ELF_AR_SSD_OFFSET;
-		break;
-	}
-
-	if (pos != -1) {
-		struct unw_frame_info info;
-
-		memset(&info, 0, sizeof(info));
-		unw_init_from_blocked_task(&info, child);
-		if (unw_unwind_to_user(&info) < 0)
-			return 0;
-
-		return access_elf_reg(child, &info, pos, data, write_access);
-	}
-
-	/* access debug registers */
-	if (addr >= PT_IBR) {
-		regnum = (addr - PT_IBR) >> 3;
-		ptr = &child->thread.ibr[0];
-	} else {
-		regnum = (addr - PT_DBR) >> 3;
-		ptr = &child->thread.dbr[0];
-	}
-
-	if (regnum >= 8) {
-		dprintk("ptrace: rejecting access to register "
-				"address 0x%lx\n", addr);
-		return -1;
-	}
-
-	if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) {
-		child->thread.flags |= IA64_THREAD_DBG_VALID;
-		memset(child->thread.dbr, 0,
-				sizeof(child->thread.dbr));
-		memset(child->thread.ibr, 0,
-				sizeof(child->thread.ibr));
-	}
-
-	ptr += regnum;
-
-	if ((regnum & 1) && write_access) {
-		/* don't let the user set kernel-level breakpoints: */
-		*ptr = *data & ~(7UL << 56);
-		return 0;
-	}
-	if (write_access)
-		*ptr = *data;
-	else
-		*data = *ptr;
-	return 0;
-}
-
-static const struct user_regset native_regsets[] = {
-	{
-		.core_note_type = NT_PRSTATUS,
-		.n = ELF_NGREG,
-		.size = sizeof(elf_greg_t), .align = sizeof(elf_greg_t),
-		.regset_get = gpregs_get, .set = gpregs_set,
-		.writeback = gpregs_writeback
-	},
-	{
-		.core_note_type = NT_PRFPREG,
-		.n = ELF_NFPREG,
-		.size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t),
-		.regset_get = fpregs_get, .set = fpregs_set, .active = fpregs_active
-	},
-};
-
-static const struct user_regset_view user_ia64_view = {
-	.name = "ia64",
-	.e_machine = EM_IA_64,
-	.regsets = native_regsets, .n = ARRAY_SIZE(native_regsets)
-};
-
-const struct user_regset_view *task_user_regset_view(struct task_struct *tsk)
-{
-	return &user_ia64_view;
-}
-
-struct syscall_get_args {
-	unsigned int i;
-	unsigned int n;
-	unsigned long *args;
-	struct pt_regs *regs;
-};
-
-static void syscall_get_args_cb(struct unw_frame_info *info, void *data)
-{
-	struct syscall_get_args *args = data;
-	struct pt_regs *pt = args->regs;
-	unsigned long *krbs, cfm, ndirty, nlocals, nouts;
-	int i, count;
-
-	if (unw_unwind_to_user(info) < 0)
-		return;
-
-	/*
-	 * We get here via a few paths:
-	 * - break instruction: cfm is shared with caller.
-	 *   syscall args are in out= regs, locals are non-empty.
-	 * - epsinstruction: cfm is set by br.call
-	 *   locals don't exist.
-	 *
-	 * For both cases arguments are reachable in cfm.sof - cfm.sol.
-	 * CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ]
-	 */
-	cfm = pt->cr_ifs;
-	nlocals = (cfm >> 7) & 0x7f; /* aka sol */
-	nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */
-	krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8;
-	ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
-
-	count = 0;
-	if (in_syscall(pt))
-		count = min_t(int, args->n, nouts);
-
-	/* Iterate over outs. */
-	for (i = 0; i < count; i++) {
-		int j = ndirty + nlocals + i + args->i;
-		args->args[i] = *ia64_rse_skip_regs(krbs, j);
-	}
-
-	while (i < args->n) {
-		args->args[i] = 0;
-		i++;
-	}
-}
-
-void syscall_get_arguments(struct task_struct *task,
-	struct pt_regs *regs, unsigned long *args)
-{
-	struct syscall_get_args data = {
-		.i = 0,
-		.n = 6,
-		.args = args,
-		.regs = regs,
-	};
-
-	if (task == current)
-		unw_init_running(syscall_get_args_cb, &data);
-	else {
-		struct unw_frame_info ufi;
-		memset(&ufi, 0, sizeof(ufi));
-		unw_init_from_blocked_task(&ufi, task);
-		syscall_get_args_cb(&ufi, &data);
-	}
-}
diff --git a/arch/ia64/kernel/relocate_kernel.S b/arch/ia64/kernel/relocate_kernel.S
deleted file mode 100644
index 527a7b896a6e..000000000000
--- a/arch/ia64/kernel/relocate_kernel.S
+++ /dev/null
@@ -1,321 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * arch/ia64/kernel/relocate_kernel.S
- *
- * Relocate kexec'able kernel and start it
- *
- * Copyright (C) 2005 Hewlett-Packard Development Company, L.P.
- * Copyright (C) 2005 Khalid Aziz  <khalid.aziz@hp.com>
- * Copyright (C) 2005 Intel Corp,  Zou Nan hai <nanhai.zou@intel.com>
- */
-#include <linux/pgtable.h>
-#include <asm/asmmacro.h>
-#include <asm/kregs.h>
-#include <asm/page.h>
-#include <asm/mca_asm.h>
-
-       /* Must be relocatable PIC code callable as a C function
-        */
-GLOBAL_ENTRY(relocate_new_kernel)
-	.prologue
-	alloc r31=ar.pfs,4,0,0,0
-        .body
-.reloc_entry:
-{
-	rsm psr.i| psr.ic
-	mov r2=ip
-}
-	;;
-{
-        flushrs                         // must be first insn in group
-        srlz.i
-}
-	;;
-	dep r2=0,r2,61,3		//to physical address
-	;;
-	//first switch to physical mode
-	add r3=1f-.reloc_entry, r2
-	movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC
-	mov ar.rsc=0	          	// put RSE in enforced lazy mode
-	;;
-	add sp=(memory_stack_end - 16 - .reloc_entry),r2
-	add r8=(register_stack - .reloc_entry),r2
-	;;
-	mov r18=ar.rnat
-	mov ar.bspstore=r8
-	;;
-        mov cr.ipsr=r16
-        mov cr.iip=r3
-        mov cr.ifs=r0
-	srlz.i
-	;;
-	mov ar.rnat=r18
-	rfi				// note: this unmask MCA/INIT (psr.mc)
-	;;
-1:
-	//physical mode code begin
-	mov b6=in1
-	dep r28=0,in2,61,3	//to physical address
-
-	// purge all TC entries
-#define O(member)       IA64_CPUINFO_##member##_OFFSET
-        GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2
-        ;;
-        addl r17=O(PTCE_STRIDE),r2
-        addl r2=O(PTCE_BASE),r2
-        ;;
-        ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));;    	// r18=ptce_base
-        ld4 r19=[r2],4                                  // r19=ptce_count[0]
-        ld4 r21=[r17],4                                 // r21=ptce_stride[0]
-        ;;
-        ld4 r20=[r2]                                    // r20=ptce_count[1]
-        ld4 r22=[r17]                                   // r22=ptce_stride[1]
-        mov r24=r0
-        ;;
-        adds r20=-1,r20
-        ;;
-#undef O
-2:
-        cmp.ltu p6,p7=r24,r19
-(p7)    br.cond.dpnt.few 4f
-        mov ar.lc=r20
-3:
-        ptc.e r18
-        ;;
-        add r18=r22,r18
-        br.cloop.sptk.few 3b
-        ;;
-        add r18=r21,r18
-        add r24=1,r24
-        ;;
-        br.sptk.few 2b
-4:
-        srlz.i
-        ;;
-	// purge TR entry for kernel text and data
-        movl r16=KERNEL_START
-        mov r18=KERNEL_TR_PAGE_SHIFT<<2
-        ;;
-        ptr.i r16, r18
-        ptr.d r16, r18
-        ;;
-        srlz.i
-        ;;
-
-        // purge TR entry for pal code
-        mov r16=in3
-        mov r18=IA64_GRANULE_SHIFT<<2
-        ;;
-        ptr.i r16,r18
-        ;;
-        srlz.i
-	;;
-
-        // purge TR entry for stack
-        mov r16=IA64_KR(CURRENT_STACK)
-        ;;
-        shl r16=r16,IA64_GRANULE_SHIFT
-        movl r19=PAGE_OFFSET
-        ;;
-        add r16=r19,r16
-        mov r18=IA64_GRANULE_SHIFT<<2
-        ;;
-        ptr.d r16,r18
-        ;;
-        srlz.i
-	;;
-
-	//copy segments
-	movl r16=PAGE_MASK
-        mov  r30=in0                    // in0 is page_list
-        br.sptk.few .dest_page
-	;;
-.loop:
-	ld8  r30=[in0], 8;;
-.dest_page:
-	tbit.z p0, p6=r30, 0;;    	// 0x1 dest page
-(p6)	and r17=r30, r16
-(p6)	br.cond.sptk.few .loop;;
-
-	tbit.z p0, p6=r30, 1;;		// 0x2 indirect page
-(p6)	and in0=r30, r16
-(p6)	br.cond.sptk.few .loop;;
-
-	tbit.z p0, p6=r30, 2;;		// 0x4 end flag
-(p6)	br.cond.sptk.few .end_loop;;
-
-	tbit.z p6, p0=r30, 3;;		// 0x8 source page
-(p6)	br.cond.sptk.few .loop
-
-	and r18=r30, r16
-
-	// simple copy page, may optimize later
-	movl r14=PAGE_SIZE/8 - 1;;
-	mov ar.lc=r14;;
-1:
-	ld8 r14=[r18], 8;;
-	st8 [r17]=r14;;
-	fc.i r17
-	add r17=8, r17
-	br.ctop.sptk.few 1b
-	br.sptk.few .loop
-	;;
-
-.end_loop:
-	sync.i			// for fc.i
-	;;
-	srlz.i
-	;;
-	srlz.d
-	;;
-	br.call.sptk.many b0=b6;;
-
-.align  32
-memory_stack:
-	.fill           8192, 1, 0
-memory_stack_end:
-register_stack:
-	.fill           8192, 1, 0
-register_stack_end:
-relocate_new_kernel_end:
-END(relocate_new_kernel)
-
-.global relocate_new_kernel_size
-relocate_new_kernel_size:
-	data8	relocate_new_kernel_end - relocate_new_kernel
-
-GLOBAL_ENTRY(ia64_dump_cpu_regs)
-        .prologue
-        alloc loc0=ar.pfs,1,2,0,0
-        .body
-        mov     ar.rsc=0                // put RSE in enforced lazy mode
-        add     loc1=4*8, in0           // save r4 and r5 first
-        ;;
-{
-        flushrs                         // flush dirty regs to backing store
-        srlz.i
-}
-        st8 [loc1]=r4, 8
-        ;;
-        st8 [loc1]=r5, 8
-        ;;
-        add loc1=32*8, in0
-        mov r4=ar.rnat
-        ;;
-        st8 [in0]=r0, 8			// r0
-        st8 [loc1]=r4, 8		// rnat
-        mov r5=pr
-        ;;
-        st8 [in0]=r1, 8			// r1
-        st8 [loc1]=r5, 8		// pr
-        mov r4=b0
-        ;;
-        st8 [in0]=r2, 8			// r2
-        st8 [loc1]=r4, 8		// b0
-        mov r5=b1;
-        ;;
-        st8 [in0]=r3, 24		// r3
-        st8 [loc1]=r5, 8		// b1
-        mov r4=b2
-        ;;
-        st8 [in0]=r6, 8			// r6
-        st8 [loc1]=r4, 8		// b2
-	mov r5=b3
-        ;;
-        st8 [in0]=r7, 8			// r7
-        st8 [loc1]=r5, 8		// b3
-        mov r4=b4
-        ;;
-        st8 [in0]=r8, 8			// r8
-        st8 [loc1]=r4, 8		// b4
-        mov r5=b5
-        ;;
-        st8 [in0]=r9, 8			// r9
-        st8 [loc1]=r5, 8		// b5
-        mov r4=b6
-        ;;
-        st8 [in0]=r10, 8		// r10
-        st8 [loc1]=r5, 8		// b6
-        mov r5=b7
-        ;;
-        st8 [in0]=r11, 8		// r11
-        st8 [loc1]=r5, 8		// b7
-        mov r4=b0
-        ;;
-        st8 [in0]=r12, 8		// r12
-        st8 [loc1]=r4, 8		// ip
-        mov r5=loc0
-	;;
-        st8 [in0]=r13, 8		// r13
-        extr.u r5=r5, 0, 38		// ar.pfs.pfm
-	mov r4=r0			// user mask
-        ;;
-        st8 [in0]=r14, 8		// r14
-        st8 [loc1]=r5, 8		// cfm
-        ;;
-        st8 [in0]=r15, 8		// r15
-        st8 [loc1]=r4, 8        	// user mask
-	mov r5=ar.rsc
-        ;;
-        st8 [in0]=r16, 8		// r16
-        st8 [loc1]=r5, 8        	// ar.rsc
-        mov r4=ar.bsp
-        ;;
-        st8 [in0]=r17, 8		// r17
-        st8 [loc1]=r4, 8        	// ar.bsp
-        mov r5=ar.bspstore
-        ;;
-        st8 [in0]=r18, 8		// r18
-        st8 [loc1]=r5, 8        	// ar.bspstore
-        mov r4=ar.rnat
-        ;;
-        st8 [in0]=r19, 8		// r19
-        st8 [loc1]=r4, 8        	// ar.rnat
-        mov r5=ar.ccv
-        ;;
-        st8 [in0]=r20, 8		// r20
-	st8 [loc1]=r5, 8        	// ar.ccv
-        mov r4=ar.unat
-        ;;
-        st8 [in0]=r21, 8		// r21
-        st8 [loc1]=r4, 8        	// ar.unat
-        mov r5 = ar.fpsr
-        ;;
-        st8 [in0]=r22, 8		// r22
-        st8 [loc1]=r5, 8        	// ar.fpsr
-        mov r4 = ar.unat
-        ;;
-        st8 [in0]=r23, 8		// r23
-        st8 [loc1]=r4, 8        	// unat
-        mov r5 = ar.fpsr
-        ;;
-        st8 [in0]=r24, 8		// r24
-        st8 [loc1]=r5, 8        	// fpsr
-        mov r4 = ar.pfs
-        ;;
-        st8 [in0]=r25, 8		// r25
-        st8 [loc1]=r4, 8        	// ar.pfs
-        mov r5 = ar.lc
-        ;;
-        st8 [in0]=r26, 8		// r26
-        st8 [loc1]=r5, 8        	// ar.lc
-        mov r4 = ar.ec
-        ;;
-        st8 [in0]=r27, 8		// r27
-        st8 [loc1]=r4, 8        	// ar.ec
-        mov r5 = ar.csd
-        ;;
-        st8 [in0]=r28, 8		// r28
-        st8 [loc1]=r5, 8        	// ar.csd
-        mov r4 = ar.ssd
-        ;;
-        st8 [in0]=r29, 8		// r29
-        st8 [loc1]=r4, 8        	// ar.ssd
-        ;;
-        st8 [in0]=r30, 8		// r30
-        ;;
-	st8 [in0]=r31, 8		// r31
-        mov ar.pfs=loc0
-        ;;
-        br.ret.sptk.many rp
-END(ia64_dump_cpu_regs)
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c
deleted file mode 100644
index e4f0705c0282..000000000000
--- a/arch/ia64/kernel/sal.c
+++ /dev/null
@@ -1,400 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * System Abstraction Layer (SAL) interface routines.
- *
- * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/string.h>
-
-#include <asm/delay.h>
-#include <asm/page.h>
-#include <asm/sal.h>
-#include <asm/pal.h>
-#include <asm/xtp.h>
-
- __cacheline_aligned DEFINE_SPINLOCK(sal_lock);
-unsigned long sal_platform_features;
-
-unsigned short sal_revision;
-unsigned short sal_version;
-
-#define SAL_MAJOR(x) ((x) >> 8)
-#define SAL_MINOR(x) ((x) & 0xff)
-
-static struct {
-	void *addr;	/* function entry point */
-	void *gpval;	/* gp value to use */
-} pdesc;
-
-static long
-default_handler (void)
-{
-	return -1;
-}
-
-ia64_sal_handler ia64_sal = (ia64_sal_handler) default_handler;
-ia64_sal_desc_ptc_t *ia64_ptc_domain_info;
-
-const char *
-ia64_sal_strerror (long status)
-{
-	const char *str;
-	switch (status) {
-	      case 0: str = "Call completed without error"; break;
-	      case 1: str = "Effect a warm boot of the system to complete "
-			      "the update"; break;
-	      case -1: str = "Not implemented"; break;
-	      case -2: str = "Invalid argument"; break;
-	      case -3: str = "Call completed with error"; break;
-	      case -4: str = "Virtual address not registered"; break;
-	      case -5: str = "No information available"; break;
-	      case -6: str = "Insufficient space to add the entry"; break;
-	      case -7: str = "Invalid entry_addr value"; break;
-	      case -8: str = "Invalid interrupt vector"; break;
-	      case -9: str = "Requested memory not available"; break;
-	      case -10: str = "Unable to write to the NVM device"; break;
-	      case -11: str = "Invalid partition type specified"; break;
-	      case -12: str = "Invalid NVM_Object id specified"; break;
-	      case -13: str = "NVM_Object already has the maximum number "
-				"of partitions"; break;
-	      case -14: str = "Insufficient space in partition for the "
-				"requested write sub-function"; break;
-	      case -15: str = "Insufficient data buffer space for the "
-				"requested read record sub-function"; break;
-	      case -16: str = "Scratch buffer required for the write/delete "
-				"sub-function"; break;
-	      case -17: str = "Insufficient space in the NVM_Object for the "
-				"requested create sub-function"; break;
-	      case -18: str = "Invalid value specified in the partition_rec "
-				"argument"; break;
-	      case -19: str = "Record oriented I/O not supported for this "
-				"partition"; break;
-	      case -20: str = "Bad format of record to be written or "
-				"required keyword variable not "
-				"specified"; break;
-	      default: str = "Unknown SAL status code"; break;
-	}
-	return str;
-}
-
-void __init
-ia64_sal_handler_init (void *entry_point, void *gpval)
-{
-	/* fill in the SAL procedure descriptor and point ia64_sal to it: */
-	pdesc.addr = entry_point;
-	pdesc.gpval = gpval;
-	ia64_sal = (ia64_sal_handler) &pdesc;
-}
-
-static void __init
-check_versions (struct ia64_sal_systab *systab)
-{
-	sal_revision = (systab->sal_rev_major << 8) | systab->sal_rev_minor;
-	sal_version = (systab->sal_b_rev_major << 8) | systab->sal_b_rev_minor;
-
-	/* Check for broken firmware */
-	if ((sal_revision == SAL_VERSION_CODE(49, 29))
-	    && (sal_version == SAL_VERSION_CODE(49, 29)))
-	{
-		/*
-		 * Old firmware for zx2000 prototypes have this weird version number,
-		 * reset it to something sane.
-		 */
-		sal_revision = SAL_VERSION_CODE(2, 8);
-		sal_version = SAL_VERSION_CODE(0, 0);
-	}
-}
-
-static void __init
-sal_desc_entry_point (void *p)
-{
-	struct ia64_sal_desc_entry_point *ep = p;
-	ia64_pal_handler_init(__va(ep->pal_proc));
-	ia64_sal_handler_init(__va(ep->sal_proc), __va(ep->gp));
-}
-
-#ifdef CONFIG_SMP
-static void __init
-set_smp_redirect (int flag)
-{
-#ifndef CONFIG_HOTPLUG_CPU
-	if (no_int_routing)
-		smp_int_redirect &= ~flag;
-	else
-		smp_int_redirect |= flag;
-#else
-	/*
-	 * For CPU Hotplug we dont want to do any chipset supported
-	 * interrupt redirection. The reason is this would require that
-	 * All interrupts be stopped and hard bind the irq to a cpu.
-	 * Later when the interrupt is fired we need to set the redir hint
-	 * on again in the vector. This is cumbersome for something that the
-	 * user mode irq balancer will solve anyways.
-	 */
-	no_int_routing=1;
-	smp_int_redirect &= ~flag;
-#endif
-}
-#else
-#define set_smp_redirect(flag)	do { } while (0)
-#endif
-
-static void __init
-sal_desc_platform_feature (void *p)
-{
-	struct ia64_sal_desc_platform_feature *pf = p;
-	sal_platform_features = pf->feature_mask;
-
-	printk(KERN_INFO "SAL Platform features:");
-	if (!sal_platform_features) {
-		printk(" None\n");
-		return;
-	}
-
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_BUS_LOCK)
-		printk(" BusLock");
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT) {
-		printk(" IRQ_Redirection");
-		set_smp_redirect(SMP_IRQ_REDIRECTION);
-	}
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT) {
-		printk(" IPI_Redirection");
-		set_smp_redirect(SMP_IPI_REDIRECTION);
-	}
-	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)
-		printk(" ITC_Drift");
-	printk("\n");
-}
-
-#ifdef CONFIG_SMP
-static void __init
-sal_desc_ap_wakeup (void *p)
-{
-	struct ia64_sal_desc_ap_wakeup *ap = p;
-
-	switch (ap->mechanism) {
-	case IA64_SAL_AP_EXTERNAL_INT:
-		ap_wakeup_vector = ap->vector;
-		printk(KERN_INFO "SAL: AP wakeup using external interrupt "
-				"vector 0x%lx\n", ap_wakeup_vector);
-		break;
-	default:
-		printk(KERN_ERR "SAL: AP wakeup mechanism unsupported!\n");
-		break;
-	}
-}
-
-static void __init
-chk_nointroute_opt(void)
-{
-	char *cp;
-
-	for (cp = boot_command_line; *cp; ) {
-		if (memcmp(cp, "nointroute", 10) == 0) {
-			no_int_routing = 1;
-			printk ("no_int_routing on\n");
-			break;
-		} else {
-			while (*cp != ' ' && *cp)
-				++cp;
-			while (*cp == ' ')
-				++cp;
-		}
-	}
-}
-
-#else
-static void __init sal_desc_ap_wakeup(void *p) { }
-#endif
-
-/*
- * HP rx5670 firmware polls for interrupts during SAL_CACHE_FLUSH by reading
- * cr.ivr, but it never writes cr.eoi.  This leaves any interrupt marked as
- * "in-service" and masks other interrupts of equal or lower priority.
- *
- * HP internal defect reports: F1859, F2775, F3031.
- */
-static int sal_cache_flush_drops_interrupts;
-
-static int __init
-force_pal_cache_flush(char *str)
-{
-	sal_cache_flush_drops_interrupts = 1;
-	return 0;
-}
-early_param("force_pal_cache_flush", force_pal_cache_flush);
-
-void __init
-check_sal_cache_flush (void)
-{
-	unsigned long flags;
-	int cpu;
-	u64 vector, cache_type = 3;
-	struct ia64_sal_retval isrv;
-
-	if (sal_cache_flush_drops_interrupts)
-		return;
-
-	cpu = get_cpu();
-	local_irq_save(flags);
-
-	/*
-	 * Send ourselves a timer interrupt, wait until it's reported, and see
-	 * if SAL_CACHE_FLUSH drops it.
-	 */
-	ia64_send_ipi(cpu, IA64_TIMER_VECTOR, IA64_IPI_DM_INT, 0);
-
-	while (!ia64_get_irr(IA64_TIMER_VECTOR))
-		cpu_relax();
-
-	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
-
-	if (isrv.status)
-		printk(KERN_ERR "SAL_CAL_FLUSH failed with %ld\n", isrv.status);
-
-	if (ia64_get_irr(IA64_TIMER_VECTOR)) {
-		vector = ia64_get_ivr();
-		ia64_eoi();
-		WARN_ON(vector != IA64_TIMER_VECTOR);
-	} else {
-		sal_cache_flush_drops_interrupts = 1;
-		printk(KERN_ERR "SAL: SAL_CACHE_FLUSH drops interrupts; "
-			"PAL_CACHE_FLUSH will be used instead\n");
-		ia64_eoi();
-	}
-
-	local_irq_restore(flags);
-	put_cpu();
-}
-
-s64
-ia64_sal_cache_flush (u64 cache_type)
-{
-	struct ia64_sal_retval isrv;
-
-	if (sal_cache_flush_drops_interrupts) {
-		unsigned long flags;
-		u64 progress;
-		s64 rc;
-
-		progress = 0;
-		local_irq_save(flags);
-		rc = ia64_pal_cache_flush(cache_type,
-			PAL_CACHE_FLUSH_INVALIDATE, &progress, NULL);
-		local_irq_restore(flags);
-		return rc;
-	}
-
-	SAL_CALL(isrv, SAL_CACHE_FLUSH, cache_type, 0, 0, 0, 0, 0, 0);
-	return isrv.status;
-}
-EXPORT_SYMBOL_GPL(ia64_sal_cache_flush);
-
-void __init
-ia64_sal_init (struct ia64_sal_systab *systab)
-{
-	char *p;
-	int i;
-
-	if (!systab) {
-		printk(KERN_WARNING "Hmm, no SAL System Table.\n");
-		return;
-	}
-
-	if (strncmp(systab->signature, "SST_", 4) != 0)
-		printk(KERN_ERR "bad signature in system table!");
-
-	check_versions(systab);
-#ifdef CONFIG_SMP
-	chk_nointroute_opt();
-#endif
-
-	/* revisions are coded in BCD, so %x does the job for us */
-	printk(KERN_INFO "SAL %x.%x: %.32s %.32s%sversion %x.%x\n",
-			SAL_MAJOR(sal_revision), SAL_MINOR(sal_revision),
-			systab->oem_id, systab->product_id,
-			systab->product_id[0] ? " " : "",
-			SAL_MAJOR(sal_version), SAL_MINOR(sal_version));
-
-	p = (char *) (systab + 1);
-	for (i = 0; i < systab->entry_count; i++) {
-		/*
-		 * The first byte of each entry type contains the type
-		 * descriptor.
-		 */
-		switch (*p) {
-		case SAL_DESC_ENTRY_POINT:
-			sal_desc_entry_point(p);
-			break;
-		case SAL_DESC_PLATFORM_FEATURE:
-			sal_desc_platform_feature(p);
-			break;
-		case SAL_DESC_PTC:
-			ia64_ptc_domain_info = (ia64_sal_desc_ptc_t *)p;
-			break;
-		case SAL_DESC_AP_WAKEUP:
-			sal_desc_ap_wakeup(p);
-			break;
-		}
-		p += SAL_DESC_SIZE(*p);
-	}
-
-}
-
-int
-ia64_sal_oemcall(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1,
-		 u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7)
-{
-	if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
-		return -1;
-	SAL_CALL(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
-	return 0;
-}
-EXPORT_SYMBOL(ia64_sal_oemcall);
-
-int
-ia64_sal_oemcall_nolock(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1,
-			u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6,
-			u64 arg7)
-{
-	if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
-		return -1;
-	SAL_CALL_NOLOCK(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6,
-			arg7);
-	return 0;
-}
-EXPORT_SYMBOL(ia64_sal_oemcall_nolock);
-
-int
-ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc,
-			   u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5,
-			   u64 arg6, u64 arg7)
-{
-	if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX)
-		return -1;
-	SAL_CALL_REENTRANT(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6,
-			   arg7);
-	return 0;
-}
-EXPORT_SYMBOL(ia64_sal_oemcall_reentrant);
-
-long
-ia64_sal_freq_base (unsigned long which, unsigned long *ticks_per_second,
-		    unsigned long *drift_info)
-{
-	struct ia64_sal_retval isrv;
-
-	SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
-	*ticks_per_second = isrv.v0;
-	*drift_info = isrv.v1;
-	return isrv.status;
-}
-EXPORT_SYMBOL_GPL(ia64_sal_freq_base);
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
deleted file mode 100644
index 03b632c56899..000000000000
--- a/arch/ia64/kernel/salinfo.c
+++ /dev/null
@@ -1,646 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * salinfo.c
- *
- * Creates entries in /proc/sal for various system features.
- *
- * Copyright (c) 2003, 2006 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2003 Hewlett-Packard Co
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- *
- * 10/30/2001	jbarnes@sgi.com		copied much of Stephane's palinfo
- *					code to create this file
- * Oct 23 2003	kaos@sgi.com
- *   Replace IPI with set_cpus_allowed() to read a record from the required cpu.
- *   Redesign salinfo log processing to separate interrupt and user space
- *   contexts.
- *   Cache the record across multi-block reads from user space.
- *   Support > 64 cpus.
- *   Delete module_exit and MOD_INC/DEC_COUNT, salinfo cannot be a module.
- *
- * Jan 28 2004	kaos@sgi.com
- *   Periodically check for outstanding MCA or INIT records.
- *
- * Dec  5 2004	kaos@sgi.com
- *   Standardize which records are cleared automatically.
- *
- * Aug 18 2005	kaos@sgi.com
- *   mca.c may not pass a buffer, a NULL buffer just indicates that a new
- *   record is available in SAL.
- *   Replace some NR_CPUS by cpus_online, for hotplug cpu.
- *
- * Jan  5 2006        kaos@sgi.com
- *   Handle hotplug cpus coming online.
- *   Handle hotplug cpus going offline while they still have outstanding records.
- *   Use the cpu_* macros consistently.
- *   Replace the counting semaphore with a mutex and a test if the cpumask is non-empty.
- *   Modify the locking to make the test for "work to do" an atomic operation.
- */
-
-#include <linux/capability.h>
-#include <linux/cpu.h>
-#include <linux/types.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/timer.h>
-#include <linux/vmalloc.h>
-#include <linux/semaphore.h>
-
-#include <asm/sal.h>
-#include <linux/uaccess.h>
-
-MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
-MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
-MODULE_LICENSE("GPL");
-
-typedef struct {
-	const char		*name;		/* name of the proc entry */
-	unsigned long           feature;        /* feature bit */
-	struct proc_dir_entry	*entry;		/* registered entry (removal) */
-} salinfo_entry_t;
-
-/*
- * List {name,feature} pairs for every entry in /proc/sal/<feature>
- * that this module exports
- */
-static const salinfo_entry_t salinfo_entries[]={
-	{ "bus_lock",           IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, },
-	{ "irq_redirection",	IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, },
-	{ "ipi_redirection",	IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, },
-	{ "itc_drift",		IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT, },
-};
-
-#define NR_SALINFO_ENTRIES ARRAY_SIZE(salinfo_entries)
-
-static char *salinfo_log_name[] = {
-	"mca",
-	"init",
-	"cmc",
-	"cpe",
-};
-
-static struct proc_dir_entry *salinfo_proc_entries[
-	ARRAY_SIZE(salinfo_entries) +			/* /proc/sal/bus_lock */
-	ARRAY_SIZE(salinfo_log_name) +			/* /proc/sal/{mca,...} */
-	(2 * ARRAY_SIZE(salinfo_log_name)) +		/* /proc/sal/mca/{event,data} */
-	1];						/* /proc/sal */
-
-/* Some records we get ourselves, some are accessed as saved data in buffers
- * that are owned by mca.c.
- */
-struct salinfo_data_saved {
-	u8*			buffer;
-	u64			size;
-	u64			id;
-	int			cpu;
-};
-
-/* State transitions.  Actions are :-
- *   Write "read <cpunum>" to the data file.
- *   Write "clear <cpunum>" to the data file.
- *   Write "oemdata <cpunum> <offset> to the data file.
- *   Read from the data file.
- *   Close the data file.
- *
- * Start state is NO_DATA.
- *
- * NO_DATA
- *    write "read <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "oemdata <cpunum> <offset> -> return -EINVAL.
- *    read data -> return EOF.
- *    close -> unchanged.  Free record areas.
- *
- * LOG_RECORD
- *    write "read <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA.
- *    read data -> return the INIT/MCA/CMC/CPE record.
- *    close -> unchanged.  Keep record areas.
- *
- * OEMDATA
- *    write "read <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
- *    write "oemdata <cpunum> <offset> -> format the oem data, goto OEMDATA.
- *    read data -> return the formatted oemdata.
- *    close -> unchanged.  Keep record areas.
- *
- * Closing the data file does not change the state.  This allows shell scripts
- * to manipulate salinfo data, each shell redirection opens the file, does one
- * action then closes it again.  The record areas are only freed at close when
- * the state is NO_DATA.
- */
-enum salinfo_state {
-	STATE_NO_DATA,
-	STATE_LOG_RECORD,
-	STATE_OEMDATA,
-};
-
-struct salinfo_data {
-	cpumask_t		cpu_event;	/* which cpus have outstanding events */
-	wait_queue_head_t	read_wait;
-	u8			*log_buffer;
-	u64			log_size;
-	u8			*oemdata;	/* decoded oem data */
-	u64			oemdata_size;
-	int			open;		/* single-open to prevent races */
-	u8			type;
-	u8			saved_num;	/* using a saved record? */
-	enum salinfo_state	state :8;	/* processing state */
-	u8			padding;
-	int			cpu_check;	/* next CPU to check */
-	struct salinfo_data_saved data_saved[5];/* save last 5 records from mca.c, must be < 255 */
-};
-
-static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
-
-static DEFINE_SPINLOCK(data_lock);
-static DEFINE_SPINLOCK(data_saved_lock);
-
-/** salinfo_platform_oemdata - optional callback to decode oemdata from an error
- * record.
- * @sect_header: pointer to the start of the section to decode.
- * @oemdata: returns vmalloc area containing the decoded output.
- * @oemdata_size: returns length of decoded output (strlen).
- *
- * Description: If user space asks for oem data to be decoded by the kernel
- * and/or prom and the platform has set salinfo_platform_oemdata to the address
- * of a platform specific routine then call that routine.  salinfo_platform_oemdata
- * vmalloc's and formats its output area, returning the address of the text
- * and its strlen.  Returns 0 for success, -ve for error.  The callback is
- * invoked on the cpu that generated the error record.
- */
-int (*salinfo_platform_oemdata)(const u8 *sect_header, u8 **oemdata, u64 *oemdata_size);
-
-struct salinfo_platform_oemdata_parms {
-	const u8 *efi_guid;
-	u8 **oemdata;
-	u64 *oemdata_size;
-};
-
-static long
-salinfo_platform_oemdata_cpu(void *context)
-{
-	struct salinfo_platform_oemdata_parms *parms = context;
-
-	return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
-}
-
-static void
-shift1_data_saved (struct salinfo_data *data, int shift)
-{
-	memcpy(data->data_saved+shift, data->data_saved+shift+1,
-	       (ARRAY_SIZE(data->data_saved) - (shift+1)) * sizeof(data->data_saved[0]));
-	memset(data->data_saved + ARRAY_SIZE(data->data_saved) - 1, 0,
-	       sizeof(data->data_saved[0]));
-}
-
-/* This routine is invoked in interrupt context.  Note: mca.c enables
- * interrupts before calling this code for CMC/CPE.  MCA and INIT events are
- * not irq safe, do not call any routines that use spinlocks, they may deadlock.
- * MCA and INIT records are recorded, a timer event will look for any
- * outstanding events and wake up the user space code.
- *
- * The buffer passed from mca.c points to the output from ia64_log_get. This is
- * a persistent buffer but its contents can change between the interrupt and
- * when user space processes the record.  Save the record id to identify
- * changes.  If the buffer is NULL then just update the bitmap.
- */
-void
-salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe)
-{
-	struct salinfo_data *data = salinfo_data + type;
-	struct salinfo_data_saved *data_saved;
-	unsigned long flags = 0;
-	int i;
-	int saved_size = ARRAY_SIZE(data->data_saved);
-
-	BUG_ON(type >= ARRAY_SIZE(salinfo_log_name));
-
-	if (irqsafe)
-		spin_lock_irqsave(&data_saved_lock, flags);
-	if (buffer) {
-		for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
-			if (!data_saved->buffer)
-				break;
-		}
-		if (i == saved_size) {
-			if (!data->saved_num) {
-				shift1_data_saved(data, 0);
-				data_saved = data->data_saved + saved_size - 1;
-			} else
-				data_saved = NULL;
-		}
-		if (data_saved) {
-			data_saved->cpu = smp_processor_id();
-			data_saved->id = ((sal_log_record_header_t *)buffer)->id;
-			data_saved->size = size;
-			data_saved->buffer = buffer;
-		}
-	}
-	cpumask_set_cpu(smp_processor_id(), &data->cpu_event);
-	if (irqsafe) {
-		wake_up_interruptible(&data->read_wait);
-		spin_unlock_irqrestore(&data_saved_lock, flags);
-	}
-}
-
-/* Check for outstanding MCA/INIT records every minute (arbitrary) */
-#define SALINFO_TIMER_DELAY (60*HZ)
-static struct timer_list salinfo_timer;
-extern void ia64_mlogbuf_dump(void);
-
-static void
-salinfo_timeout_check(struct salinfo_data *data)
-{
-	if (!data->open)
-		return;
-	if (!cpumask_empty(&data->cpu_event))
-		wake_up_interruptible(&data->read_wait);
-}
-
-static void
-salinfo_timeout(struct timer_list *unused)
-{
-	ia64_mlogbuf_dump();
-	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
-	salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
-	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
-	add_timer(&salinfo_timer);
-}
-
-static int
-salinfo_event_open(struct inode *inode, struct file *file)
-{
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	return 0;
-}
-
-static ssize_t
-salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
-{
-	struct salinfo_data *data = pde_data(file_inode(file));
-	char cmd[32];
-	size_t size;
-	int i, n, cpu = -1;
-
-retry:
-	if (cpumask_empty(&data->cpu_event)) {
-		if (file->f_flags & O_NONBLOCK)
-			return -EAGAIN;
-		if (wait_event_interruptible(data->read_wait,
-					     !cpumask_empty(&data->cpu_event)))
-			return -EINTR;
-	}
-
-	n = data->cpu_check;
-	for (i = 0; i < nr_cpu_ids; i++) {
-		if (cpumask_test_cpu(n, &data->cpu_event)) {
-			if (!cpu_online(n)) {
-				cpumask_clear_cpu(n, &data->cpu_event);
-				continue;
-			}
-			cpu = n;
-			break;
-		}
-		if (++n == nr_cpu_ids)
-			n = 0;
-	}
-
-	if (cpu == -1)
-		goto retry;
-
-	ia64_mlogbuf_dump();
-
-	/* for next read, start checking at next CPU */
-	data->cpu_check = cpu;
-	if (++data->cpu_check == nr_cpu_ids)
-		data->cpu_check = 0;
-
-	snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
-
-	size = strlen(cmd);
-	if (size > count)
-		size = count;
-	if (copy_to_user(buffer, cmd, size))
-		return -EFAULT;
-
-	return size;
-}
-
-static const struct proc_ops salinfo_event_proc_ops = {
-	.proc_open	= salinfo_event_open,
-	.proc_read	= salinfo_event_read,
-	.proc_lseek	= noop_llseek,
-};
-
-static int
-salinfo_log_open(struct inode *inode, struct file *file)
-{
-	struct salinfo_data *data = pde_data(inode);
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	spin_lock(&data_lock);
-	if (data->open) {
-		spin_unlock(&data_lock);
-		return -EBUSY;
-	}
-	data->open = 1;
-	spin_unlock(&data_lock);
-
-	if (data->state == STATE_NO_DATA &&
-	    !(data->log_buffer = vmalloc(ia64_sal_get_state_info_size(data->type)))) {
-		data->open = 0;
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static int
-salinfo_log_release(struct inode *inode, struct file *file)
-{
-	struct salinfo_data *data = pde_data(inode);
-
-	if (data->state == STATE_NO_DATA) {
-		vfree(data->log_buffer);
-		vfree(data->oemdata);
-		data->log_buffer = NULL;
-		data->oemdata = NULL;
-	}
-	spin_lock(&data_lock);
-	data->open = 0;
-	spin_unlock(&data_lock);
-	return 0;
-}
-
-static long
-salinfo_log_read_cpu(void *context)
-{
-	struct salinfo_data *data = context;
-	sal_log_record_header_t *rh;
-	data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer);
-	rh = (sal_log_record_header_t *)(data->log_buffer);
-	/* Clear corrected errors as they are read from SAL */
-	if (rh->severity == sal_log_severity_corrected)
-		ia64_sal_clear_state_info(data->type);
-	return 0;
-}
-
-static void
-salinfo_log_new_read(int cpu, struct salinfo_data *data)
-{
-	struct salinfo_data_saved *data_saved;
-	unsigned long flags;
-	int i;
-	int saved_size = ARRAY_SIZE(data->data_saved);
-
-	data->saved_num = 0;
-	spin_lock_irqsave(&data_saved_lock, flags);
-retry:
-	for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) {
-		if (data_saved->buffer && data_saved->cpu == cpu) {
-			sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer);
-			data->log_size = data_saved->size;
-			memcpy(data->log_buffer, rh, data->log_size);
-			barrier();	/* id check must not be moved */
-			if (rh->id == data_saved->id) {
-				data->saved_num = i+1;
-				break;
-			}
-			/* saved record changed by mca.c since interrupt, discard it */
-			shift1_data_saved(data, i);
-			goto retry;
-		}
-	}
-	spin_unlock_irqrestore(&data_saved_lock, flags);
-
-	if (!data->saved_num)
-		work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
-	if (!data->log_size) {
-		data->state = STATE_NO_DATA;
-		cpumask_clear_cpu(cpu, &data->cpu_event);
-	} else {
-		data->state = STATE_LOG_RECORD;
-	}
-}
-
-static ssize_t
-salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
-{
-	struct salinfo_data *data = pde_data(file_inode(file));
-	u8 *buf;
-	u64 bufsize;
-
-	if (data->state == STATE_LOG_RECORD) {
-		buf = data->log_buffer;
-		bufsize = data->log_size;
-	} else if (data->state == STATE_OEMDATA) {
-		buf = data->oemdata;
-		bufsize = data->oemdata_size;
-	} else {
-		buf = NULL;
-		bufsize = 0;
-	}
-	return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
-}
-
-static long
-salinfo_log_clear_cpu(void *context)
-{
-	struct salinfo_data *data = context;
-
-	ia64_sal_clear_state_info(data->type);
-	return 0;
-}
-
-static int
-salinfo_log_clear(struct salinfo_data *data, int cpu)
-{
-	sal_log_record_header_t *rh;
-	unsigned long flags;
-	spin_lock_irqsave(&data_saved_lock, flags);
-	data->state = STATE_NO_DATA;
-	if (!cpumask_test_cpu(cpu, &data->cpu_event)) {
-		spin_unlock_irqrestore(&data_saved_lock, flags);
-		return 0;
-	}
-	cpumask_clear_cpu(cpu, &data->cpu_event);
-	if (data->saved_num) {
-		shift1_data_saved(data, data->saved_num - 1);
-		data->saved_num = 0;
-	}
-	spin_unlock_irqrestore(&data_saved_lock, flags);
-	rh = (sal_log_record_header_t *)(data->log_buffer);
-	/* Corrected errors have already been cleared from SAL */
-	if (rh->severity != sal_log_severity_corrected)
-		work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
-	/* clearing a record may make a new record visible */
-	salinfo_log_new_read(cpu, data);
-	if (data->state == STATE_LOG_RECORD) {
-		spin_lock_irqsave(&data_saved_lock, flags);
-		cpumask_set_cpu(cpu, &data->cpu_event);
-		wake_up_interruptible(&data->read_wait);
-		spin_unlock_irqrestore(&data_saved_lock, flags);
-	}
-	return 0;
-}
-
-static ssize_t
-salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
-{
-	struct salinfo_data *data = pde_data(file_inode(file));
-	char cmd[32];
-	size_t size;
-	u32 offset;
-	int cpu;
-
-	size = sizeof(cmd);
-	if (count < size)
-		size = count;
-	if (copy_from_user(cmd, buffer, size))
-		return -EFAULT;
-
-	if (sscanf(cmd, "read %d", &cpu) == 1) {
-		salinfo_log_new_read(cpu, data);
-	} else if (sscanf(cmd, "clear %d", &cpu) == 1) {
-		int ret;
-		if ((ret = salinfo_log_clear(data, cpu)))
-			count = ret;
-	} else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) {
-		if (data->state != STATE_LOG_RECORD && data->state != STATE_OEMDATA)
-			return -EINVAL;
-		if (offset > data->log_size - sizeof(efi_guid_t))
-			return -EINVAL;
-		data->state = STATE_OEMDATA;
-		if (salinfo_platform_oemdata) {
-			struct salinfo_platform_oemdata_parms parms = {
-				.efi_guid = data->log_buffer + offset,
-				.oemdata = &data->oemdata,
-				.oemdata_size = &data->oemdata_size
-			};
-			count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu,
-						 &parms);
-		} else
-			data->oemdata_size = 0;
-	} else
-		return -EINVAL;
-
-	return count;
-}
-
-static const struct proc_ops salinfo_data_proc_ops = {
-	.proc_open	= salinfo_log_open,
-	.proc_release	= salinfo_log_release,
-	.proc_read	= salinfo_log_read,
-	.proc_write	= salinfo_log_write,
-	.proc_lseek	= default_llseek,
-};
-
-static int salinfo_cpu_online(unsigned int cpu)
-{
-	unsigned int i, end = ARRAY_SIZE(salinfo_data);
-	struct salinfo_data *data;
-
-	spin_lock_irq(&data_saved_lock);
-	for (i = 0, data = salinfo_data; i < end; ++i, ++data) {
-		cpumask_set_cpu(cpu, &data->cpu_event);
-		wake_up_interruptible(&data->read_wait);
-	}
-	spin_unlock_irq(&data_saved_lock);
-	return 0;
-}
-
-static int salinfo_cpu_pre_down(unsigned int cpu)
-{
-	unsigned int i, end = ARRAY_SIZE(salinfo_data);
-	struct salinfo_data *data;
-
-	spin_lock_irq(&data_saved_lock);
-	for (i = 0, data = salinfo_data; i < end; ++i, ++data) {
-		struct salinfo_data_saved *data_saved;
-		int j = ARRAY_SIZE(data->data_saved) - 1;
-
-		for (data_saved = data->data_saved + j; j >= 0;
-		     --j, --data_saved) {
-			if (data_saved->buffer && data_saved->cpu == cpu)
-				shift1_data_saved(data, j);
-		}
-		cpumask_clear_cpu(cpu, &data->cpu_event);
-	}
-	spin_unlock_irq(&data_saved_lock);
-	return 0;
-}
-
-/*
- * 'data' contains an integer that corresponds to the feature we're
- * testing
- */
-static int __maybe_unused proc_salinfo_show(struct seq_file *m, void *v)
-{
-	unsigned long data = (unsigned long)v;
-	seq_puts(m, (sal_platform_features & data) ? "1\n" : "0\n");
-	return 0;
-}
-
-static int __init
-salinfo_init(void)
-{
-	struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */
-	struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */
-	struct proc_dir_entry *dir, *entry;
-	struct salinfo_data *data;
-	int i;
-
-	salinfo_dir = proc_mkdir("sal", NULL);
-	if (!salinfo_dir)
-		return 0;
-
-	for (i=0; i < NR_SALINFO_ENTRIES; i++) {
-		/* pass the feature bit in question as misc data */
-		*sdir++ = proc_create_single_data(salinfo_entries[i].name, 0,
-				salinfo_dir, proc_salinfo_show,
-				(void *)salinfo_entries[i].feature);
-	}
-
-	for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) {
-		data = salinfo_data + i;
-		data->type = i;
-		init_waitqueue_head(&data->read_wait);
-		dir = proc_mkdir(salinfo_log_name[i], salinfo_dir);
-		if (!dir)
-			continue;
-
-		entry = proc_create_data("event", S_IRUSR, dir,
-					 &salinfo_event_proc_ops, data);
-		if (!entry)
-			continue;
-		*sdir++ = entry;
-
-		entry = proc_create_data("data", S_IRUSR | S_IWUSR, dir,
-					 &salinfo_data_proc_ops, data);
-		if (!entry)
-			continue;
-		*sdir++ = entry;
-
-		*sdir++ = dir;
-	}
-
-	*sdir++ = salinfo_dir;
-
-	timer_setup(&salinfo_timer, salinfo_timeout, 0);
-	salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
-	add_timer(&salinfo_timer);
-
-	i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/salinfo:online",
-			      salinfo_cpu_online, salinfo_cpu_pre_down);
-	WARN_ON(i < 0);
-	return 0;
-}
-
-module_init(salinfo_init);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
deleted file mode 100644
index 5a55ac82c13a..000000000000
--- a/arch/ia64/kernel/setup.c
+++ /dev/null
@@ -1,1081 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific setup.
- *
- * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2000, 2004 Intel Corp
- * 	Rohit Seth <rohit.seth@intel.com>
- * 	Suresh Siddha <suresh.b.siddha@intel.com>
- * 	Gordon Jin <gordon.jin@intel.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- *
- * 12/26/04 S.Siddha, G.Jin, R.Seth
- *			Add multi-threading and multi-core detection
- * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
- * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
- * 03/31/00 R.Seth	cpu_initialized and current->processor fixes
- * 02/04/00 D.Mosberger	some more get_cpuinfo fixes...
- * 02/01/00 R.Seth	fixed get_cpuinfo for SMP
- * 01/07/99 S.Eranian	added the support for command line argument
- * 06/24/99 W.Drummond	added boot_cpu_data.
- * 05/28/05 Z. Menyhart	Dynamic stride size for "flush_icache_range()"
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pgtable.h>
-
-#include <linux/acpi.h>
-#include <linux/console.h>
-#include <linux/delay.h>
-#include <linux/cpu.h>
-#include <linux/kdev_t.h>
-#include <linux/kernel.h>
-#include <linux/memblock.h>
-#include <linux/reboot.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/task_stack.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/threads.h>
-#include <linux/screen_info.h>
-#include <linux/dmi.h>
-#include <linux/root_dev.h>
-#include <linux/serial.h>
-#include <linux/serial_core.h>
-#include <linux/efi.h>
-#include <linux/initrd.h>
-#include <linux/pm.h>
-#include <linux/cpufreq.h>
-#include <linux/kexec.h>
-#include <linux/crash_dump.h>
-
-#include <asm/mca.h>
-#include <asm/meminit.h>
-#include <asm/page.h>
-#include <asm/patch.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/unistd.h>
-#include <asm/uv/uv.h>
-#include <asm/xtp.h>
-
-#if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
-# error "struct cpuinfo_ia64 too big!"
-#endif
-
-char ia64_platform_name[64];
-
-#ifdef CONFIG_SMP
-unsigned long __per_cpu_offset[NR_CPUS];
-EXPORT_SYMBOL(__per_cpu_offset);
-#endif
-
-DEFINE_PER_CPU(struct cpuinfo_ia64, ia64_cpu_info);
-EXPORT_SYMBOL(ia64_cpu_info);
-DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
-#ifdef CONFIG_SMP
-EXPORT_SYMBOL(local_per_cpu_offset);
-#endif
-unsigned long ia64_cycles_per_usec;
-struct ia64_boot_param *ia64_boot_param;
-struct screen_info screen_info;
-unsigned long vga_console_iobase;
-unsigned long vga_console_membase;
-
-static struct resource data_resource = {
-	.name	= "Kernel data",
-	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
-};
-
-static struct resource code_resource = {
-	.name	= "Kernel code",
-	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
-};
-
-static struct resource bss_resource = {
-	.name	= "Kernel bss",
-	.flags	= IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
-};
-
-unsigned long ia64_max_cacheline_size;
-
-unsigned long ia64_iobase;	/* virtual address for I/O accesses */
-EXPORT_SYMBOL(ia64_iobase);
-struct io_space io_space[MAX_IO_SPACES];
-EXPORT_SYMBOL(io_space);
-unsigned int num_io_spaces;
-
-/*
- * "flush_icache_range()" needs to know what processor dependent stride size to use
- * when it makes i-cache(s) coherent with d-caches.
- */
-#define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
-unsigned long ia64_i_cache_stride_shift = ~0;
-/*
- * "clflush_cache_range()" needs to know what processor dependent stride size to
- * use when it flushes cache lines including both d-cache and i-cache.
- */
-/* Safest way to go: 32 bytes by 32 bytes */
-#define	CACHE_STRIDE_SHIFT	5
-unsigned long ia64_cache_stride_shift = ~0;
-
-/*
- * We use a special marker for the end of memory and it uses the extra (+1) slot
- */
-struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
-static int num_rsvd_regions __initdata;
-
-
-/*
- * Filter incoming memory segments based on the primitive map created from the boot
- * parameters. Segments contained in the map are removed from the memory ranges. A
- * caller-specified function is called with the memory ranges that remain after filtering.
- * This routine does not assume the incoming segments are sorted.
- */
-int __init
-filter_rsvd_memory (u64 start, u64 end, void *arg)
-{
-	u64 range_start, range_end, prev_start;
-	void (*func)(unsigned long, unsigned long, int);
-	int i;
-
-#if IGNORE_PFN0
-	if (start == PAGE_OFFSET) {
-		printk(KERN_WARNING "warning: skipping physical page 0\n");
-		start += PAGE_SIZE;
-		if (start >= end) return 0;
-	}
-#endif
-	/*
-	 * lowest possible address(walker uses virtual)
-	 */
-	prev_start = PAGE_OFFSET;
-	func = arg;
-
-	for (i = 0; i < num_rsvd_regions; ++i) {
-		range_start = max(start, prev_start);
-		range_end   = min(end, rsvd_region[i].start);
-
-		if (range_start < range_end)
-			call_pernode_memory(__pa(range_start), range_end - range_start, func);
-
-		/* nothing more available in this segment */
-		if (range_end == end) return 0;
-
-		prev_start = rsvd_region[i].end;
-	}
-	/* end of memory marker allows full processing inside loop body */
-	return 0;
-}
-
-/*
- * Similar to "filter_rsvd_memory()", but the reserved memory ranges
- * are not filtered out.
- */
-int __init
-filter_memory(u64 start, u64 end, void *arg)
-{
-	void (*func)(unsigned long, unsigned long, int);
-
-#if IGNORE_PFN0
-	if (start == PAGE_OFFSET) {
-		printk(KERN_WARNING "warning: skipping physical page 0\n");
-		start += PAGE_SIZE;
-		if (start >= end)
-			return 0;
-	}
-#endif
-	func = arg;
-	if (start < end)
-		call_pernode_memory(__pa(start), end - start, func);
-	return 0;
-}
-
-static void __init
-sort_regions (struct rsvd_region *rsvd_region, int max)
-{
-	int j;
-
-	/* simple bubble sorting */
-	while (max--) {
-		for (j = 0; j < max; ++j) {
-			if (rsvd_region[j].start > rsvd_region[j+1].start) {
-				swap(rsvd_region[j], rsvd_region[j + 1]);
-			}
-		}
-	}
-}
-
-/* merge overlaps */
-static int __init
-merge_regions (struct rsvd_region *rsvd_region, int max)
-{
-	int i;
-	for (i = 1; i < max; ++i) {
-		if (rsvd_region[i].start >= rsvd_region[i-1].end)
-			continue;
-		if (rsvd_region[i].end > rsvd_region[i-1].end)
-			rsvd_region[i-1].end = rsvd_region[i].end;
-		--max;
-		memmove(&rsvd_region[i], &rsvd_region[i+1],
-			(max - i) * sizeof(struct rsvd_region));
-	}
-	return max;
-}
-
-/*
- * Request address space for all standard resources
- */
-static int __init register_memory(void)
-{
-	code_resource.start = ia64_tpa(_text);
-	code_resource.end   = ia64_tpa(_etext) - 1;
-	data_resource.start = ia64_tpa(_etext);
-	data_resource.end   = ia64_tpa(_edata) - 1;
-	bss_resource.start  = ia64_tpa(__bss_start);
-	bss_resource.end    = ia64_tpa(_end) - 1;
-	efi_initialize_iomem_resources(&code_resource, &data_resource,
-			&bss_resource);
-
-	return 0;
-}
-
-__initcall(register_memory);
-
-
-#ifdef CONFIG_KEXEC
-
-/*
- * This function checks if the reserved crashkernel is allowed on the specific
- * IA64 machine flavour. Machines without an IO TLB use swiotlb and require
- * some memory below 4 GB (i.e. in 32 bit area), see the implementation of
- * kernel/dma/swiotlb.c. The hpzx1 architecture has an IO TLB but cannot use that
- * in kdump case. See the comment in sba_init() in sba_iommu.c.
- *
- * So, the only machvec that really supports loading the kdump kernel
- * over 4 GB is "uv".
- */
-static int __init check_crashkernel_memory(unsigned long pbase, size_t size)
-{
-	if (is_uv_system())
-		return 1;
-	else
-		return pbase < (1UL << 32);
-}
-
-static void __init setup_crashkernel(unsigned long total, int *n)
-{
-	unsigned long long base = 0, size = 0;
-	int ret;
-
-	ret = parse_crashkernel(boot_command_line, total,
-			&size, &base);
-	if (ret == 0 && size > 0) {
-		if (!base) {
-			sort_regions(rsvd_region, *n);
-			*n = merge_regions(rsvd_region, *n);
-			base = kdump_find_rsvd_region(size,
-					rsvd_region, *n);
-		}
-
-		if (!check_crashkernel_memory(base, size)) {
-			pr_warn("crashkernel: There would be kdump memory "
-				"at %ld GB but this is unusable because it "
-				"must\nbe below 4 GB. Change the memory "
-				"configuration of the machine.\n",
-				(unsigned long)(base >> 30));
-			return;
-		}
-
-		if (base != ~0UL) {
-			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-					"for crashkernel (System RAM: %ldMB)\n",
-					(unsigned long)(size >> 20),
-					(unsigned long)(base >> 20),
-					(unsigned long)(total >> 20));
-			rsvd_region[*n].start =
-				(unsigned long)__va(base);
-			rsvd_region[*n].end =
-				(unsigned long)__va(base + size);
-			(*n)++;
-			crashk_res.start = base;
-			crashk_res.end = base + size - 1;
-		}
-	}
-	efi_memmap_res.start = ia64_boot_param->efi_memmap;
-	efi_memmap_res.end = efi_memmap_res.start +
-		ia64_boot_param->efi_memmap_size;
-	boot_param_res.start = __pa(ia64_boot_param);
-	boot_param_res.end = boot_param_res.start +
-		sizeof(*ia64_boot_param);
-}
-#else
-static inline void __init setup_crashkernel(unsigned long total, int *n)
-{}
-#endif
-
-#ifdef CONFIG_CRASH_DUMP
-static int __init reserve_elfcorehdr(u64 *start, u64 *end)
-{
-	u64 length;
-
-	/* We get the address using the kernel command line,
-	 * but the size is extracted from the EFI tables.
-	 * Both address and size are required for reservation
-	 * to work properly.
-	 */
-
-	if (!is_vmcore_usable())
-		return -EINVAL;
-
-	if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
-		vmcore_unusable();
-		return -EINVAL;
-	}
-
-	*start = (unsigned long)__va(elfcorehdr_addr);
-	*end = *start + length;
-	return 0;
-}
-#endif /* CONFIG_CRASH_DUMP */
-
-/**
- * reserve_memory - setup reserved memory areas
- *
- * Setup the reserved memory areas set aside for the boot parameters,
- * initrd, etc.  There are currently %IA64_MAX_RSVD_REGIONS defined,
- * see arch/ia64/include/asm/meminit.h if you need to define more.
- */
-void __init
-reserve_memory (void)
-{
-	int n = 0;
-	unsigned long total_memory;
-
-	/*
-	 * none of the entries in this table overlap
-	 */
-	rsvd_region[n].start = (unsigned long) ia64_boot_param;
-	rsvd_region[n].end   = rsvd_region[n].start + sizeof(*ia64_boot_param);
-	n++;
-
-	rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->efi_memmap);
-	rsvd_region[n].end   = rsvd_region[n].start + ia64_boot_param->efi_memmap_size;
-	n++;
-
-	rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->command_line);
-	rsvd_region[n].end   = (rsvd_region[n].start
-				+ strlen(__va(ia64_boot_param->command_line)) + 1);
-	n++;
-
-	rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START);
-	rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
-	n++;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (ia64_boot_param->initrd_start) {
-		rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
-		rsvd_region[n].end   = rsvd_region[n].start + ia64_boot_param->initrd_size;
-		n++;
-	}
-#endif
-
-#ifdef CONFIG_CRASH_DUMP
-	if (reserve_elfcorehdr(&rsvd_region[n].start,
-			       &rsvd_region[n].end) == 0)
-		n++;
-#endif
-
-	total_memory = efi_memmap_init(&rsvd_region[n].start, &rsvd_region[n].end);
-	n++;
-
-	setup_crashkernel(total_memory, &n);
-
-	/* end of memory marker */
-	rsvd_region[n].start = ~0UL;
-	rsvd_region[n].end   = ~0UL;
-	n++;
-
-	num_rsvd_regions = n;
-	BUG_ON(IA64_MAX_RSVD_REGIONS + 1 < n);
-
-	sort_regions(rsvd_region, num_rsvd_regions);
-	num_rsvd_regions = merge_regions(rsvd_region, num_rsvd_regions);
-
-	/* reserve all regions except the end of memory marker with memblock */
-	for (n = 0; n < num_rsvd_regions - 1; n++) {
-		struct rsvd_region *region = &rsvd_region[n];
-		phys_addr_t addr = __pa(region->start);
-		phys_addr_t size = region->end - region->start;
-
-		memblock_reserve(addr, size);
-	}
-}
-
-/**
- * find_initrd - get initrd parameters from the boot parameter structure
- *
- * Grab the initrd start and end from the boot parameter struct given us by
- * the boot loader.
- */
-void __init
-find_initrd (void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (ia64_boot_param->initrd_start) {
-		initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start);
-		initrd_end   = initrd_start+ia64_boot_param->initrd_size;
-
-		printk(KERN_INFO "Initial ramdisk at: 0x%lx (%llu bytes)\n",
-		       initrd_start, ia64_boot_param->initrd_size);
-	}
-#endif
-}
-
-static void __init
-io_port_init (void)
-{
-	unsigned long phys_iobase;
-
-	/*
-	 * Set `iobase' based on the EFI memory map or, failing that, the
-	 * value firmware left in ar.k0.
-	 *
-	 * Note that in ia32 mode, IN/OUT instructions use ar.k0 to compute
-	 * the port's virtual address, so ia32_load_state() loads it with a
-	 * user virtual address.  But in ia64 mode, glibc uses the
-	 * *physical* address in ar.k0 to mmap the appropriate area from
-	 * /dev/mem, and the inX()/outX() interfaces use MMIO.  In both
-	 * cases, user-mode can only use the legacy 0-64K I/O port space.
-	 *
-	 * ar.k0 is not involved in kernel I/O port accesses, which can use
-	 * any of the I/O port spaces and are done via MMIO using the
-	 * virtual mmio_base from the appropriate io_space[].
-	 */
-	phys_iobase = efi_get_iobase();
-	if (!phys_iobase) {
-		phys_iobase = ia64_get_kr(IA64_KR_IO_BASE);
-		printk(KERN_INFO "No I/O port range found in EFI memory map, "
-			"falling back to AR.KR0 (0x%lx)\n", phys_iobase);
-	}
-	ia64_iobase = (unsigned long) ioremap(phys_iobase, 0);
-	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
-
-	/* setup legacy IO port space */
-	io_space[0].mmio_base = ia64_iobase;
-	io_space[0].sparse = 1;
-	num_io_spaces = 1;
-}
-
-/**
- * early_console_setup - setup debugging console
- *
- * Consoles started here require little enough setup that we can start using
- * them very early in the boot process, either right after the machine
- * vector initialization, or even before if the drivers can detect their hw.
- *
- * Returns non-zero if a console couldn't be setup.
- */
-static inline int __init
-early_console_setup (char *cmdline)
-{
-#ifdef CONFIG_EFI_PCDP
-	if (!efi_setup_pcdp_console(cmdline))
-		return 0;
-#endif
-	return -1;
-}
-
-static void __init
-screen_info_setup(void)
-{
-	unsigned int orig_x, orig_y, num_cols, num_rows, font_height;
-
-	memset(&screen_info, 0, sizeof(screen_info));
-
-	if (!ia64_boot_param->console_info.num_rows ||
-	    !ia64_boot_param->console_info.num_cols) {
-		printk(KERN_WARNING "invalid screen-info, guessing 80x25\n");
-		orig_x = 0;
-		orig_y = 0;
-		num_cols = 80;
-		num_rows = 25;
-		font_height = 16;
-	} else {
-		orig_x = ia64_boot_param->console_info.orig_x;
-		orig_y = ia64_boot_param->console_info.orig_y;
-		num_cols = ia64_boot_param->console_info.num_cols;
-		num_rows = ia64_boot_param->console_info.num_rows;
-		font_height = 400 / num_rows;
-	}
-
-	screen_info.orig_x = orig_x;
-	screen_info.orig_y = orig_y;
-	screen_info.orig_video_cols  = num_cols;
-	screen_info.orig_video_lines = num_rows;
-	screen_info.orig_video_points = font_height;
-	screen_info.orig_video_mode = 3;	/* XXX fake */
-	screen_info.orig_video_isVGA = 1;	/* XXX fake */
-	screen_info.orig_video_ega_bx = 3;	/* XXX fake */
-}
-
-static inline void
-mark_bsp_online (void)
-{
-#ifdef CONFIG_SMP
-	/* If we register an early console, allow CPU 0 to printk */
-	set_cpu_online(smp_processor_id(), true);
-#endif
-}
-
-static __initdata int nomca;
-static __init int setup_nomca(char *s)
-{
-	nomca = 1;
-	return 0;
-}
-early_param("nomca", setup_nomca);
-
-void __init
-setup_arch (char **cmdline_p)
-{
-	unw_init();
-
-	ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
-
-	*cmdline_p = __va(ia64_boot_param->command_line);
-	strscpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
-
-	efi_init();
-	io_port_init();
-
-	uv_probe_system_type();
-	parse_early_param();
-
-	if (early_console_setup(*cmdline_p) == 0)
-		mark_bsp_online();
-
-	/* Initialize the ACPI boot-time table parser */
-	acpi_table_init();
-	early_acpi_boot_init();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa_init();
-	acpi_numa_fixup();
-#ifdef CONFIG_ACPI_HOTPLUG_CPU
-	prefill_possible_map();
-#endif
-	per_cpu_scan_finalize((cpumask_empty(&early_cpu_possible_map) ?
-		32 : cpumask_weight(&early_cpu_possible_map)),
-		additional_cpus > 0 ? additional_cpus : 0);
-#endif /* CONFIG_ACPI_NUMA */
-
-#ifdef CONFIG_SMP
-	smp_build_cpu_map();
-#endif
-	find_memory();
-
-	/* process SAL system table: */
-	ia64_sal_init(__va(sal_systab_phys));
-
-#ifdef CONFIG_ITANIUM
-	ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist);
-#else
-	{
-		unsigned long num_phys_stacked;
-
-		if (ia64_pal_rse_info(&num_phys_stacked, 0) == 0 && num_phys_stacked > 96)
-			ia64_patch_rse((u64) __start___rse_patchlist, (u64) __end___rse_patchlist);
-	}
-#endif
-
-#ifdef CONFIG_SMP
-	cpu_physical_id(0) = hard_smp_processor_id();
-#endif
-
-	cpu_init();	/* initialize the bootstrap CPU */
-	mmu_context_init();	/* initialize context_id bitmap */
-
-#ifdef CONFIG_VT
-	if (!conswitchp) {
-# if defined(CONFIG_VGA_CONSOLE)
-		/*
-		 * Non-legacy systems may route legacy VGA MMIO range to system
-		 * memory.  vga_con probes the MMIO hole, so memory looks like
-		 * a VGA device to it.  The EFI memory map can tell us if it's
-		 * memory so we can avoid this problem.
-		 */
-		if (efi_mem_type(0xA0000) != EFI_CONVENTIONAL_MEMORY)
-			conswitchp = &vga_con;
-# endif
-	}
-#endif
-
-	/* enable IA-64 Machine Check Abort Handling unless disabled */
-	if (!nomca)
-		ia64_mca_init();
-
-	/*
-	 * Default to /dev/sda2.  This assumes that the EFI partition
-	 * is physical disk 1 partition 1 and the Linux root disk is
-	 * physical disk 1 partition 2.
-	 */
-	ROOT_DEV = MKDEV(SCSI_DISK0_MAJOR, 2);
-
-	if (is_uv_system())
-		uv_setup(cmdline_p);
-#ifdef CONFIG_SMP
-	else
-		init_smp_config();
-#endif
-
-	screen_info_setup();
-	paging_init();
-
-	clear_sched_clock_stable();
-}
-
-/*
- * Display cpu info for all CPUs.
- */
-static int
-show_cpuinfo (struct seq_file *m, void *v)
-{
-#ifdef CONFIG_SMP
-#	define lpj	c->loops_per_jiffy
-#	define cpunum	c->cpu
-#else
-#	define lpj	loops_per_jiffy
-#	define cpunum	0
-#endif
-	static struct {
-		unsigned long mask;
-		const char *feature_name;
-	} feature_bits[] = {
-		{ 1UL << 0, "branchlong" },
-		{ 1UL << 1, "spontaneous deferral"},
-		{ 1UL << 2, "16-byte atomic ops" }
-	};
-	char features[128], *cp, *sep;
-	struct cpuinfo_ia64 *c = v;
-	unsigned long mask;
-	unsigned long proc_freq;
-	int i, size;
-
-	mask = c->features;
-
-	/* build the feature string: */
-	memcpy(features, "standard", 9);
-	cp = features;
-	size = sizeof(features);
-	sep = "";
-	for (i = 0; i < ARRAY_SIZE(feature_bits) && size > 1; ++i) {
-		if (mask & feature_bits[i].mask) {
-			cp += snprintf(cp, size, "%s%s", sep,
-				       feature_bits[i].feature_name),
-			sep = ", ";
-			mask &= ~feature_bits[i].mask;
-			size = sizeof(features) - (cp - features);
-		}
-	}
-	if (mask && size > 1) {
-		/* print unknown features as a hex value */
-		snprintf(cp, size, "%s0x%lx", sep, mask);
-	}
-
-	proc_freq = cpufreq_quick_get(cpunum);
-	if (!proc_freq)
-		proc_freq = c->proc_freq / 1000;
-
-	seq_printf(m,
-		   "processor  : %d\n"
-		   "vendor     : %s\n"
-		   "arch       : IA-64\n"
-		   "family     : %u\n"
-		   "model      : %u\n"
-		   "model name : %s\n"
-		   "revision   : %u\n"
-		   "archrev    : %u\n"
-		   "features   : %s\n"
-		   "cpu number : %lu\n"
-		   "cpu regs   : %u\n"
-		   "cpu MHz    : %lu.%03lu\n"
-		   "itc MHz    : %lu.%06lu\n"
-		   "BogoMIPS   : %lu.%02lu\n",
-		   cpunum, c->vendor, c->family, c->model,
-		   c->model_name, c->revision, c->archrev,
-		   features, c->ppn, c->number,
-		   proc_freq / 1000, proc_freq % 1000,
-		   c->itc_freq / 1000000, c->itc_freq % 1000000,
-		   lpj*HZ/500000, (lpj*HZ/5000) % 100);
-#ifdef CONFIG_SMP
-	seq_printf(m, "siblings   : %u\n",
-		   cpumask_weight(&cpu_core_map[cpunum]));
-	if (c->socket_id != -1)
-		seq_printf(m, "physical id: %u\n", c->socket_id);
-	if (c->threads_per_core > 1 || c->cores_per_socket > 1)
-		seq_printf(m,
-			   "core id    : %u\n"
-			   "thread id  : %u\n",
-			   c->core_id, c->thread_id);
-#endif
-	seq_printf(m,"\n");
-
-	return 0;
-}
-
-static void *
-c_start (struct seq_file *m, loff_t *pos)
-{
-#ifdef CONFIG_SMP
-	while (*pos < nr_cpu_ids && !cpu_online(*pos))
-		++*pos;
-#endif
-	return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL;
-}
-
-static void *
-c_next (struct seq_file *m, void *v, loff_t *pos)
-{
-	++*pos;
-	return c_start(m, pos);
-}
-
-static void
-c_stop (struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
-	.start =	c_start,
-	.next =		c_next,
-	.stop =		c_stop,
-	.show =		show_cpuinfo
-};
-
-#define MAX_BRANDS	8
-static char brandname[MAX_BRANDS][128];
-
-static char *
-get_model_name(__u8 family, __u8 model)
-{
-	static int overflow;
-	char brand[128];
-	int i;
-
-	memcpy(brand, "Unknown", 8);
-	if (ia64_pal_get_brand_info(brand)) {
-		if (family == 0x7)
-			memcpy(brand, "Merced", 7);
-		else if (family == 0x1f) switch (model) {
-			case 0: memcpy(brand, "McKinley", 9); break;
-			case 1: memcpy(brand, "Madison", 8); break;
-			case 2: memcpy(brand, "Madison up to 9M cache", 23); break;
-		}
-	}
-	for (i = 0; i < MAX_BRANDS; i++)
-		if (strcmp(brandname[i], brand) == 0)
-			return brandname[i];
-	for (i = 0; i < MAX_BRANDS; i++)
-		if (brandname[i][0] == '\0')
-			return strcpy(brandname[i], brand);
-	if (overflow++ == 0)
-		printk(KERN_ERR
-		       "%s: Table overflow. Some processor model information will be missing\n",
-		       __func__);
-	return "Unknown";
-}
-
-static void
-identify_cpu (struct cpuinfo_ia64 *c)
-{
-	union {
-		unsigned long bits[5];
-		struct {
-			/* id 0 & 1: */
-			char vendor[16];
-
-			/* id 2 */
-			u64 ppn;		/* processor serial number */
-
-			/* id 3: */
-			unsigned number		:  8;
-			unsigned revision	:  8;
-			unsigned model		:  8;
-			unsigned family		:  8;
-			unsigned archrev	:  8;
-			unsigned reserved	: 24;
-
-			/* id 4: */
-			u64 features;
-		} field;
-	} cpuid;
-	pal_vm_info_1_u_t vm1;
-	pal_vm_info_2_u_t vm2;
-	pal_status_t status;
-	unsigned long impl_va_msb = 50, phys_addr_size = 44;	/* Itanium defaults */
-	int i;
-	for (i = 0; i < 5; ++i)
-		cpuid.bits[i] = ia64_get_cpuid(i);
-
-	memcpy(c->vendor, cpuid.field.vendor, 16);
-#ifdef CONFIG_SMP
-	c->cpu = smp_processor_id();
-
-	/* below default values will be overwritten  by identify_siblings() 
-	 * for Multi-Threading/Multi-Core capable CPUs
-	 */
-	c->threads_per_core = c->cores_per_socket = c->num_log = 1;
-	c->socket_id = -1;
-
-	identify_siblings(c);
-
-	if (c->threads_per_core > smp_num_siblings)
-		smp_num_siblings = c->threads_per_core;
-#endif
-	c->ppn = cpuid.field.ppn;
-	c->number = cpuid.field.number;
-	c->revision = cpuid.field.revision;
-	c->model = cpuid.field.model;
-	c->family = cpuid.field.family;
-	c->archrev = cpuid.field.archrev;
-	c->features = cpuid.field.features;
-	c->model_name = get_model_name(c->family, c->model);
-
-	status = ia64_pal_vm_summary(&vm1, &vm2);
-	if (status == PAL_STATUS_SUCCESS) {
-		impl_va_msb = vm2.pal_vm_info_2_s.impl_va_msb;
-		phys_addr_size = vm1.pal_vm_info_1_s.phys_add_size;
-	}
-	c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1));
-	c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1));
-}
-
-/*
- * Do the following calculations:
- *
- * 1. the max. cache line size.
- * 2. the minimum of the i-cache stride sizes for "flush_icache_range()".
- * 3. the minimum of the cache stride sizes for "clflush_cache_range()".
- */
-static void
-get_cache_info(void)
-{
-	unsigned long line_size, max = 1;
-	unsigned long l, levels, unique_caches;
-	pal_cache_config_info_t cci;
-	long status;
-
-        status = ia64_pal_cache_summary(&levels, &unique_caches);
-        if (status != 0) {
-                printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
-                       __func__, status);
-                max = SMP_CACHE_BYTES;
-		/* Safest setup for "flush_icache_range()" */
-		ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
-		/* Safest setup for "clflush_cache_range()" */
-		ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
-		goto out;
-        }
-
-	for (l = 0; l < levels; ++l) {
-		/* cache_type (data_or_unified)=2 */
-		status = ia64_pal_cache_config_info(l, 2, &cci);
-		if (status != 0) {
-			printk(KERN_ERR "%s: ia64_pal_cache_config_info"
-				"(l=%lu, 2) failed (status=%ld)\n",
-				__func__, l, status);
-			max = SMP_CACHE_BYTES;
-			/* The safest setup for "flush_icache_range()" */
-			cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
-			/* The safest setup for "clflush_cache_range()" */
-			ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
-			cci.pcci_unified = 1;
-		} else {
-			if (cci.pcci_stride < ia64_cache_stride_shift)
-				ia64_cache_stride_shift = cci.pcci_stride;
-
-			line_size = 1 << cci.pcci_line_size;
-			if (line_size > max)
-				max = line_size;
-		}
-
-		if (!cci.pcci_unified) {
-			/* cache_type (instruction)=1*/
-			status = ia64_pal_cache_config_info(l, 1, &cci);
-			if (status != 0) {
-				printk(KERN_ERR "%s: ia64_pal_cache_config_info"
-					"(l=%lu, 1) failed (status=%ld)\n",
-					__func__, l, status);
-				/* The safest setup for flush_icache_range() */
-				cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
-			}
-		}
-		if (cci.pcci_stride < ia64_i_cache_stride_shift)
-			ia64_i_cache_stride_shift = cci.pcci_stride;
-	}
-  out:
-	if (max > ia64_max_cacheline_size)
-		ia64_max_cacheline_size = max;
-}
-
-/*
- * cpu_init() initializes state that is per-CPU.  This function acts
- * as a 'CPU state barrier', nothing should get across.
- */
-void
-cpu_init (void)
-{
-	extern void ia64_mmu_init(void *);
-	static unsigned long max_num_phys_stacked = IA64_NUM_PHYS_STACK_REG;
-	unsigned long num_phys_stacked;
-	pal_vm_info_2_u_t vmi;
-	unsigned int max_ctx;
-	struct cpuinfo_ia64 *cpu_info;
-	void *cpu_data;
-
-	cpu_data = per_cpu_init();
-#ifdef CONFIG_SMP
-	/*
-	 * insert boot cpu into sibling and core mapes
-	 * (must be done after per_cpu area is setup)
-	 */
-	if (smp_processor_id() == 0) {
-		cpumask_set_cpu(0, &per_cpu(cpu_sibling_map, 0));
-		cpumask_set_cpu(0, &cpu_core_map[0]);
-	} else {
-		/*
-		 * Set ar.k3 so that assembly code in MCA handler can compute
-		 * physical addresses of per cpu variables with a simple:
-		 *   phys = ar.k3 + &per_cpu_var
-		 * and the alt-dtlb-miss handler can set per-cpu mapping into
-		 * the TLB when needed. head.S already did this for cpu0.
-		 */
-		ia64_set_kr(IA64_KR_PER_CPU_DATA,
-			    ia64_tpa(cpu_data) - (long) __per_cpu_start);
-	}
-#endif
-
-	get_cache_info();
-
-	/*
-	 * We can't pass "local_cpu_data" to identify_cpu() because we haven't called
-	 * ia64_mmu_init() yet.  And we can't call ia64_mmu_init() first because it
-	 * depends on the data returned by identify_cpu().  We break the dependency by
-	 * accessing cpu_data() through the canonical per-CPU address.
-	 */
-	cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(ia64_cpu_info) - __per_cpu_start);
-	identify_cpu(cpu_info);
-
-#ifdef CONFIG_MCKINLEY
-	{
-#		define FEATURE_SET 16
-		struct ia64_pal_retval iprv;
-
-		if (cpu_info->family == 0x1f) {
-			PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, FEATURE_SET, 0);
-			if ((iprv.status == 0) && (iprv.v0 & 0x80) && (iprv.v2 & 0x80))
-				PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES,
-				              (iprv.v1 | 0x80), FEATURE_SET, 0);
-		}
-	}
-#endif
-
-	/* Clear the stack memory reserved for pt_regs: */
-	memset(task_pt_regs(current), 0, sizeof(struct pt_regs));
-
-	ia64_set_kr(IA64_KR_FPU_OWNER, 0);
-
-	/*
-	 * Initialize the page-table base register to a global
-	 * directory with all zeroes.  This ensure that we can handle
-	 * TLB-misses to user address-space even before we created the
-	 * first user address-space.  This may happen, e.g., due to
-	 * aggressive use of lfetch.fault.
-	 */
-	ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
-
-	/*
-	 * Initialize default control register to defer speculative faults except
-	 * for those arising from TLB misses, which are not deferred.  The
-	 * kernel MUST NOT depend on a particular setting of these bits (in other words,
-	 * the kernel must have recovery code for all speculative accesses).  Turn on
-	 * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
-	 * shouldn't be affected by this (moral: keep your ia32 locks aligned and you'll
-	 * be fine).
-	 */
-	ia64_setreg(_IA64_REG_CR_DCR,  (  IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
-					| IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
-	mmgrab(&init_mm);
-	current->active_mm = &init_mm;
-	BUG_ON(current->mm);
-
-	ia64_mmu_init(ia64_imva(cpu_data));
-	ia64_mca_cpu_init(ia64_imva(cpu_data));
-
-	/* Clear ITC to eliminate sched_clock() overflows in human time.  */
-	ia64_set_itc(0);
-
-	/* disable all local interrupt sources: */
-	ia64_set_itv(1 << 16);
-	ia64_set_lrr0(1 << 16);
-	ia64_set_lrr1(1 << 16);
-	ia64_setreg(_IA64_REG_CR_PMV, 1 << 16);
-	ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16);
-
-	/* clear TPR & XTP to enable all interrupt classes: */
-	ia64_setreg(_IA64_REG_CR_TPR, 0);
-
-	/* Clear any pending interrupts left by SAL/EFI */
-	while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR)
-		ia64_eoi();
-
-#ifdef CONFIG_SMP
-	normal_xtp();
-#endif
-
-	/* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */
-	if (ia64_pal_vm_summary(NULL, &vmi) == 0) {
-		max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1;
-		setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, NPTCG_FROM_PAL);
-	} else {
-		printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n");
-		max_ctx = (1U << 15) - 1;	/* use architected minimum */
-	}
-	while (max_ctx < ia64_ctx.max_ctx) {
-		unsigned int old = ia64_ctx.max_ctx;
-		if (cmpxchg(&ia64_ctx.max_ctx, old, max_ctx) == old)
-			break;
-	}
-
-	if (ia64_pal_rse_info(&num_phys_stacked, NULL) != 0) {
-		printk(KERN_WARNING "cpu_init: PAL RSE info failed; assuming 96 physical "
-		       "stacked regs\n");
-		num_phys_stacked = 96;
-	}
-	/* size of physical stacked register partition plus 8 bytes: */
-	if (num_phys_stacked > max_num_phys_stacked) {
-		ia64_patch_phys_stack_reg(num_phys_stacked*8 + 8);
-		max_num_phys_stacked = num_phys_stacked;
-	}
-}
-
-void __init arch_cpu_finalize_init(void)
-{
-	ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles,
-			       (unsigned long) __end___mckinley_e9_bundles);
-}
-
-static int __init run_dmi_scan(void)
-{
-	dmi_setup();
-	return 0;
-}
-core_initcall(run_dmi_scan);
diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h
deleted file mode 100644
index 58a36ce6c26e..000000000000
--- a/arch/ia64/kernel/sigframe.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-struct sigscratch {
-	unsigned long scratch_unat;	/* ar.unat for the general registers saved in pt */
-	unsigned long ar_pfs;		/* for syscalls, the user-level function-state  */
-	struct pt_regs pt;
-};
-
-struct sigframe {
-	/*
-	 * Place signal handler args where user-level unwinder can find them easily.
-	 * DO NOT MOVE THESE.  They are part of the IA-64 Linux ABI and there is
-	 * user-level code that depends on their presence!
-	 */
-	unsigned long arg0;		/* signum */
-	unsigned long arg1;		/* siginfo pointer */
-	unsigned long arg2;		/* sigcontext pointer */
-	/*
-	 * End of architected state.
-	 */
-
-	void __user *handler;		/* pointer to the plabel of the signal handler */
-	struct siginfo info;
-	struct sigcontext sc;
-};
-
-extern void ia64_do_signal (struct sigscratch *, long);
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
deleted file mode 100644
index 51cf6a7ec158..000000000000
--- a/arch/ia64/kernel/signal.c
+++ /dev/null
@@ -1,412 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific signal handling support.
- *
- * Copyright (C) 1999-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Derived from i386 and Alpha versions.
- */
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/ptrace.h>
-#include <linux/sched.h>
-#include <linux/signal.h>
-#include <linux/smp.h>
-#include <linux/stddef.h>
-#include <linux/tty.h>
-#include <linux/binfmts.h>
-#include <linux/unistd.h>
-#include <linux/wait.h>
-
-#include <asm/intrinsics.h>
-#include <linux/uaccess.h>
-#include <asm/rse.h>
-#include <asm/sigcontext.h>
-
-#include "sigframe.h"
-
-#define DEBUG_SIG	0
-#define STACK_ALIGN	16		/* minimal alignment for stack pointer */
-
-#if _NSIG_WORDS > 1
-# define PUT_SIGSET(k,u)	__copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t))
-# define GET_SIGSET(k,u)	__copy_from_user((k)->sig, (u)->sig, sizeof(sigset_t))
-#else
-# define PUT_SIGSET(k,u)	__put_user((k)->sig[0], &(u)->sig[0])
-# define GET_SIGSET(k,u)	__get_user((k)->sig[0], &(u)->sig[0])
-#endif
-
-static long
-restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr)
-{
-	unsigned long ip, flags, nat, um, cfm, rsc;
-	long err;
-
-	/* Always make any pending restarted system calls return -EINTR */
-	current->restart_block.fn = do_no_restart_syscall;
-
-	/* restore scratch that always needs gets updated during signal delivery: */
-	err  = __get_user(flags, &sc->sc_flags);
-	err |= __get_user(nat, &sc->sc_nat);
-	err |= __get_user(ip, &sc->sc_ip);			/* instruction pointer */
-	err |= __get_user(cfm, &sc->sc_cfm);
-	err |= __get_user(um, &sc->sc_um);			/* user mask */
-	err |= __get_user(rsc, &sc->sc_ar_rsc);
-	err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat);
-	err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr);
-	err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs);
-	err |= __get_user(scr->pt.pr, &sc->sc_pr);		/* predicates */
-	err |= __get_user(scr->pt.b0, &sc->sc_br[0]);		/* b0 (rp) */
-	err |= __get_user(scr->pt.b6, &sc->sc_br[6]);		/* b6 */
-	err |= __copy_from_user(&scr->pt.r1, &sc->sc_gr[1], 8);	/* r1 */
-	err |= __copy_from_user(&scr->pt.r8, &sc->sc_gr[8], 4*8);	/* r8-r11 */
-	err |= __copy_from_user(&scr->pt.r12, &sc->sc_gr[12], 2*8);	/* r12-r13 */
-	err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8);	/* r15 */
-
-	scr->pt.cr_ifs = cfm | (1UL << 63);
-	scr->pt.ar_rsc = rsc | (3 << 2); /* force PL3 */
-
-	/* establish new instruction pointer: */
-	scr->pt.cr_iip = ip & ~0x3UL;
-	ia64_psr(&scr->pt)->ri = ip & 0x3;
-	scr->pt.cr_ipsr = (scr->pt.cr_ipsr & ~IA64_PSR_UM) | (um & IA64_PSR_UM);
-
-	scr->scratch_unat = ia64_put_scratch_nat_bits(&scr->pt, nat);
-
-	if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) {
-		/* Restore most scratch-state only when not in syscall. */
-		err |= __get_user(scr->pt.ar_ccv, &sc->sc_ar_ccv);		/* ar.ccv */
-		err |= __get_user(scr->pt.b7, &sc->sc_br[7]);			/* b7 */
-		err |= __get_user(scr->pt.r14, &sc->sc_gr[14]);			/* r14 */
-		err |= __copy_from_user(&scr->pt.ar_csd, &sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */
-		err |= __copy_from_user(&scr->pt.r2, &sc->sc_gr[2], 2*8);	/* r2-r3 */
-		err |= __copy_from_user(&scr->pt.r16, &sc->sc_gr[16], 16*8);	/* r16-r31 */
-	}
-
-	if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) {
-		struct ia64_psr *psr = ia64_psr(&scr->pt);
-
-		err |= __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16);
-		psr->mfh = 0;	/* drop signal handler's fph contents... */
-		preempt_disable();
-		if (psr->dfh)
-			ia64_drop_fpu(current);
-		else {
-			/* We already own the local fph, otherwise psr->dfh wouldn't be 0.  */
-			__ia64_load_fpu(current->thread.fph);
-			ia64_set_local_fpu_owner(current);
-		}
-		preempt_enable();
-	}
-	return err;
-}
-
-long
-ia64_rt_sigreturn (struct sigscratch *scr)
-{
-	extern char ia64_strace_leave_kernel, ia64_leave_kernel;
-	struct sigcontext __user *sc;
-	sigset_t set;
-	long retval;
-
-	sc = &((struct sigframe __user *) (scr->pt.r12 + 16))->sc;
-
-	/*
-	 * When we return to the previously executing context, r8 and r10 have already
-	 * been setup the way we want them.  Indeed, if the signal wasn't delivered while
-	 * in a system call, we must not touch r8 or r10 as otherwise user-level state
-	 * could be corrupted.
-	 */
-	retval = (long) &ia64_leave_kernel;
-	if (test_thread_flag(TIF_SYSCALL_TRACE)
-	    || test_thread_flag(TIF_SYSCALL_AUDIT))
-		/*
-		 * strace expects to be notified after sigreturn returns even though the
-		 * context to which we return may not be in the middle of a syscall.
-		 * Thus, the return-value that strace displays for sigreturn is
-		 * meaningless.
-		 */
-		retval = (long) &ia64_strace_leave_kernel;
-
-	if (!access_ok(sc, sizeof(*sc)))
-		goto give_sigsegv;
-
-	if (GET_SIGSET(&set, &sc->sc_mask))
-		goto give_sigsegv;
-
-	set_current_blocked(&set);
-
-	if (restore_sigcontext(sc, scr))
-		goto give_sigsegv;
-
-#if DEBUG_SIG
-	printk("SIG return (%s:%d): sp=%lx ip=%lx\n",
-	       current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip);
-#endif
-	if (restore_altstack(&sc->sc_stack))
-		goto give_sigsegv;
-	return retval;
-
-  give_sigsegv:
-	force_sig(SIGSEGV);
-	return retval;
-}
-
-/*
- * This does just the minimum required setup of sigcontext.
- * Specifically, it only installs data that is either not knowable at
- * the user-level or that gets modified before execution in the
- * trampoline starts.  Everything else is done at the user-level.
- */
-static long
-setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr)
-{
-	unsigned long flags = 0, ifs, cfm, nat;
-	long err = 0;
-
-	ifs = scr->pt.cr_ifs;
-
-	if (on_sig_stack((unsigned long) sc))
-		flags |= IA64_SC_FLAG_ONSTACK;
-	if ((ifs & (1UL << 63)) == 0)
-		/* if cr_ifs doesn't have the valid bit set, we got here through a syscall */
-		flags |= IA64_SC_FLAG_IN_SYSCALL;
-	cfm = ifs & ((1UL << 38) - 1);
-	ia64_flush_fph(current);
-	if ((current->thread.flags & IA64_THREAD_FPH_VALID)) {
-		flags |= IA64_SC_FLAG_FPH_VALID;
-		err = __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16);
-	}
-
-	nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat);
-
-	err |= __put_user(flags, &sc->sc_flags);
-	err |= __put_user(nat, &sc->sc_nat);
-	err |= PUT_SIGSET(mask, &sc->sc_mask);
-	err |= __put_user(cfm, &sc->sc_cfm);
-	err |= __put_user(scr->pt.cr_ipsr & IA64_PSR_UM, &sc->sc_um);
-	err |= __put_user(scr->pt.ar_rsc, &sc->sc_ar_rsc);
-	err |= __put_user(scr->pt.ar_unat, &sc->sc_ar_unat);		/* ar.unat */
-	err |= __put_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr);		/* ar.fpsr */
-	err |= __put_user(scr->pt.ar_pfs, &sc->sc_ar_pfs);
-	err |= __put_user(scr->pt.pr, &sc->sc_pr);			/* predicates */
-	err |= __put_user(scr->pt.b0, &sc->sc_br[0]);			/* b0 (rp) */
-	err |= __put_user(scr->pt.b6, &sc->sc_br[6]);			/* b6 */
-	err |= __copy_to_user(&sc->sc_gr[1], &scr->pt.r1, 8);		/* r1 */
-	err |= __copy_to_user(&sc->sc_gr[8], &scr->pt.r8, 4*8);		/* r8-r11 */
-	err |= __copy_to_user(&sc->sc_gr[12], &scr->pt.r12, 2*8);	/* r12-r13 */
-	err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8);		/* r15 */
-	err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip);
-
-	if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) {
-		/* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */
-		err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv);		/* ar.ccv */
-		err |= __put_user(scr->pt.b7, &sc->sc_br[7]);			/* b7 */
-		err |= __put_user(scr->pt.r14, &sc->sc_gr[14]);			/* r14 */
-		err |= __copy_to_user(&sc->sc_ar25, &scr->pt.ar_csd, 2*8); /* ar.csd & ar.ssd */
-		err |= __copy_to_user(&sc->sc_gr[2], &scr->pt.r2, 2*8);		/* r2-r3 */
-		err |= __copy_to_user(&sc->sc_gr[16], &scr->pt.r16, 16*8);	/* r16-r31 */
-	}
-	return err;
-}
-
-/*
- * Check whether the register-backing store is already on the signal stack.
- */
-static inline int
-rbs_on_sig_stack (unsigned long bsp)
-{
-	return (bsp - current->sas_ss_sp < current->sas_ss_size);
-}
-
-static long
-setup_frame(struct ksignal *ksig, sigset_t *set, struct sigscratch *scr)
-{
-	extern char __kernel_sigtramp[];
-	unsigned long tramp_addr, new_rbs = 0, new_sp;
-	struct sigframe __user *frame;
-	long err;
-
-	new_sp = scr->pt.r12;
-	tramp_addr = (unsigned long) __kernel_sigtramp;
-	if (ksig->ka.sa.sa_flags & SA_ONSTACK) {
-		int onstack = sas_ss_flags(new_sp);
-
-		if (onstack == 0) {
-			new_sp = current->sas_ss_sp + current->sas_ss_size;
-			/*
-			 * We need to check for the register stack being on the
-			 * signal stack separately, because it's switched
-			 * separately (memory stack is switched in the kernel,
-			 * register stack is switched in the signal trampoline).
-			 */
-			if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
-				new_rbs = ALIGN(current->sas_ss_sp,
-						sizeof(long));
-		} else if (onstack == SS_ONSTACK) {
-			unsigned long check_sp;
-
-			/*
-			 * If we are on the alternate signal stack and would
-			 * overflow it, don't. Return an always-bogus address
-			 * instead so we will die with SIGSEGV.
-			 */
-			check_sp = (new_sp - sizeof(*frame)) & -STACK_ALIGN;
-			if (!likely(on_sig_stack(check_sp))) {
-				force_sigsegv(ksig->sig);
-				return 1;
-			}
-		}
-	}
-	frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN);
-
-	if (!access_ok(frame, sizeof(*frame))) {
-		force_sigsegv(ksig->sig);
-		return 1;
-	}
-
-	err  = __put_user(ksig->sig, &frame->arg0);
-	err |= __put_user(&frame->info, &frame->arg1);
-	err |= __put_user(&frame->sc, &frame->arg2);
-	err |= __put_user(new_rbs, &frame->sc.sc_rbs_base);
-	err |= __put_user(0, &frame->sc.sc_loadrs);	/* initialize to zero */
-	err |= __put_user(ksig->ka.sa.sa_handler, &frame->handler);
-
-	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-
-	err |= __save_altstack(&frame->sc.sc_stack, scr->pt.r12);
-	err |= setup_sigcontext(&frame->sc, set, scr);
-
-	if (unlikely(err)) {
-		force_sigsegv(ksig->sig);
-		return 1;
-	}
-
-	scr->pt.r12 = (unsigned long) frame - 16;	/* new stack pointer */
-	scr->pt.ar_fpsr = FPSR_DEFAULT;			/* reset fpsr for signal handler */
-	scr->pt.cr_iip = tramp_addr;
-	ia64_psr(&scr->pt)->ri = 0;			/* start executing in first slot */
-	ia64_psr(&scr->pt)->be = 0;			/* force little-endian byte-order */
-	/*
-	 * Force the interruption function mask to zero.  This has no effect when a
-	 * system-call got interrupted by a signal (since, in that case, scr->pt_cr_ifs is
-	 * ignored), but it has the desirable effect of making it possible to deliver a
-	 * signal with an incomplete register frame (which happens when a mandatory RSE
-	 * load faults).  Furthermore, it has no negative effect on the getting the user's
-	 * dirty partition preserved, because that's governed by scr->pt.loadrs.
-	 */
-	scr->pt.cr_ifs = (1UL << 63);
-
-	/*
-	 * Note: this affects only the NaT bits of the scratch regs (the ones saved in
-	 * pt_regs), which is exactly what we want.
-	 */
-	scr->scratch_unat = 0; /* ensure NaT bits of r12 is clear */
-
-#if DEBUG_SIG
-	printk("SIG deliver (%s:%d): sig=%d sp=%lx ip=%lx handler=%p\n",
-	       current->comm, current->pid, ksig->sig, scr->pt.r12, frame->sc.sc_ip, frame->handler);
-#endif
-	return 0;
-}
-
-static long
-handle_signal (struct ksignal *ksig, struct sigscratch *scr)
-{
-	int ret = setup_frame(ksig, sigmask_to_save(), scr);
-
-	if (!ret)
-		signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP));
-
-	return ret;
-}
-
-/*
- * Note that `init' is a special process: it doesn't get signals it doesn't want to
- * handle.  Thus you cannot kill init even with a SIGKILL even by mistake.
- */
-void
-ia64_do_signal (struct sigscratch *scr, long in_syscall)
-{
-	long restart = in_syscall;
-	long errno = scr->pt.r8;
-	struct ksignal ksig;
-
-	/*
-	 * This only loops in the rare cases of handle_signal() failing, in which case we
-	 * need to push through a forced SIGSEGV.
-	 */
-	while (1) {
-		if (!get_signal(&ksig))
-			break;
-
-		/*
-		 * get_signal() may have run a debugger (via notify_parent())
-		 * and the debugger may have modified the state (e.g., to arrange for an
-		 * inferior call), thus it's important to check for restarting _after_
-		 * get_signal().
-		 */
-		if ((long) scr->pt.r10 != -1)
-			/*
-			 * A system calls has to be restarted only if one of the error codes
-			 * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned.  If r10
-			 * isn't -1 then r8 doesn't hold an error code and we don't need to
-			 * restart the syscall, so we can clear the "restart" flag here.
-			 */
-			restart = 0;
-
-		if (ksig.sig <= 0)
-			break;
-
-		if (unlikely(restart)) {
-			switch (errno) {
-			case ERESTART_RESTARTBLOCK:
-			case ERESTARTNOHAND:
-				scr->pt.r8 = EINTR;
-				/* note: scr->pt.r10 is already -1 */
-				break;
-			case ERESTARTSYS:
-				if ((ksig.ka.sa.sa_flags & SA_RESTART) == 0) {
-					scr->pt.r8 = EINTR;
-					/* note: scr->pt.r10 is already -1 */
-					break;
-				}
-				fallthrough;
-			case ERESTARTNOINTR:
-				ia64_decrement_ip(&scr->pt);
-				restart = 0; /* don't restart twice if handle_signal() fails... */
-			}
-		}
-
-		/*
-		 * Whee!  Actually deliver the signal.  If the delivery failed, we need to
-		 * continue to iterate in this loop so we can deliver the SIGSEGV...
-		 */
-		if (handle_signal(&ksig, scr))
-			return;
-	}
-
-	/* Did we come from a system call? */
-	if (restart) {
-		/* Restart the system call - no handlers present */
-		if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR
-		    || errno == ERESTART_RESTARTBLOCK)
-		{
-			/*
-			 * Note: the syscall number is in r15 which is saved in
-			 * pt_regs so all we need to do here is adjust ip so that
-			 * the "break" instruction gets re-executed.
-			 */
-			ia64_decrement_ip(&scr->pt);
-			if (errno == ERESTART_RESTARTBLOCK)
-				scr->pt.r15 = __NR_restart_syscall;
-		}
-	}
-
-	/* if there's no signal to deliver, we just put the saved sigmask
-	 * back */
-	restore_saved_sigmask();
-}
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
deleted file mode 100644
index ea4f009a232b..000000000000
--- a/arch/ia64/kernel/smp.c
+++ /dev/null
@@ -1,335 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SMP Support
- *
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999, 2001, 2003 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Lots of stuff stolen from arch/alpha/kernel/smp.c
- *
- * 01/05/16 Rohit Seth <rohit.seth@intel.com>  IA64-SMP functions. Reorganized
- * the existing code (on the lines of x86 port).
- * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_jiffy
- * calibration on each CPU.
- * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> fixed logical processor id
- * 00/03/31 Rohit Seth <rohit.seth@intel.com>	Fixes for Bootstrap Processor
- * & cpu_online_map now gets done here (instead of setup.c)
- * 99/10/05 davidm	Update to bring it in sync with new command-line processing
- *  scheme.
- * 10/13/00 Goutham Rao <goutham.rao@intel.com> Updated smp_call_function and
- *		smp_call_function_single to resend IPI on timeouts
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/smp.h>
-#include <linux/kernel_stat.h>
-#include <linux/mm.h>
-#include <linux/cache.h>
-#include <linux/delay.h>
-#include <linux/efi.h>
-#include <linux/bitops.h>
-#include <linux/kexec.h>
-
-#include <linux/atomic.h>
-#include <asm/current.h>
-#include <asm/delay.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/tlbflush.h>
-#include <asm/unistd.h>
-#include <asm/mca.h>
-#include <asm/xtp.h>
-
-/*
- * Note: alignment of 4 entries/cacheline was empirically determined
- * to be a good tradeoff between hot cachelines & spreading the array
- * across too many cacheline.
- */
-static struct local_tlb_flush_counts {
-	unsigned int count;
-} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS],
-				     shadow_flush_counts);
-
-#define IPI_CALL_FUNC		0
-#define IPI_CPU_STOP		1
-#define IPI_CALL_FUNC_SINGLE	2
-#define IPI_KDUMP_CPU_STOP	3
-
-/* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
-static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, ipi_operation);
-
-extern void cpu_halt (void);
-
-static void
-stop_this_cpu(void)
-{
-	/*
-	 * Remove this CPU:
-	 */
-	set_cpu_online(smp_processor_id(), false);
-	max_xtp();
-	local_irq_disable();
-	cpu_halt();
-}
-
-void
-cpu_die(void)
-{
-	max_xtp();
-	local_irq_disable();
-	cpu_halt();
-	/* Should never be here */
-	BUG();
-	for (;;);
-}
-
-irqreturn_t
-handle_IPI (int irq, void *dev_id)
-{
-	int this_cpu = get_cpu();
-	unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation);
-	unsigned long ops;
-
-	mb();	/* Order interrupt and bit testing. */
-	while ((ops = xchg(pending_ipis, 0)) != 0) {
-		mb();	/* Order bit clearing and data access. */
-		do {
-			unsigned long which;
-
-			which = ffz(~ops);
-			ops &= ~(1 << which);
-
-			switch (which) {
-			case IPI_CPU_STOP:
-				stop_this_cpu();
-				break;
-			case IPI_CALL_FUNC:
-				generic_smp_call_function_interrupt();
-				break;
-			case IPI_CALL_FUNC_SINGLE:
-				generic_smp_call_function_single_interrupt();
-				break;
-#ifdef CONFIG_KEXEC
-			case IPI_KDUMP_CPU_STOP:
-				unw_init_running(kdump_cpu_freeze, NULL);
-				break;
-#endif
-			default:
-				printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n",
-						this_cpu, which);
-				break;
-			}
-		} while (ops);
-		mb();	/* Order data access and bit testing. */
-	}
-	put_cpu();
-	return IRQ_HANDLED;
-}
-
-
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_single (int dest_cpu, int op)
-{
-	set_bit(op, &per_cpu(ipi_operation, dest_cpu));
-	ia64_send_ipi(dest_cpu, IA64_IPI_VECTOR, IA64_IPI_DM_INT, 0);
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_allbutself (int op)
-{
-	unsigned int i;
-
-	for_each_online_cpu(i) {
-		if (i != smp_processor_id())
-			send_IPI_single(i, op);
-	}
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_mask(const struct cpumask *mask, int op)
-{
-	unsigned int cpu;
-
-	for_each_cpu(cpu, mask) {
-			send_IPI_single(cpu, op);
-	}
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_all (int op)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		send_IPI_single(i, op);
-	}
-}
-
-/*
- * Called with preemption disabled.
- */
-static inline void
-send_IPI_self (int op)
-{
-	send_IPI_single(smp_processor_id(), op);
-}
-
-#ifdef CONFIG_KEXEC
-void
-kdump_smp_send_stop(void)
-{
- 	send_IPI_allbutself(IPI_KDUMP_CPU_STOP);
-}
-
-void
-kdump_smp_send_init(void)
-{
-	unsigned int cpu, self_cpu;
-	self_cpu = smp_processor_id();
-	for_each_online_cpu(cpu) {
-		if (cpu != self_cpu) {
-			if(kdump_status[cpu] == 0)
-				ia64_send_ipi(cpu, 0, IA64_IPI_DM_INIT, 0);
-		}
-	}
-}
-#endif
-/*
- * Called with preemption disabled.
- */
-void
-arch_smp_send_reschedule (int cpu)
-{
-	ia64_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
-}
-EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
-
-/*
- * Called with preemption disabled.
- */
-static void
-smp_send_local_flush_tlb (int cpu)
-{
-	ia64_send_ipi(cpu, IA64_IPI_LOCAL_TLB_FLUSH, IA64_IPI_DM_INT, 0);
-}
-
-void
-smp_local_flush_tlb(void)
-{
-	/*
-	 * Use atomic ops. Otherwise, the load/increment/store sequence from
-	 * a "++" operation can have the line stolen between the load & store.
-	 * The overhead of the atomic op in negligible in this case & offers
-	 * significant benefit for the brief periods where lots of cpus
-	 * are simultaneously flushing TLBs.
-	 */
-	ia64_fetchadd(1, &local_tlb_flush_counts[smp_processor_id()].count, acq);
-	local_flush_tlb_all();
-}
-
-#define FLUSH_DELAY	5 /* Usec backoff to eliminate excessive cacheline bouncing */
-
-void
-smp_flush_tlb_cpumask(cpumask_t xcpumask)
-{
-	unsigned short *counts = __ia64_per_cpu_var(shadow_flush_counts);
-	cpumask_t cpumask = xcpumask;
-	int mycpu, cpu, flush_mycpu = 0;
-
-	preempt_disable();
-	mycpu = smp_processor_id();
-
-	for_each_cpu(cpu, &cpumask)
-		counts[cpu] = local_tlb_flush_counts[cpu].count & 0xffff;
-
-	mb();
-	for_each_cpu(cpu, &cpumask) {
-		if (cpu == mycpu)
-			flush_mycpu = 1;
-		else
-			smp_send_local_flush_tlb(cpu);
-	}
-
-	if (flush_mycpu)
-		smp_local_flush_tlb();
-
-	for_each_cpu(cpu, &cpumask)
-		while(counts[cpu] == (local_tlb_flush_counts[cpu].count & 0xffff))
-			udelay(FLUSH_DELAY);
-
-	preempt_enable();
-}
-
-void
-smp_flush_tlb_all (void)
-{
-	on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1);
-}
-
-void
-smp_flush_tlb_mm (struct mm_struct *mm)
-{
-	cpumask_var_t cpus;
-	preempt_disable();
-	/* this happens for the common case of a single-threaded fork():  */
-	if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1))
-	{
-		local_finish_flush_tlb_mm(mm);
-		preempt_enable();
-		return;
-	}
-	if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) {
-		smp_call_function((void (*)(void *))local_finish_flush_tlb_mm,
-			mm, 1);
-	} else {
-		cpumask_copy(cpus, mm_cpumask(mm));
-		smp_call_function_many(cpus,
-			(void (*)(void *))local_finish_flush_tlb_mm, mm, 1);
-		free_cpumask_var(cpus);
-	}
-	local_irq_disable();
-	local_finish_flush_tlb_mm(mm);
-	local_irq_enable();
-	preempt_enable();
-}
-
-void arch_send_call_function_single_ipi(int cpu)
-{
-	send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
-}
-
-void arch_send_call_function_ipi_mask(const struct cpumask *mask)
-{
-	send_IPI_mask(mask, IPI_CALL_FUNC);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-void
-smp_send_stop (void)
-{
-	send_IPI_allbutself(IPI_CPU_STOP);
-}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
deleted file mode 100644
index d0e935cf2093..000000000000
--- a/arch/ia64/kernel/smpboot.c
+++ /dev/null
@@ -1,839 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SMP boot-related support
- *
- * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2001, 2004-2005 Intel Corp
- * 	Rohit Seth <rohit.seth@intel.com>
- * 	Suresh Siddha <suresh.b.siddha@intel.com>
- * 	Gordon Jin <gordon.jin@intel.com>
- *	Ashok Raj  <ashok.raj@intel.com>
- *
- * 01/05/16 Rohit Seth <rohit.seth@intel.com>	Moved SMP booting functions from smp.c to here.
- * 01/04/27 David Mosberger <davidm@hpl.hp.com>	Added ITC synching code.
- * 02/07/31 David Mosberger <davidm@hpl.hp.com>	Switch over to hotplug-CPU boot-sequence.
- *						smp_boot_cpus()/smp_commence() is replaced by
- *						smp_prepare_cpus()/__cpu_up()/smp_cpus_done().
- * 04/06/21 Ashok Raj		<ashok.raj@intel.com> Added CPU Hotplug Support
- * 04/12/26 Jin Gordon <gordon.jin@intel.com>
- * 04/12/26 Rohit Seth <rohit.seth@intel.com>
- *						Add multi-threading and multi-core detection
- * 05/01/30 Suresh Siddha <suresh.b.siddha@intel.com>
- *						Setup cpu_sibling_map and cpu_core_map
- */
-
-#include <linux/module.h>
-#include <linux/acpi.h>
-#include <linux/memblock.h>
-#include <linux/cpu.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/mm.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/spinlock.h>
-#include <linux/efi.h>
-#include <linux/percpu.h>
-#include <linux/bitops.h>
-
-#include <linux/atomic.h>
-#include <asm/cache.h>
-#include <asm/current.h>
-#include <asm/delay.h>
-#include <asm/efi.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/mca.h>
-#include <asm/page.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/tlbflush.h>
-#include <asm/unistd.h>
-
-#define SMP_DEBUG 0
-
-#if SMP_DEBUG
-#define Dprintk(x...)  printk(x)
-#else
-#define Dprintk(x...)
-#endif
-
-#ifdef CONFIG_HOTPLUG_CPU
-#ifdef CONFIG_PERMIT_BSP_REMOVE
-#define bsp_remove_ok	1
-#else
-#define bsp_remove_ok	0
-#endif
-
-/*
- * Global array allocated for NR_CPUS at boot time
- */
-struct sal_to_os_boot sal_boot_rendez_state[NR_CPUS];
-
-/*
- * start_ap in head.S uses this to store current booting cpu
- * info.
- */
-struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0];
-
-#define set_brendez_area(x) (sal_state_for_booting_cpu = &sal_boot_rendez_state[(x)]);
-
-#else
-#define set_brendez_area(x)
-#endif
-
-
-/*
- * ITC synchronization related stuff:
- */
-#define MASTER	(0)
-#define SLAVE	(SMP_CACHE_BYTES/8)
-
-#define NUM_ROUNDS	64	/* magic value */
-#define NUM_ITERS	5	/* likewise */
-
-static DEFINE_SPINLOCK(itc_sync_lock);
-static volatile unsigned long go[SLAVE + 1];
-
-#define DEBUG_ITC_SYNC	0
-
-extern void start_ap (void);
-extern unsigned long ia64_iobase;
-
-struct task_struct *task_for_booting_cpu;
-
-/*
- * State for each CPU
- */
-DEFINE_PER_CPU(int, cpu_state);
-
-cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
-EXPORT_SYMBOL(cpu_core_map);
-DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
-EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
-
-int smp_num_siblings = 1;
-
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-volatile int ia64_cpu_to_sapicid[NR_CPUS];
-EXPORT_SYMBOL(ia64_cpu_to_sapicid);
-
-static cpumask_t cpu_callin_map;
-
-struct smp_boot_data smp_boot_data __initdata;
-
-unsigned long ap_wakeup_vector = -1; /* External Int use to wakeup APs */
-
-char __initdata no_int_routing;
-
-unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */
-
-#ifdef CONFIG_FORCE_CPEI_RETARGET
-#define CPEI_OVERRIDE_DEFAULT	(1)
-#else
-#define CPEI_OVERRIDE_DEFAULT	(0)
-#endif
-
-unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT;
-
-static int __init
-cmdl_force_cpei(char *str)
-{
-	int value=0;
-
-	get_option (&str, &value);
-	force_cpei_retarget = value;
-
-	return 1;
-}
-
-__setup("force_cpei=", cmdl_force_cpei);
-
-static int __init
-nointroute (char *str)
-{
-	no_int_routing = 1;
-	printk ("no_int_routing on\n");
-	return 1;
-}
-
-__setup("nointroute", nointroute);
-
-static void fix_b0_for_bsp(void)
-{
-#ifdef CONFIG_HOTPLUG_CPU
-	int cpuid;
-	static int fix_bsp_b0 = 1;
-
-	cpuid = smp_processor_id();
-
-	/*
-	 * Cache the b0 value on the first AP that comes up
-	 */
-	if (!(fix_bsp_b0 && cpuid))
-		return;
-
-	sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0];
-	printk ("Fixed BSP b0 value from CPU %d\n", cpuid);
-
-	fix_bsp_b0 = 0;
-#endif
-}
-
-void
-sync_master (void *arg)
-{
-	unsigned long flags, i;
-
-	go[MASTER] = 0;
-
-	local_irq_save(flags);
-	{
-		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
-			while (!go[MASTER])
-				cpu_relax();
-			go[MASTER] = 0;
-			go[SLAVE] = ia64_get_itc();
-		}
-	}
-	local_irq_restore(flags);
-}
-
-/*
- * Return the number of cycles by which our itc differs from the itc on the master
- * (time-keeper) CPU.  A positive number indicates our itc is ahead of the master,
- * negative that it is behind.
- */
-static inline long
-get_delta (long *rt, long *master)
-{
-	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
-	unsigned long tcenter, t0, t1, tm;
-	long i;
-
-	for (i = 0; i < NUM_ITERS; ++i) {
-		t0 = ia64_get_itc();
-		go[MASTER] = 1;
-		while (!(tm = go[SLAVE]))
-			cpu_relax();
-		go[SLAVE] = 0;
-		t1 = ia64_get_itc();
-
-		if (t1 - t0 < best_t1 - best_t0)
-			best_t0 = t0, best_t1 = t1, best_tm = tm;
-	}
-
-	*rt = best_t1 - best_t0;
-	*master = best_tm - best_t0;
-
-	/* average best_t0 and best_t1 without overflow: */
-	tcenter = (best_t0/2 + best_t1/2);
-	if (best_t0 % 2 + best_t1 % 2 == 2)
-		++tcenter;
-	return tcenter - best_tm;
-}
-
-/*
- * Synchronize ar.itc of the current (slave) CPU with the ar.itc of the MASTER CPU
- * (normally the time-keeper CPU).  We use a closed loop to eliminate the possibility of
- * unaccounted-for errors (such as getting a machine check in the middle of a calibration
- * step).  The basic idea is for the slave to ask the master what itc value it has and to
- * read its own itc before and after the master responds.  Each iteration gives us three
- * timestamps:
- *
- *	slave		master
- *
- *	t0 ---\
- *             ---\
- *		   --->
- *			tm
- *		   /---
- *	       /---
- *	t1 <---
- *
- *
- * The goal is to adjust the slave's ar.itc such that tm falls exactly half-way between t0
- * and t1.  If we achieve this, the clocks are synchronized provided the interconnect
- * between the slave and the master is symmetric.  Even if the interconnect were
- * asymmetric, we would still know that the synchronization error is smaller than the
- * roundtrip latency (t0 - t1).
- *
- * When the interconnect is quiet and symmetric, this lets us synchronize the itc to
- * within one or two cycles.  However, we can only *guarantee* that the synchronization is
- * accurate to within a round-trip time, which is typically in the range of several
- * hundred cycles (e.g., ~500 cycles).  In practice, this means that the itc's are usually
- * almost perfectly synchronized, but we shouldn't assume that the accuracy is much better
- * than half a micro second or so.
- */
-void
-ia64_sync_itc (unsigned int master)
-{
-	long i, delta, adj, adjust_latency = 0, done = 0;
-	unsigned long flags, rt, master_time_stamp, bound;
-#if DEBUG_ITC_SYNC
-	struct {
-		long rt;	/* roundtrip time */
-		long master;	/* master's timestamp */
-		long diff;	/* difference between midpoint and master's timestamp */
-		long lat;	/* estimate of itc adjustment latency */
-	} t[NUM_ROUNDS];
-#endif
-
-	/*
-	 * Make sure local timer ticks are disabled while we sync.  If
-	 * they were enabled, we'd have to worry about nasty issues
-	 * like setting the ITC ahead of (or a long time before) the
-	 * next scheduled tick.
-	 */
-	BUG_ON((ia64_get_itv() & (1 << 16)) == 0);
-
-	go[MASTER] = 1;
-
-	if (smp_call_function_single(master, sync_master, NULL, 0) < 0) {
-		printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master);
-		return;
-	}
-
-	while (go[MASTER])
-		cpu_relax();	/* wait for master to be ready */
-
-	spin_lock_irqsave(&itc_sync_lock, flags);
-	{
-		for (i = 0; i < NUM_ROUNDS; ++i) {
-			delta = get_delta(&rt, &master_time_stamp);
-			if (delta == 0) {
-				done = 1;	/* let's lock on to this... */
-				bound = rt;
-			}
-
-			if (!done) {
-				if (i > 0) {
-					adjust_latency += -delta;
-					adj = -delta + adjust_latency/4;
-				} else
-					adj = -delta;
-
-				ia64_set_itc(ia64_get_itc() + adj);
-			}
-#if DEBUG_ITC_SYNC
-			t[i].rt = rt;
-			t[i].master = master_time_stamp;
-			t[i].diff = delta;
-			t[i].lat = adjust_latency/4;
-#endif
-		}
-	}
-	spin_unlock_irqrestore(&itc_sync_lock, flags);
-
-#if DEBUG_ITC_SYNC
-	for (i = 0; i < NUM_ROUNDS; ++i)
-		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
-		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
-#endif
-
-	printk(KERN_INFO "CPU %d: synchronized ITC with CPU %u (last diff %ld cycles, "
-	       "maxerr %lu cycles)\n", smp_processor_id(), master, delta, rt);
-}
-
-/*
- * Ideally sets up per-cpu profiling hooks.  Doesn't do much now...
- */
-static inline void smp_setup_percpu_timer(void)
-{
-}
-
-static void
-smp_callin (void)
-{
-	int cpuid, phys_id, itc_master;
-	struct cpuinfo_ia64 *last_cpuinfo, *this_cpuinfo;
-	extern void ia64_init_itm(void);
-	extern volatile int time_keeper_id;
-
-	cpuid = smp_processor_id();
-	phys_id = hard_smp_processor_id();
-	itc_master = time_keeper_id;
-
-	if (cpu_online(cpuid)) {
-		printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n",
-		       phys_id, cpuid);
-		BUG();
-	}
-
-	fix_b0_for_bsp();
-
-	/*
-	 * numa_node_id() works after this.
-	 */
-	set_numa_node(cpu_to_node_map[cpuid]);
-	set_numa_mem(local_memory_node(cpu_to_node_map[cpuid]));
-
-	spin_lock(&vector_lock);
-	/* Setup the per cpu irq handling data structures */
-	__setup_vector_irq(cpuid);
-	notify_cpu_starting(cpuid);
-	set_cpu_online(cpuid, true);
-	per_cpu(cpu_state, cpuid) = CPU_ONLINE;
-	spin_unlock(&vector_lock);
-
-	smp_setup_percpu_timer();
-
-	ia64_mca_cmc_vector_setup();	/* Setup vector on AP */
-
-	local_irq_enable();
-
-	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
-		/*
-		 * Synchronize the ITC with the BP.  Need to do this after irqs are
-		 * enabled because ia64_sync_itc() calls smp_call_function_single(), which
-		 * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls
-		 * local_bh_enable(), which bugs out if irqs are not enabled...
-		 */
-		Dprintk("Going to syncup ITC with ITC Master.\n");
-		ia64_sync_itc(itc_master);
-	}
-
-	/*
-	 * Get our bogomips.
-	 */
-	ia64_init_itm();
-
-	/*
-	 * Delay calibration can be skipped if new processor is identical to the
-	 * previous processor.
-	 */
-	last_cpuinfo = cpu_data(cpuid - 1);
-	this_cpuinfo = local_cpu_data;
-	if (last_cpuinfo->itc_freq != this_cpuinfo->itc_freq ||
-	    last_cpuinfo->proc_freq != this_cpuinfo->proc_freq ||
-	    last_cpuinfo->features != this_cpuinfo->features ||
-	    last_cpuinfo->revision != this_cpuinfo->revision ||
-	    last_cpuinfo->family != this_cpuinfo->family ||
-	    last_cpuinfo->archrev != this_cpuinfo->archrev ||
-	    last_cpuinfo->model != this_cpuinfo->model)
-		calibrate_delay();
-	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
-
-	/*
-	 * Allow the master to continue.
-	 */
-	cpumask_set_cpu(cpuid, &cpu_callin_map);
-	Dprintk("Stack on CPU %d at about %p\n",cpuid, &cpuid);
-}
-
-
-/*
- * Activate a secondary processor.  head.S calls this.
- */
-int
-start_secondary (void *unused)
-{
-	/* Early console may use I/O ports */
-	ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase));
-#ifndef CONFIG_PRINTK_TIME
-	Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id());
-#endif
-	efi_map_pal_code();
-	cpu_init();
-	smp_callin();
-
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-	return 0;
-}
-
-static int
-do_boot_cpu (int sapicid, int cpu, struct task_struct *idle)
-{
-	int timeout;
-
-	task_for_booting_cpu = idle;
-	Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid);
-
-	set_brendez_area(cpu);
-	ia64_send_ipi(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0);
-
-	/*
-	 * Wait 10s total for the AP to start
-	 */
-	Dprintk("Waiting on callin_map ...");
-	for (timeout = 0; timeout < 100000; timeout++) {
-		if (cpumask_test_cpu(cpu, &cpu_callin_map))
-			break;  /* It has booted */
-		barrier(); /* Make sure we re-read cpu_callin_map */
-		udelay(100);
-	}
-	Dprintk("\n");
-
-	if (!cpumask_test_cpu(cpu, &cpu_callin_map)) {
-		printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid);
-		ia64_cpu_to_sapicid[cpu] = -1;
-		set_cpu_online(cpu, false);  /* was set in smp_callin() */
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int __init
-decay (char *str)
-{
-	int ticks;
-	get_option (&str, &ticks);
-	return 1;
-}
-
-__setup("decay=", decay);
-
-/*
- * Initialize the logical CPU number to SAPICID mapping
- */
-void __init
-smp_build_cpu_map (void)
-{
-	int sapicid, cpu, i;
-	int boot_cpu_id = hard_smp_processor_id();
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		ia64_cpu_to_sapicid[cpu] = -1;
-	}
-
-	ia64_cpu_to_sapicid[0] = boot_cpu_id;
-	init_cpu_present(cpumask_of(0));
-	set_cpu_possible(0, true);
-	for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
-		sapicid = smp_boot_data.cpu_phys_id[i];
-		if (sapicid == boot_cpu_id)
-			continue;
-		set_cpu_present(cpu, true);
-		set_cpu_possible(cpu, true);
-		ia64_cpu_to_sapicid[cpu] = sapicid;
-		cpu++;
-	}
-}
-
-/*
- * Cycle through the APs sending Wakeup IPIs to boot each.
- */
-void __init
-smp_prepare_cpus (unsigned int max_cpus)
-{
-	int boot_cpu_id = hard_smp_processor_id();
-
-	/*
-	 * Initialize the per-CPU profiling counter/multiplier
-	 */
-
-	smp_setup_percpu_timer();
-
-	cpumask_set_cpu(0, &cpu_callin_map);
-
-	local_cpu_data->loops_per_jiffy = loops_per_jiffy;
-	ia64_cpu_to_sapicid[0] = boot_cpu_id;
-
-	printk(KERN_INFO "Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id);
-
-	current_thread_info()->cpu = 0;
-
-	/*
-	 * If SMP should be disabled, then really disable it!
-	 */
-	if (!max_cpus) {
-		printk(KERN_INFO "SMP mode deactivated.\n");
-		init_cpu_online(cpumask_of(0));
-		init_cpu_present(cpumask_of(0));
-		init_cpu_possible(cpumask_of(0));
-		return;
-	}
-}
-
-void smp_prepare_boot_cpu(void)
-{
-	set_cpu_online(smp_processor_id(), true);
-	cpumask_set_cpu(smp_processor_id(), &cpu_callin_map);
-	set_numa_node(cpu_to_node_map[smp_processor_id()]);
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static inline void
-clear_cpu_sibling_map(int cpu)
-{
-	int i;
-
-	for_each_cpu(i, &per_cpu(cpu_sibling_map, cpu))
-		cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, i));
-	for_each_cpu(i, &cpu_core_map[cpu])
-		cpumask_clear_cpu(cpu, &cpu_core_map[i]);
-
-	per_cpu(cpu_sibling_map, cpu) = cpu_core_map[cpu] = CPU_MASK_NONE;
-}
-
-static void
-remove_siblinginfo(int cpu)
-{
-	if (cpu_data(cpu)->threads_per_core == 1 &&
-	    cpu_data(cpu)->cores_per_socket == 1) {
-		cpumask_clear_cpu(cpu, &cpu_core_map[cpu]);
-		cpumask_clear_cpu(cpu, &per_cpu(cpu_sibling_map, cpu));
-		return;
-	}
-
-	/* remove it from all sibling map's */
-	clear_cpu_sibling_map(cpu);
-}
-
-extern void fixup_irqs(void);
-
-int migrate_platform_irqs(unsigned int cpu)
-{
-	int new_cpei_cpu;
-	struct irq_data *data = NULL;
-	const struct cpumask *mask;
-	int 		retval = 0;
-
-	/*
-	 * dont permit CPEI target to removed.
-	 */
-	if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) {
-		printk ("CPU (%d) is CPEI Target\n", cpu);
-		if (can_cpei_retarget()) {
-			/*
-			 * Now re-target the CPEI to a different processor
-			 */
-			new_cpei_cpu = cpumask_any(cpu_online_mask);
-			mask = cpumask_of(new_cpei_cpu);
-			set_cpei_target_cpu(new_cpei_cpu);
-			data = irq_get_irq_data(ia64_cpe_irq);
-			/*
-			 * Switch for now, immediately, we need to do fake intr
-			 * as other interrupts, but need to study CPEI behaviour with
-			 * polling before making changes.
-			 */
-			if (data && data->chip) {
-				data->chip->irq_disable(data);
-				data->chip->irq_set_affinity(data, mask, false);
-				data->chip->irq_enable(data);
-				printk ("Re-targeting CPEI to cpu %d\n", new_cpei_cpu);
-			}
-		}
-		if (!data) {
-			printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu);
-			retval = -EBUSY;
-		}
-	}
-	return retval;
-}
-
-/* must be called with cpucontrol mutex held */
-int __cpu_disable(void)
-{
-	int cpu = smp_processor_id();
-
-	/*
-	 * dont permit boot processor for now
-	 */
-	if (cpu == 0 && !bsp_remove_ok) {
-		printk ("Your platform does not support removal of BSP\n");
-		return (-EBUSY);
-	}
-
-	set_cpu_online(cpu, false);
-
-	if (migrate_platform_irqs(cpu)) {
-		set_cpu_online(cpu, true);
-		return -EBUSY;
-	}
-
-	remove_siblinginfo(cpu);
-	fixup_irqs();
-	local_flush_tlb_all();
-	cpumask_clear_cpu(cpu, &cpu_callin_map);
-	return 0;
-}
-
-void __cpu_die(unsigned int cpu)
-{
-	unsigned int i;
-
-	for (i = 0; i < 100; i++) {
-		/* They ack this in play_dead by setting CPU_DEAD */
-		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
-		{
-			printk ("CPU %d is now offline\n", cpu);
-			return;
-		}
-		msleep(100);
-	}
- 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-void
-smp_cpus_done (unsigned int dummy)
-{
-	int cpu;
-	unsigned long bogosum = 0;
-
-	/*
-	 * Allow the user to impress friends.
-	 */
-
-	for_each_online_cpu(cpu) {
-		bogosum += cpu_data(cpu)->loops_per_jiffy;
-	}
-
-	printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
-	       (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
-}
-
-static inline void set_cpu_sibling_map(int cpu)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		if ((cpu_data(cpu)->socket_id == cpu_data(i)->socket_id)) {
-			cpumask_set_cpu(i, &cpu_core_map[cpu]);
-			cpumask_set_cpu(cpu, &cpu_core_map[i]);
-			if (cpu_data(cpu)->core_id == cpu_data(i)->core_id) {
-				cpumask_set_cpu(i,
-						&per_cpu(cpu_sibling_map, cpu));
-				cpumask_set_cpu(cpu,
-						&per_cpu(cpu_sibling_map, i));
-			}
-		}
-	}
-}
-
-int
-__cpu_up(unsigned int cpu, struct task_struct *tidle)
-{
-	int ret;
-	int sapicid;
-
-	sapicid = ia64_cpu_to_sapicid[cpu];
-	if (sapicid == -1)
-		return -EINVAL;
-
-	/*
-	 * Already booted cpu? not valid anymore since we dont
-	 * do idle loop tightspin anymore.
-	 */
-	if (cpumask_test_cpu(cpu, &cpu_callin_map))
-		return -EINVAL;
-
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-	/* Processor goes to start_secondary(), sets online flag */
-	ret = do_boot_cpu(sapicid, cpu, tidle);
-	if (ret < 0)
-		return ret;
-
-	if (cpu_data(cpu)->threads_per_core == 1 &&
-	    cpu_data(cpu)->cores_per_socket == 1) {
-		cpumask_set_cpu(cpu, &per_cpu(cpu_sibling_map, cpu));
-		cpumask_set_cpu(cpu, &cpu_core_map[cpu]);
-		return 0;
-	}
-
-	set_cpu_sibling_map(cpu);
-
-	return 0;
-}
-
-/*
- * Assume that CPUs have been discovered by some platform-dependent interface.  For
- * SoftSDV/Lion, that would be ACPI.
- *
- * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP().
- */
-void __init
-init_smp_config(void)
-{
-	struct fptr {
-		unsigned long fp;
-		unsigned long gp;
-	} *ap_startup;
-	long sal_ret;
-
-	/* Tell SAL where to drop the APs.  */
-	ap_startup = (struct fptr *) start_ap;
-	sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ,
-				       ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0);
-	if (sal_ret < 0)
-		printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
-		       ia64_sal_strerror(sal_ret));
-}
-
-/*
- * identify_siblings(cpu) gets called from identify_cpu. This populates the 
- * information related to logical execution units in per_cpu_data structure.
- */
-void identify_siblings(struct cpuinfo_ia64 *c)
-{
-	long status;
-	u16 pltid;
-	pal_logical_to_physical_t info;
-
-	status = ia64_pal_logical_to_phys(-1, &info);
-	if (status != PAL_STATUS_SUCCESS) {
-		if (status != PAL_STATUS_UNIMPLEMENTED) {
-			printk(KERN_ERR
-				"ia64_pal_logical_to_phys failed with %ld\n",
-				status);
-			return;
-		}
-
-		info.overview_ppid = 0;
-		info.overview_cpp  = 1;
-		info.overview_tpc  = 1;
-	}
-
-	status = ia64_sal_physical_id_info(&pltid);
-	if (status != PAL_STATUS_SUCCESS) {
-		if (status != PAL_STATUS_UNIMPLEMENTED)
-			printk(KERN_ERR
-				"ia64_sal_pltid failed with %ld\n",
-				status);
-		return;
-	}
-
-	c->socket_id =  (pltid << 8) | info.overview_ppid;
-
-	if (info.overview_cpp == 1 && info.overview_tpc == 1)
-		return;
-
-	c->cores_per_socket = info.overview_cpp;
-	c->threads_per_core = info.overview_tpc;
-	c->num_log = info.overview_num_log;
-
-	c->core_id = info.log1_cid;
-	c->thread_id = info.log1_tid;
-}
-
-/*
- * returns non zero, if multi-threading is enabled
- * on at least one physical package. Due to hotplug cpu
- * and (maxcpus=), all threads may not necessarily be enabled
- * even though the processor supports multi-threading.
- */
-int is_multithreading_enabled(void)
-{
-	int i, j;
-
-	for_each_present_cpu(i) {
-		for_each_present_cpu(j) {
-			if (j == i)
-				continue;
-			if ((cpu_data(j)->socket_id == cpu_data(i)->socket_id)) {
-				if (cpu_data(j)->core_id == cpu_data(i)->core_id)
-					return 1;
-			}
-		}
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(is_multithreading_enabled);
diff --git a/arch/ia64/kernel/stacktrace.c b/arch/ia64/kernel/stacktrace.c
deleted file mode 100644
index 6e583a6bd2f6..000000000000
--- a/arch/ia64/kernel/stacktrace.c
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * arch/ia64/kernel/stacktrace.c
- *
- * Stack trace management functions
- *
- */
-#include <linux/sched.h>
-#include <linux/stacktrace.h>
-#include <linux/module.h>
-
-static void
-ia64_do_save_stack(struct unw_frame_info *info, void *arg)
-{
-	struct stack_trace *trace = arg;
-	unsigned long ip;
-	int skip = trace->skip;
-
-	trace->nr_entries = 0;
-	do {
-		unw_get_ip(info, &ip);
-		if (ip == 0)
-			break;
-		if (skip == 0) {
-			trace->entries[trace->nr_entries++] = ip;
-			if (trace->nr_entries == trace->max_entries)
-				break;
-		} else
-			skip--;
-	} while (unw_unwind(info) >= 0);
-}
-
-/*
- * Save stack-backtrace addresses into a stack_trace buffer.
- */
-void save_stack_trace(struct stack_trace *trace)
-{
-	unw_init_running(ia64_do_save_stack, trace);
-}
-EXPORT_SYMBOL(save_stack_trace);
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
deleted file mode 100644
index eb561cc93632..000000000000
--- a/arch/ia64/kernel/sys_ia64.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This file contains various system calls that have different calling
- * conventions on different platforms.
- *
- * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task_stack.h>
-#include <linux/shm.h>
-#include <linux/file.h>		/* doh, must come after sched.h... */
-#include <linux/smp.h>
-#include <linux/syscalls.h>
-#include <linux/highuid.h>
-#include <linux/hugetlb.h>
-
-#include <asm/shmparam.h>
-#include <linux/uaccess.h>
-
-unsigned long
-arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len,
-			unsigned long pgoff, unsigned long flags)
-{
-	long map_shared = (flags & MAP_SHARED);
-	unsigned long align_mask = 0;
-	struct mm_struct *mm = current->mm;
-	struct vm_unmapped_area_info info;
-
-	if (len > RGN_MAP_LIMIT)
-		return -ENOMEM;
-
-	/* handle fixed mapping: prevent overlap with huge pages */
-	if (flags & MAP_FIXED) {
-		if (is_hugepage_only_range(mm, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-#ifdef CONFIG_HUGETLB_PAGE
-	if (REGION_NUMBER(addr) == RGN_HPAGE)
-		addr = 0;
-#endif
-	if (!addr)
-		addr = TASK_UNMAPPED_BASE;
-
-	if (map_shared && (TASK_SIZE > 0xfffffffful))
-		/*
-		 * For 64-bit tasks, align shared segments to 1MB to avoid potential
-		 * performance penalty due to virtual aliasing (see ASDM).  For 32-bit
-		 * tasks, we prefer to avoid exhausting the address space too quickly by
-		 * limiting alignment to a single page.
-		 */
-		align_mask = PAGE_MASK & (SHMLBA - 1);
-
-	info.flags = 0;
-	info.length = len;
-	info.low_limit = addr;
-	info.high_limit = TASK_SIZE;
-	info.align_mask = align_mask;
-	info.align_offset = pgoff << PAGE_SHIFT;
-	return vm_unmapped_area(&info);
-}
-
-asmlinkage long
-ia64_getpriority (int which, int who)
-{
-	long prio;
-
-	prio = sys_getpriority(which, who);
-	if (prio >= 0) {
-		force_successful_syscall_return();
-		prio = 20 - prio;
-	}
-	return prio;
-}
-
-/* XXX obsolete, but leave it here until the old libc is gone... */
-asmlinkage unsigned long
-sys_getpagesize (void)
-{
-	return PAGE_SIZE;
-}
-
-asmlinkage unsigned long
-ia64_brk (unsigned long brk)
-{
-	unsigned long retval = sys_brk(brk);
-	force_successful_syscall_return();
-	return retval;
-}
-
-/*
- * On IA-64, we return the two file descriptors in ret0 and ret1 (r8
- * and r9) as this is faster than doing a copy_to_user().
- */
-asmlinkage long
-sys_ia64_pipe (void)
-{
-	struct pt_regs *regs = task_pt_regs(current);
-	int fd[2];
-	int retval;
-
-	retval = do_pipe_flags(fd, 0);
-	if (retval)
-		goto out;
-	retval = fd[0];
-	regs->r9 = fd[1];
-  out:
-	return retval;
-}
-
-int ia64_mmap_check(unsigned long addr, unsigned long len,
-		unsigned long flags)
-{
-	unsigned long roff;
-
-	/*
-	 * Don't permit mappings into unmapped space, the virtual page table
-	 * of a region, or across a region boundary.  Note: RGN_MAP_LIMIT is
-	 * equal to 2^n-PAGE_SIZE (for some integer n <= 61) and len > 0.
-	 */
-	roff = REGION_OFFSET(addr);
-	if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len)))
-		return -EINVAL;
-	return 0;
-}
-
-/*
- * mmap2() is like mmap() except that the offset is expressed in units
- * of PAGE_SIZE (instead of bytes).  This allows to mmap2() (pieces
- * of) files that are larger than the address space of the CPU.
- */
-asmlinkage unsigned long
-sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff)
-{
-	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
-	if (!IS_ERR_VALUE(addr))
-		force_successful_syscall_return();
-	return addr;
-}
-
-asmlinkage unsigned long
-sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off)
-{
-	if (offset_in_page(off) != 0)
-		return -EINVAL;
-
-	addr = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-	if (!IS_ERR_VALUE(addr))
-		force_successful_syscall_return();
-	return addr;
-}
-
-asmlinkage unsigned long
-ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
-	     unsigned long new_addr)
-{
-	addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
-	if (!IS_ERR_VALUE(addr))
-		force_successful_syscall_return();
-	return addr;
-}
-
-asmlinkage long
-ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp)
-{
-	struct timespec64 rtn_tp;
-	s64 tick_ns;
-
-	/*
-	 * ia64's clock_gettime() syscall is implemented as a vdso call
-	 * fsys_clock_gettime(). Currently it handles only
-	 * CLOCK_REALTIME and CLOCK_MONOTONIC. Both are based on
-	 * 'ar.itc' counter which gets incremented at a constant
-	 * frequency. It's usually 400MHz, ~2.5x times slower than CPU
-	 * clock frequency. Which is almost a 1ns hrtimer, but not quite.
-	 *
-	 * Let's special-case these timers to report correct precision
-	 * based on ITC frequency and not HZ frequency for supported
-	 * clocks.
-	 */
-	switch (which_clock) {
-	case CLOCK_REALTIME:
-	case CLOCK_MONOTONIC:
-		tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
-		rtn_tp = ns_to_timespec64(tick_ns);
-		return put_timespec64(&rtn_tp, tp);
-	}
-
-	return sys_clock_getres(which_clock, tp);
-}
diff --git a/arch/ia64/kernel/syscalls/Makefile b/arch/ia64/kernel/syscalls/Makefile
deleted file mode 100644
index d009f927a048..000000000000
--- a/arch/ia64/kernel/syscalls/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-kapi := arch/$(SRCARCH)/include/generated/asm
-uapi := arch/$(SRCARCH)/include/generated/uapi/asm
-
-$(shell mkdir -p $(uapi) $(kapi))
-
-syscall := $(src)/syscall.tbl
-syshdr := $(srctree)/scripts/syscallhdr.sh
-systbl := $(srctree)/scripts/syscalltbl.sh
-
-quiet_cmd_syshdr = SYSHDR  $@
-      cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --offset __NR_Linux $< $@
-
-quiet_cmd_systbl = SYSTBL  $@
-      cmd_systbl = $(CONFIG_SHELL) $(systbl) $< $@
-
-$(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE
-	$(call if_changed,syshdr)
-
-$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE
-	$(call if_changed,systbl)
-
-uapisyshdr-y		+= unistd_64.h
-kapisyshdr-y		+= syscall_table.h
-
-uapisyshdr-y	:= $(addprefix $(uapi)/, $(uapisyshdr-y))
-kapisyshdr-y	:= $(addprefix $(kapi)/, $(kapisyshdr-y))
-targets		+= $(addprefix ../../../../, $(uapisyshdr-y) $(kapisyshdr-y))
-
-PHONY += all
-all: $(uapisyshdr-y) $(kapisyshdr-y)
-	@:
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
deleted file mode 100644
index 83d8609aec03..000000000000
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ /dev/null
@@ -1,375 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
-#
-# Linux system call numbers and entry vectors for ia64
-#
-# The format is:
-# <number> <abi> <name> <entry point>
-#
-# Add 1024 to <number> will get the actual system call number
-#
-# The <abi> is always "common" for this file
-#
-0	common	ni_syscall			sys_ni_syscall
-1	common	exit				sys_exit
-2	common	read				sys_read
-3	common	write				sys_write
-4	common	open				sys_open
-5	common	close				sys_close
-6	common	creat				sys_creat
-7	common	link				sys_link
-8	common	unlink				sys_unlink
-9	common	execve				ia64_execve
-10	common	chdir				sys_chdir
-11	common	fchdir				sys_fchdir
-12	common	utimes				sys_utimes
-13	common	mknod				sys_mknod
-14	common	chmod				sys_chmod
-15	common	chown				sys_chown
-16	common	lseek				sys_lseek
-17	common	getpid				sys_getpid
-18	common	getppid				sys_getppid
-19	common	mount				sys_mount
-20	common	umount2				sys_umount
-21	common	setuid				sys_setuid
-22	common	getuid				sys_getuid
-23	common	geteuid				sys_geteuid
-24	common	ptrace				sys_ptrace
-25	common	access				sys_access
-26	common	sync				sys_sync
-27	common	fsync				sys_fsync
-28	common	fdatasync			sys_fdatasync
-29	common	kill				sys_kill
-30	common	rename				sys_rename
-31	common	mkdir				sys_mkdir
-32	common	rmdir				sys_rmdir
-33	common	dup				sys_dup
-34	common	pipe				sys_ia64_pipe
-35	common	times				sys_times
-36	common	brk				ia64_brk
-37	common	setgid				sys_setgid
-38	common	getgid				sys_getgid
-39	common	getegid				sys_getegid
-40	common	acct				sys_acct
-41	common	ioctl				sys_ioctl
-42	common	fcntl				sys_fcntl
-43	common	umask				sys_umask
-44	common	chroot				sys_chroot
-45	common	ustat				sys_ustat
-46	common	dup2				sys_dup2
-47	common	setreuid			sys_setreuid
-48	common	setregid			sys_setregid
-49	common	getresuid			sys_getresuid
-50	common	setresuid			sys_setresuid
-51	common	getresgid			sys_getresgid
-52	common	setresgid			sys_setresgid
-53	common	getgroups			sys_getgroups
-54	common	setgroups			sys_setgroups
-55	common	getpgid				sys_getpgid
-56	common	setpgid				sys_setpgid
-57	common	setsid				sys_setsid
-58	common	getsid				sys_getsid
-59	common	sethostname			sys_sethostname
-60	common	setrlimit			sys_setrlimit
-61	common	getrlimit			sys_getrlimit
-62	common	getrusage			sys_getrusage
-63	common	gettimeofday			sys_gettimeofday
-64	common	settimeofday			sys_settimeofday
-65	common	select				sys_select
-66	common	poll				sys_poll
-67	common	symlink				sys_symlink
-68	common	readlink			sys_readlink
-69	common	uselib				sys_uselib
-70	common	swapon				sys_swapon
-71	common	swapoff				sys_swapoff
-72	common	reboot				sys_reboot
-73	common	truncate			sys_truncate
-74	common	ftruncate			sys_ftruncate
-75	common	fchmod				sys_fchmod
-76	common	fchown				sys_fchown
-77	common	getpriority			ia64_getpriority
-78	common	setpriority			sys_setpriority
-79	common	statfs				sys_statfs
-80	common	fstatfs				sys_fstatfs
-81	common	gettid				sys_gettid
-82	common	semget				sys_semget
-83	common	semop				sys_semop
-84	common	semctl				sys_semctl
-85	common	msgget				sys_msgget
-86	common	msgsnd				sys_msgsnd
-87	common	msgrcv				sys_msgrcv
-88	common	msgctl				sys_msgctl
-89	common	shmget				sys_shmget
-90	common	shmat				sys_shmat
-91	common	shmdt				sys_shmdt
-92	common	shmctl				sys_shmctl
-93	common	syslog				sys_syslog
-94	common	setitimer			sys_setitimer
-95	common	getitimer			sys_getitimer
-# 1120 was old_stat
-# 1121 was old_lstat
-# 1122 was old_fstat
-99	common	vhangup				sys_vhangup
-100	common	lchown				sys_lchown
-101	common	remap_file_pages		sys_remap_file_pages
-102	common	wait4				sys_wait4
-103	common	sysinfo				sys_sysinfo
-104	common	clone				sys_clone
-105	common	setdomainname			sys_setdomainname
-106	common	uname				sys_newuname
-107	common	adjtimex			sys_adjtimex
-# 1132 was create_module
-109	common	init_module			sys_init_module
-110	common	delete_module			sys_delete_module
-# 1135 was get_kernel_syms
-# 1136 was query_module
-113	common	quotactl			sys_quotactl
-114	common	bdflush				sys_ni_syscall
-115	common	sysfs				sys_sysfs
-116	common	personality			sys_personality
-117	common	afs_syscall			sys_ni_syscall
-118	common	setfsuid			sys_setfsuid
-119	common	setfsgid			sys_setfsgid
-120	common	getdents			sys_getdents
-121	common	flock				sys_flock
-122	common	readv				sys_readv
-123	common	writev				sys_writev
-124	common	pread64				sys_pread64
-125	common	pwrite64			sys_pwrite64
-126	common	_sysctl				sys_ni_syscall
-127	common	mmap				sys_mmap
-128	common	munmap				sys_munmap
-129	common	mlock				sys_mlock
-130	common	mlockall			sys_mlockall
-131	common	mprotect			sys_mprotect
-132	common	mremap				ia64_mremap
-133	common	msync				sys_msync
-134	common	munlock				sys_munlock
-135	common	munlockall			sys_munlockall
-136	common	sched_getparam			sys_sched_getparam
-137	common	sched_setparam			sys_sched_setparam
-138	common	sched_getscheduler		sys_sched_getscheduler
-139	common	sched_setscheduler		sys_sched_setscheduler
-140	common	sched_yield			sys_sched_yield
-141	common	sched_get_priority_max		sys_sched_get_priority_max
-142	common	sched_get_priority_min		sys_sched_get_priority_min
-143	common	sched_rr_get_interval		sys_sched_rr_get_interval
-144	common	nanosleep			sys_nanosleep
-145	common	nfsservctl			sys_ni_syscall
-146	common	prctl				sys_prctl
-147	common	old_getpagesize			sys_getpagesize
-148	common	mmap2				sys_mmap2
-149	common	pciconfig_read			sys_pciconfig_read
-150	common	pciconfig_write			sys_pciconfig_write
-151	common	perfmonctl			sys_ni_syscall
-152	common	sigaltstack			sys_sigaltstack
-153	common	rt_sigaction			sys_rt_sigaction
-154	common	rt_sigpending			sys_rt_sigpending
-155	common	rt_sigprocmask			sys_rt_sigprocmask
-156	common	rt_sigqueueinfo			sys_rt_sigqueueinfo
-157	common	rt_sigreturn			sys_rt_sigreturn
-158	common	rt_sigsuspend			sys_rt_sigsuspend
-159	common	rt_sigtimedwait			sys_rt_sigtimedwait
-160	common	getcwd				sys_getcwd
-161	common	capget				sys_capget
-162	common	capset				sys_capset
-163	common	sendfile			sys_sendfile64
-164	common	getpmsg				sys_ni_syscall
-165	common	putpmsg				sys_ni_syscall
-166	common	socket				sys_socket
-167	common	bind				sys_bind
-168	common	connect				sys_connect
-169	common	listen				sys_listen
-170	common	accept				sys_accept
-171	common	getsockname			sys_getsockname
-172	common	getpeername			sys_getpeername
-173	common	socketpair			sys_socketpair
-174	common	send				sys_send
-175	common	sendto				sys_sendto
-176	common	recv				sys_recv
-177	common	recvfrom			sys_recvfrom
-178	common	shutdown			sys_shutdown
-179	common	setsockopt			sys_setsockopt
-180	common	getsockopt			sys_getsockopt
-181	common	sendmsg				sys_sendmsg
-182	common	recvmsg				sys_recvmsg
-183	common	pivot_root			sys_pivot_root
-184	common	mincore				sys_mincore
-185	common	madvise				sys_madvise
-186	common	stat				sys_newstat
-187	common	lstat				sys_newlstat
-188	common	fstat				sys_newfstat
-189	common	clone2				sys_clone2
-190	common	getdents64			sys_getdents64
-191	common	getunwind			sys_getunwind
-192	common	readahead			sys_readahead
-193	common	setxattr			sys_setxattr
-194	common	lsetxattr			sys_lsetxattr
-195	common	fsetxattr			sys_fsetxattr
-196	common	getxattr			sys_getxattr
-197	common	lgetxattr			sys_lgetxattr
-198	common	fgetxattr			sys_fgetxattr
-199	common	listxattr			sys_listxattr
-200	common	llistxattr			sys_llistxattr
-201	common	flistxattr			sys_flistxattr
-202	common	removexattr			sys_removexattr
-203	common	lremovexattr			sys_lremovexattr
-204	common	fremovexattr			sys_fremovexattr
-205	common	tkill				sys_tkill
-206	common	futex				sys_futex
-207	common	sched_setaffinity		sys_sched_setaffinity
-208	common	sched_getaffinity		sys_sched_getaffinity
-209	common	set_tid_address			sys_set_tid_address
-210	common	fadvise64			sys_fadvise64_64
-211	common	tgkill				sys_tgkill
-212	common	exit_group			sys_exit_group
-213	common	lookup_dcookie			sys_lookup_dcookie
-214	common	io_setup			sys_io_setup
-215	common	io_destroy			sys_io_destroy
-216	common	io_getevents			sys_io_getevents
-217	common	io_submit			sys_io_submit
-218	common	io_cancel			sys_io_cancel
-219	common	epoll_create			sys_epoll_create
-220	common	epoll_ctl			sys_epoll_ctl
-221	common	epoll_wait			sys_epoll_wait
-222	common	restart_syscall			sys_restart_syscall
-223	common	semtimedop			sys_semtimedop
-224	common	timer_create			sys_timer_create
-225	common	timer_settime			sys_timer_settime
-226	common	timer_gettime			sys_timer_gettime
-227	common	timer_getoverrun		sys_timer_getoverrun
-228	common	timer_delete			sys_timer_delete
-229	common	clock_settime			sys_clock_settime
-230	common	clock_gettime			sys_clock_gettime
-231	common	clock_getres			ia64_clock_getres
-232	common	clock_nanosleep			sys_clock_nanosleep
-233	common	fstatfs64			sys_fstatfs64
-234	common	statfs64			sys_statfs64
-235	common	mbind				sys_mbind
-236	common	get_mempolicy			sys_get_mempolicy
-237	common	set_mempolicy			sys_set_mempolicy
-238	common	mq_open				sys_mq_open
-239	common	mq_unlink			sys_mq_unlink
-240	common	mq_timedsend			sys_mq_timedsend
-241	common	mq_timedreceive			sys_mq_timedreceive
-242	common	mq_notify			sys_mq_notify
-243	common	mq_getsetattr			sys_mq_getsetattr
-244	common	kexec_load			sys_kexec_load
-245	common	vserver				sys_ni_syscall
-246	common	waitid				sys_waitid
-247	common	add_key				sys_add_key
-248	common	request_key			sys_request_key
-249	common	keyctl				sys_keyctl
-250	common	ioprio_set			sys_ioprio_set
-251	common	ioprio_get			sys_ioprio_get
-252	common	move_pages			sys_move_pages
-253	common	inotify_init			sys_inotify_init
-254	common	inotify_add_watch		sys_inotify_add_watch
-255	common	inotify_rm_watch		sys_inotify_rm_watch
-256	common	migrate_pages			sys_migrate_pages
-257	common	openat				sys_openat
-258	common	mkdirat				sys_mkdirat
-259	common	mknodat				sys_mknodat
-260	common	fchownat			sys_fchownat
-261	common	futimesat			sys_futimesat
-262	common	newfstatat			sys_newfstatat
-263	common	unlinkat			sys_unlinkat
-264	common	renameat			sys_renameat
-265	common	linkat				sys_linkat
-266	common	symlinkat			sys_symlinkat
-267	common	readlinkat			sys_readlinkat
-268	common	fchmodat			sys_fchmodat
-269	common	faccessat			sys_faccessat
-270	common	pselect6			sys_pselect6
-271	common	ppoll				sys_ppoll
-272	common	unshare				sys_unshare
-273	common	splice				sys_splice
-274	common	set_robust_list			sys_set_robust_list
-275	common	get_robust_list			sys_get_robust_list
-276	common	sync_file_range			sys_sync_file_range
-277	common	tee				sys_tee
-278	common	vmsplice			sys_vmsplice
-279	common	fallocate			sys_fallocate
-280	common	getcpu				sys_getcpu
-281	common	epoll_pwait			sys_epoll_pwait
-282	common	utimensat			sys_utimensat
-283	common	signalfd			sys_signalfd
-284	common	timerfd				sys_ni_syscall
-285	common	eventfd				sys_eventfd
-286	common	timerfd_create			sys_timerfd_create
-287	common	timerfd_settime			sys_timerfd_settime
-288	common	timerfd_gettime			sys_timerfd_gettime
-289	common	signalfd4			sys_signalfd4
-290	common	eventfd2			sys_eventfd2
-291	common	epoll_create1			sys_epoll_create1
-292	common	dup3				sys_dup3
-293	common	pipe2				sys_pipe2
-294	common	inotify_init1			sys_inotify_init1
-295	common	preadv				sys_preadv
-296	common	pwritev				sys_pwritev
-297	common	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo
-298	common	recvmmsg			sys_recvmmsg
-299	common	fanotify_init			sys_fanotify_init
-300	common	fanotify_mark			sys_fanotify_mark
-301	common	prlimit64			sys_prlimit64
-302	common	name_to_handle_at		sys_name_to_handle_at
-303	common	open_by_handle_at		sys_open_by_handle_at
-304	common	clock_adjtime			sys_clock_adjtime
-305	common	syncfs				sys_syncfs
-306	common	setns				sys_setns
-307	common	sendmmsg			sys_sendmmsg
-308	common	process_vm_readv		sys_process_vm_readv
-309	common	process_vm_writev		sys_process_vm_writev
-310	common	accept4				sys_accept4
-311	common	finit_module			sys_finit_module
-312	common	sched_setattr			sys_sched_setattr
-313	common	sched_getattr			sys_sched_getattr
-314	common	renameat2			sys_renameat2
-315	common	getrandom			sys_getrandom
-316	common	memfd_create			sys_memfd_create
-317	common	bpf				sys_bpf
-318	common	execveat			sys_execveat
-319	common	userfaultfd			sys_userfaultfd
-320	common	membarrier			sys_membarrier
-321	common	kcmp				sys_kcmp
-322	common	mlock2				sys_mlock2
-323	common	copy_file_range			sys_copy_file_range
-324	common	preadv2				sys_preadv2
-325	common	pwritev2			sys_pwritev2
-326	common	statx				sys_statx
-327	common	io_pgetevents			sys_io_pgetevents
-328	common	perf_event_open			sys_perf_event_open
-329	common	seccomp				sys_seccomp
-330	common	pkey_mprotect			sys_pkey_mprotect
-331	common	pkey_alloc			sys_pkey_alloc
-332	common	pkey_free			sys_pkey_free
-333	common	rseq				sys_rseq
-# 334 through 423 are reserved to sync up with other architectures
-424	common	pidfd_send_signal		sys_pidfd_send_signal
-425	common	io_uring_setup			sys_io_uring_setup
-426	common	io_uring_enter			sys_io_uring_enter
-427	common	io_uring_register		sys_io_uring_register
-428	common	open_tree			sys_open_tree
-429	common	move_mount			sys_move_mount
-430	common	fsopen				sys_fsopen
-431	common	fsconfig			sys_fsconfig
-432	common	fsmount				sys_fsmount
-433	common	fspick				sys_fspick
-434	common	pidfd_open			sys_pidfd_open
-# 435 reserved for clone3
-436	common	close_range			sys_close_range
-437	common	openat2				sys_openat2
-438	common	pidfd_getfd			sys_pidfd_getfd
-439	common	faccessat2			sys_faccessat2
-440	common	process_madvise			sys_process_madvise
-441	common	epoll_pwait2			sys_epoll_pwait2
-442	common	mount_setattr			sys_mount_setattr
-443	common	quotactl_fd			sys_quotactl_fd
-444	common	landlock_create_ruleset		sys_landlock_create_ruleset
-445	common	landlock_add_rule		sys_landlock_add_rule
-446	common	landlock_restrict_self		sys_landlock_restrict_self
-# 447 reserved for memfd_secret
-448	common	process_mrelease		sys_process_mrelease
-449	common  futex_waitv                     sys_futex_waitv
-450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
-451	common	cachestat			sys_cachestat
-452	common	fchmodat2			sys_fchmodat2
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
deleted file mode 100644
index 83ef044b63ef..000000000000
--- a/arch/ia64/kernel/time.c
+++ /dev/null
@@ -1,463 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/ia64/kernel/time.c
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- * Copyright (C) 1999-2000 VA Linux Systems
- * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com>
- */
-
-#include <linux/cpu.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/profile.h>
-#include <linux/sched.h>
-#include <linux/time.h>
-#include <linux/nmi.h>
-#include <linux/interrupt.h>
-#include <linux/efi.h>
-#include <linux/timex.h>
-#include <linux/timekeeper_internal.h>
-#include <linux/platform_device.h>
-#include <linux/sched/cputime.h>
-
-#include <asm/cputime.h>
-#include <asm/delay.h>
-#include <asm/efi.h>
-#include <asm/hw_irq.h>
-#include <asm/ptrace.h>
-#include <asm/sal.h>
-#include <asm/sections.h>
-
-#include "fsyscall_gtod_data.h"
-#include "irq.h"
-
-static u64 itc_get_cycles(struct clocksource *cs);
-
-struct fsyscall_gtod_data_t fsyscall_gtod_data;
-
-struct itc_jitter_data_t itc_jitter_data;
-
-volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
-
-#ifdef CONFIG_IA64_DEBUG_IRQ
-
-unsigned long last_cli_ip;
-EXPORT_SYMBOL(last_cli_ip);
-
-#endif
-
-static struct clocksource clocksource_itc = {
-	.name           = "itc",
-	.rating         = 350,
-	.read           = itc_get_cycles,
-	.mask           = CLOCKSOURCE_MASK(64),
-	.flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-static struct clocksource *itc_clocksource;
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-
-#include <linux/kernel_stat.h>
-
-extern u64 cycle_to_nsec(u64 cyc);
-
-void vtime_flush(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	u64 delta;
-
-	if (ti->utime)
-		account_user_time(tsk, cycle_to_nsec(ti->utime));
-
-	if (ti->gtime)
-		account_guest_time(tsk, cycle_to_nsec(ti->gtime));
-
-	if (ti->idle_time)
-		account_idle_time(cycle_to_nsec(ti->idle_time));
-
-	if (ti->stime) {
-		delta = cycle_to_nsec(ti->stime);
-		account_system_index_time(tsk, delta, CPUTIME_SYSTEM);
-	}
-
-	if (ti->hardirq_time) {
-		delta = cycle_to_nsec(ti->hardirq_time);
-		account_system_index_time(tsk, delta, CPUTIME_IRQ);
-	}
-
-	if (ti->softirq_time) {
-		delta = cycle_to_nsec(ti->softirq_time);
-		account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);
-	}
-
-	ti->utime = 0;
-	ti->gtime = 0;
-	ti->idle_time = 0;
-	ti->stime = 0;
-	ti->hardirq_time = 0;
-	ti->softirq_time = 0;
-}
-
-/*
- * Called from the context switch with interrupts disabled, to charge all
- * accumulated times to the current process, and to prepare accounting on
- * the next process.
- */
-void arch_vtime_task_switch(struct task_struct *prev)
-{
-	struct thread_info *pi = task_thread_info(prev);
-	struct thread_info *ni = task_thread_info(current);
-
-	ni->ac_stamp = pi->ac_stamp;
-	ni->ac_stime = ni->ac_utime = 0;
-}
-
-/*
- * Account time for a transition between system, hard irq or soft irq state.
- * Note that this function is called with interrupts enabled.
- */
-static __u64 vtime_delta(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	__u64 now, delta_stime;
-
-	WARN_ON_ONCE(!irqs_disabled());
-
-	now = ia64_get_itc();
-	delta_stime = now - ti->ac_stamp;
-	ti->ac_stamp = now;
-
-	return delta_stime;
-}
-
-void vtime_account_kernel(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-	__u64 stime = vtime_delta(tsk);
-
-	if (tsk->flags & PF_VCPU)
-		ti->gtime += stime;
-	else
-		ti->stime += stime;
-}
-EXPORT_SYMBOL_GPL(vtime_account_kernel);
-
-void vtime_account_idle(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-
-	ti->idle_time += vtime_delta(tsk);
-}
-
-void vtime_account_softirq(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-
-	ti->softirq_time += vtime_delta(tsk);
-}
-
-void vtime_account_hardirq(struct task_struct *tsk)
-{
-	struct thread_info *ti = task_thread_info(tsk);
-
-	ti->hardirq_time += vtime_delta(tsk);
-}
-
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-static irqreturn_t
-timer_interrupt (int irq, void *dev_id)
-{
-	unsigned long new_itm;
-
-	if (cpu_is_offline(smp_processor_id())) {
-		return IRQ_HANDLED;
-	}
-
-	new_itm = local_cpu_data->itm_next;
-
-	if (!time_after(ia64_get_itc(), new_itm))
-		printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n",
-		       ia64_get_itc(), new_itm);
-
-	while (1) {
-		new_itm += local_cpu_data->itm_delta;
-
-		legacy_timer_tick(smp_processor_id() == time_keeper_id);
-
-		local_cpu_data->itm_next = new_itm;
-
-		if (time_after(new_itm, ia64_get_itc()))
-			break;
-
-		/*
-		 * Allow IPIs to interrupt the timer loop.
-		 */
-		local_irq_enable();
-		local_irq_disable();
-	}
-
-	do {
-		/*
-		 * If we're too close to the next clock tick for
-		 * comfort, we increase the safety margin by
-		 * intentionally dropping the next tick(s).  We do NOT
-		 * update itm.next because that would force us to call
-		 * xtime_update() which in turn would let our clock run
-		 * too fast (with the potentially devastating effect
-		 * of losing monotony of time).
-		 */
-		while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2))
-			new_itm += local_cpu_data->itm_delta;
-		ia64_set_itm(new_itm);
-		/* double check, in case we got hit by a (slow) PMI: */
-	} while (time_after_eq(ia64_get_itc(), new_itm));
-	return IRQ_HANDLED;
-}
-
-/*
- * Encapsulate access to the itm structure for SMP.
- */
-void
-ia64_cpu_local_tick (void)
-{
-	int cpu = smp_processor_id();
-	unsigned long shift = 0, delta;
-
-	/* arrange for the cycle counter to generate a timer interrupt: */
-	ia64_set_itv(IA64_TIMER_VECTOR);
-
-	delta = local_cpu_data->itm_delta;
-	/*
-	 * Stagger the timer tick for each CPU so they don't occur all at (almost) the
-	 * same time:
-	 */
-	if (cpu) {
-		unsigned long hi = 1UL << ia64_fls(cpu);
-		shift = (2*(cpu - hi) + 1) * delta/hi/2;
-	}
-	local_cpu_data->itm_next = ia64_get_itc() + delta + shift;
-	ia64_set_itm(local_cpu_data->itm_next);
-}
-
-static int nojitter;
-
-static int __init nojitter_setup(char *str)
-{
-	nojitter = 1;
-	printk("Jitter checking for ITC timers disabled\n");
-	return 1;
-}
-
-__setup("nojitter", nojitter_setup);
-
-
-void ia64_init_itm(void)
-{
-	unsigned long platform_base_freq, itc_freq;
-	struct pal_freq_ratio itc_ratio, proc_ratio;
-	long status, platform_base_drift, itc_drift;
-
-	/*
-	 * According to SAL v2.6, we need to use a SAL call to determine the platform base
-	 * frequency and then a PAL call to determine the frequency ratio between the ITC
-	 * and the base frequency.
-	 */
-	status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
-				    &platform_base_freq, &platform_base_drift);
-	if (status != 0) {
-		printk(KERN_ERR "SAL_FREQ_BASE_PLATFORM failed: %s\n", ia64_sal_strerror(status));
-	} else {
-		status = ia64_pal_freq_ratios(&proc_ratio, NULL, &itc_ratio);
-		if (status != 0)
-			printk(KERN_ERR "PAL_FREQ_RATIOS failed with status=%ld\n", status);
-	}
-	if (status != 0) {
-		/* invent "random" values */
-		printk(KERN_ERR
-		       "SAL/PAL failed to obtain frequency info---inventing reasonable values\n");
-		platform_base_freq = 100000000;
-		platform_base_drift = -1;	/* no drift info */
-		itc_ratio.num = 3;
-		itc_ratio.den = 1;
-	}
-	if (platform_base_freq < 40000000) {
-		printk(KERN_ERR "Platform base frequency %lu bogus---resetting to 75MHz!\n",
-		       platform_base_freq);
-		platform_base_freq = 75000000;
-		platform_base_drift = -1;
-	}
-	if (!proc_ratio.den)
-		proc_ratio.den = 1;	/* avoid division by zero */
-	if (!itc_ratio.den)
-		itc_ratio.den = 1;	/* avoid division by zero */
-
-	itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den;
-
-	local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ;
-	printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, "
-	       "ITC freq=%lu.%03luMHz", smp_processor_id(),
-	       platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000,
-	       itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000);
-
-	if (platform_base_drift != -1) {
-		itc_drift = platform_base_drift*itc_ratio.num/itc_ratio.den;
-		printk("+/-%ldppm\n", itc_drift);
-	} else {
-		itc_drift = -1;
-		printk("\n");
-	}
-
-	local_cpu_data->proc_freq = (platform_base_freq*proc_ratio.num)/proc_ratio.den;
-	local_cpu_data->itc_freq = itc_freq;
-	local_cpu_data->cyc_per_usec = (itc_freq + USEC_PER_SEC/2) / USEC_PER_SEC;
-	local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT)
-					+ itc_freq/2)/itc_freq;
-
-	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
-#ifdef CONFIG_SMP
-		/* On IA64 in an SMP configuration ITCs are never accurately synchronized.
-		 * Jitter compensation requires a cmpxchg which may limit
-		 * the scalability of the syscalls for retrieving time.
-		 * The ITC synchronization is usually successful to within a few
-		 * ITC ticks but this is not a sure thing. If you need to improve
-		 * timer performance in SMP situations then boot the kernel with the
-		 * "nojitter" option. However, doing so may result in time fluctuating (maybe
-		 * even going backward) if the ITC offsets between the individual CPUs
-		 * are too large.
-		 */
-		if (!nojitter)
-			itc_jitter_data.itc_jitter = 1;
-#endif
-	} else
-		/*
-		 * ITC is drifty and we have not synchronized the ITCs in smpboot.c.
-		 * ITC values may fluctuate significantly between processors.
-		 * Clock should not be used for hrtimers. Mark itc as only
-		 * useful for boot and testing.
-		 *
-		 * Note that jitter compensation is off! There is no point of
-		 * synchronizing ITCs since they may be large differentials
-		 * that change over time.
-		 *
-		 * The only way to fix this would be to repeatedly sync the
-		 * ITCs. Until that time we have to avoid ITC.
-		 */
-		clocksource_itc.rating = 50;
-
-	/* avoid softlock up message when cpu is unplug and plugged again. */
-	touch_softlockup_watchdog();
-
-	/* Setup the CPU local timer tick */
-	ia64_cpu_local_tick();
-
-	if (!itc_clocksource) {
-		clocksource_register_hz(&clocksource_itc,
-						local_cpu_data->itc_freq);
-		itc_clocksource = &clocksource_itc;
-	}
-}
-
-static u64 itc_get_cycles(struct clocksource *cs)
-{
-	unsigned long lcycle, now, ret;
-
-	if (!itc_jitter_data.itc_jitter)
-		return get_cycles();
-
-	lcycle = itc_jitter_data.itc_lastcycle;
-	now = get_cycles();
-	if (lcycle && time_after(lcycle, now))
-		return lcycle;
-
-	/*
-	 * Keep track of the last timer value returned.
-	 * In an SMP environment, you could lose out in contention of
-	 * cmpxchg. If so, your cmpxchg returns new value which the
-	 * winner of contention updated to. Use the new value instead.
-	 */
-	ret = cmpxchg(&itc_jitter_data.itc_lastcycle, lcycle, now);
-	if (unlikely(ret != lcycle))
-		return ret;
-
-	return now;
-}
-
-void read_persistent_clock64(struct timespec64 *ts)
-{
-	efi_gettimeofday(ts);
-}
-
-void __init
-time_init (void)
-{
-	register_percpu_irq(IA64_TIMER_VECTOR, timer_interrupt, IRQF_IRQPOLL,
-			    "timer");
-	ia64_init_itm();
-}
-
-/*
- * Generic udelay assumes that if preemption is allowed and the thread
- * migrates to another CPU, that the ITC values are synchronized across
- * all CPUs.
- */
-static void
-ia64_itc_udelay (unsigned long usecs)
-{
-	unsigned long start = ia64_get_itc();
-	unsigned long end = start + usecs*local_cpu_data->cyc_per_usec;
-
-	while (time_before(ia64_get_itc(), end))
-		cpu_relax();
-}
-
-void (*ia64_udelay)(unsigned long usecs) = &ia64_itc_udelay;
-
-void
-udelay (unsigned long usecs)
-{
-	(*ia64_udelay)(usecs);
-}
-EXPORT_SYMBOL(udelay);
-
-/* IA64 doesn't cache the timezone */
-void update_vsyscall_tz(void)
-{
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	write_seqcount_begin(&fsyscall_gtod_data.seq);
-
-	/* copy vsyscall data */
-	fsyscall_gtod_data.clk_mask = tk->tkr_mono.mask;
-	fsyscall_gtod_data.clk_mult = tk->tkr_mono.mult;
-	fsyscall_gtod_data.clk_shift = tk->tkr_mono.shift;
-	fsyscall_gtod_data.clk_fsys_mmio = tk->tkr_mono.clock->archdata.fsys_mmio;
-	fsyscall_gtod_data.clk_cycle_last = tk->tkr_mono.cycle_last;
-
-	fsyscall_gtod_data.wall_time.sec = tk->xtime_sec;
-	fsyscall_gtod_data.wall_time.snsec = tk->tkr_mono.xtime_nsec;
-
-	fsyscall_gtod_data.monotonic_time.sec = tk->xtime_sec
-					      + tk->wall_to_monotonic.tv_sec;
-	fsyscall_gtod_data.monotonic_time.snsec = tk->tkr_mono.xtime_nsec
-						+ ((u64)tk->wall_to_monotonic.tv_nsec
-							<< tk->tkr_mono.shift);
-
-	/* normalize */
-	while (fsyscall_gtod_data.monotonic_time.snsec >=
-					(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
-		fsyscall_gtod_data.monotonic_time.snsec -=
-					((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
-		fsyscall_gtod_data.monotonic_time.sec++;
-	}
-
-	write_seqcount_end(&fsyscall_gtod_data.seq);
-}
-
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
deleted file mode 100644
index 94a848b06f15..000000000000
--- a/arch/ia64/kernel/topology.c
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * This file contains NUMA specific variables and functions which are used on
- * NUMA machines with contiguous memory.
- * 		2002/08/07 Erich Focht <efocht@ess.nec.de>
- * Populate cpu entries in sysfs for non-numa systems as well
- *  	Intel Corporation - Ashok Raj
- * 02/27/2006 Zhang, Yanmin
- *	Populate cpu cache entries in sysfs for cpu cache info
- */
-
-#include <linux/cpu.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/node.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/nodemask.h>
-#include <linux/notifier.h>
-#include <linux/export.h>
-#include <asm/mmzone.h>
-#include <asm/numa.h>
-#include <asm/cpu.h>
-
-static struct ia64_cpu *sysfs_cpus;
-
-void arch_fix_phys_package_id(int num, u32 slot)
-{
-#ifdef CONFIG_SMP
-	if (cpu_data(num)->socket_id == -1)
-		cpu_data(num)->socket_id = slot;
-#endif
-}
-EXPORT_SYMBOL_GPL(arch_fix_phys_package_id);
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-int __ref arch_register_cpu(int num)
-{
-	/*
-	 * If CPEI can be re-targeted or if this is not
-	 * CPEI target, then it is hotpluggable
-	 */
-	if (can_cpei_retarget() || !is_cpu_cpei_target(num))
-		sysfs_cpus[num].cpu.hotpluggable = 1;
-	map_cpu_to_node(num, node_cpuid[num].nid);
-	return register_cpu(&sysfs_cpus[num].cpu, num);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void __ref arch_unregister_cpu(int num)
-{
-	unregister_cpu(&sysfs_cpus[num].cpu);
-	unmap_cpu_from_node(num, cpu_to_node(num));
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else
-static int __init arch_register_cpu(int num)
-{
-	return register_cpu(&sysfs_cpus[num].cpu, num);
-}
-#endif /*CONFIG_HOTPLUG_CPU*/
-
-
-static int __init topology_init(void)
-{
-	int i, err = 0;
-
-	sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL);
-	if (!sysfs_cpus)
-		panic("kzalloc in topology_init failed - NR_CPUS too big?");
-
-	for_each_present_cpu(i) {
-		if((err = arch_register_cpu(i)))
-			goto out;
-	}
-out:
-	return err;
-}
-
-subsys_initcall(topology_init);
-
-
-/*
- * Export cpu cache information through sysfs
- */
-
-/*
- *  A bunch of string array to get pretty printing
- */
-static const char *cache_types[] = {
-	"",			/* not used */
-	"Instruction",
-	"Data",
-	"Unified"	/* unified */
-};
-
-static const char *cache_mattrib[]={
-	"WriteThrough",
-	"WriteBack",
-	"",		/* reserved */
-	""		/* reserved */
-};
-
-struct cache_info {
-	pal_cache_config_info_t	cci;
-	cpumask_t shared_cpu_map;
-	int level;
-	int type;
-	struct kobject kobj;
-};
-
-struct cpu_cache_info {
-	struct cache_info *cache_leaves;
-	int	num_cache_leaves;
-	struct kobject kobj;
-};
-
-static struct cpu_cache_info	all_cpu_cache_info[NR_CPUS];
-#define LEAF_KOBJECT_PTR(x,y)    (&all_cpu_cache_info[x].cache_leaves[y])
-
-#ifdef CONFIG_SMP
-static void cache_shared_cpu_map_setup(unsigned int cpu,
-		struct cache_info * this_leaf)
-{
-	pal_cache_shared_info_t	csi;
-	int num_shared, i = 0;
-	unsigned int j;
-
-	if (cpu_data(cpu)->threads_per_core <= 1 &&
-		cpu_data(cpu)->cores_per_socket <= 1) {
-		cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
-		return;
-	}
-
-	if (ia64_pal_cache_shared_info(this_leaf->level,
-					this_leaf->type,
-					0,
-					&csi) != PAL_STATUS_SUCCESS)
-		return;
-
-	num_shared = (int) csi.num_shared;
-	do {
-		for_each_possible_cpu(j)
-			if (cpu_data(cpu)->socket_id == cpu_data(j)->socket_id
-				&& cpu_data(j)->core_id == csi.log1_cid
-				&& cpu_data(j)->thread_id == csi.log1_tid)
-				cpumask_set_cpu(j, &this_leaf->shared_cpu_map);
-
-		i++;
-	} while (i < num_shared &&
-		ia64_pal_cache_shared_info(this_leaf->level,
-				this_leaf->type,
-				i,
-				&csi) == PAL_STATUS_SUCCESS);
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu,
-		struct cache_info * this_leaf)
-{
-	cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
-	return;
-}
-#endif
-
-static ssize_t show_coherency_line_size(struct cache_info *this_leaf,
-					char *buf)
-{
-	return sprintf(buf, "%u\n", 1 << this_leaf->cci.pcci_line_size);
-}
-
-static ssize_t show_ways_of_associativity(struct cache_info *this_leaf,
-					char *buf)
-{
-	return sprintf(buf, "%u\n", this_leaf->cci.pcci_assoc);
-}
-
-static ssize_t show_attributes(struct cache_info *this_leaf, char *buf)
-{
-	return sprintf(buf,
-			"%s\n",
-			cache_mattrib[this_leaf->cci.pcci_cache_attr]);
-}
-
-static ssize_t show_size(struct cache_info *this_leaf, char *buf)
-{
-	return sprintf(buf, "%uK\n", this_leaf->cci.pcci_cache_size / 1024);
-}
-
-static ssize_t show_number_of_sets(struct cache_info *this_leaf, char *buf)
-{
-	unsigned number_of_sets = this_leaf->cci.pcci_cache_size;
-	number_of_sets /= this_leaf->cci.pcci_assoc;
-	number_of_sets /= 1 << this_leaf->cci.pcci_line_size;
-
-	return sprintf(buf, "%u\n", number_of_sets);
-}
-
-static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
-{
-	cpumask_t shared_cpu_map;
-
-	cpumask_and(&shared_cpu_map,
-				&this_leaf->shared_cpu_map, cpu_online_mask);
-	return scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			 cpumask_pr_args(&shared_cpu_map));
-}
-
-static ssize_t show_type(struct cache_info *this_leaf, char *buf)
-{
-	int type = this_leaf->type + this_leaf->cci.pcci_unified;
-	return sprintf(buf, "%s\n", cache_types[type]);
-}
-
-static ssize_t show_level(struct cache_info *this_leaf, char *buf)
-{
-	return sprintf(buf, "%u\n", this_leaf->level);
-}
-
-struct cache_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct cache_info *, char *);
-	ssize_t (*store)(struct cache_info *, const char *, size_t count);
-};
-
-#ifdef define_one_ro
-	#undef define_one_ro
-#endif
-#define define_one_ro(_name) \
-	static struct cache_attr _name = \
-__ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(ways_of_associativity);
-define_one_ro(size);
-define_one_ro(number_of_sets);
-define_one_ro(shared_cpu_map);
-define_one_ro(attributes);
-
-static struct attribute * cache_default_attrs[] = {
-	&type.attr,
-	&level.attr,
-	&coherency_line_size.attr,
-	&ways_of_associativity.attr,
-	&attributes.attr,
-	&size.attr,
-	&number_of_sets.attr,
-	&shared_cpu_map.attr,
-	NULL
-};
-ATTRIBUTE_GROUPS(cache_default);
-
-#define to_object(k) container_of(k, struct cache_info, kobj)
-#define to_attr(a) container_of(a, struct cache_attr, attr)
-
-static ssize_t ia64_cache_show(struct kobject * kobj, struct attribute * attr, char * buf)
-{
-	struct cache_attr *fattr = to_attr(attr);
-	struct cache_info *this_leaf = to_object(kobj);
-	ssize_t ret;
-
-	ret = fattr->show ? fattr->show(this_leaf, buf) : 0;
-	return ret;
-}
-
-static const struct sysfs_ops cache_sysfs_ops = {
-	.show   = ia64_cache_show
-};
-
-static struct kobj_type cache_ktype = {
-	.sysfs_ops	= &cache_sysfs_ops,
-	.default_groups	= cache_default_groups,
-};
-
-static struct kobj_type cache_ktype_percpu_entry = {
-	.sysfs_ops	= &cache_sysfs_ops,
-};
-
-static void cpu_cache_sysfs_exit(unsigned int cpu)
-{
-	kfree(all_cpu_cache_info[cpu].cache_leaves);
-	all_cpu_cache_info[cpu].cache_leaves = NULL;
-	all_cpu_cache_info[cpu].num_cache_leaves = 0;
-	memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject));
-	return;
-}
-
-static int cpu_cache_sysfs_init(unsigned int cpu)
-{
-	unsigned long i, levels, unique_caches;
-	pal_cache_config_info_t cci;
-	int j;
-	long status;
-	struct cache_info *this_cache;
-	int num_cache_leaves = 0;
-
-	if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) {
-		printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status);
-		return -1;
-	}
-
-	this_cache=kcalloc(unique_caches, sizeof(struct cache_info),
-			   GFP_KERNEL);
-	if (this_cache == NULL)
-		return -ENOMEM;
-
-	for (i=0; i < levels; i++) {
-		for (j=2; j >0 ; j--) {
-			if ((status=ia64_pal_cache_config_info(i,j, &cci)) !=
-					PAL_STATUS_SUCCESS)
-				continue;
-
-			this_cache[num_cache_leaves].cci = cci;
-			this_cache[num_cache_leaves].level = i + 1;
-			this_cache[num_cache_leaves].type = j;
-
-			cache_shared_cpu_map_setup(cpu,
-					&this_cache[num_cache_leaves]);
-			num_cache_leaves ++;
-		}
-	}
-
-	all_cpu_cache_info[cpu].cache_leaves = this_cache;
-	all_cpu_cache_info[cpu].num_cache_leaves = num_cache_leaves;
-
-	memset(&all_cpu_cache_info[cpu].kobj, 0, sizeof(struct kobject));
-
-	return 0;
-}
-
-/* Add cache interface for CPU device */
-static int cache_add_dev(unsigned int cpu)
-{
-	struct device *sys_dev = get_cpu_device(cpu);
-	unsigned long i, j;
-	struct cache_info *this_object;
-	int retval = 0;
-
-	if (all_cpu_cache_info[cpu].kobj.parent)
-		return 0;
-
-
-	retval = cpu_cache_sysfs_init(cpu);
-	if (unlikely(retval < 0))
-		return retval;
-
-	retval = kobject_init_and_add(&all_cpu_cache_info[cpu].kobj,
-				      &cache_ktype_percpu_entry, &sys_dev->kobj,
-				      "%s", "cache");
-	if (unlikely(retval < 0)) {
-		cpu_cache_sysfs_exit(cpu);
-		return retval;
-	}
-
-	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) {
-		this_object = LEAF_KOBJECT_PTR(cpu,i);
-		retval = kobject_init_and_add(&(this_object->kobj),
-					      &cache_ktype,
-					      &all_cpu_cache_info[cpu].kobj,
-					      "index%1lu", i);
-		if (unlikely(retval)) {
-			for (j = 0; j < i; j++) {
-				kobject_put(&(LEAF_KOBJECT_PTR(cpu,j)->kobj));
-			}
-			kobject_put(&all_cpu_cache_info[cpu].kobj);
-			cpu_cache_sysfs_exit(cpu);
-			return retval;
-		}
-		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
-	}
-	kobject_uevent(&all_cpu_cache_info[cpu].kobj, KOBJ_ADD);
-	return retval;
-}
-
-/* Remove cache interface for CPU device */
-static int cache_remove_dev(unsigned int cpu)
-{
-	unsigned long i;
-
-	for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++)
-		kobject_put(&(LEAF_KOBJECT_PTR(cpu,i)->kobj));
-
-	if (all_cpu_cache_info[cpu].kobj.parent) {
-		kobject_put(&all_cpu_cache_info[cpu].kobj);
-		memset(&all_cpu_cache_info[cpu].kobj,
-			0,
-			sizeof(struct kobject));
-	}
-
-	cpu_cache_sysfs_exit(cpu);
-
-	return 0;
-}
-
-static int __init cache_sysfs_init(void)
-{
-	int ret;
-
-	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/topology:online",
-				cache_add_dev, cache_remove_dev);
-	WARN_ON(ret < 0);
-	return 0;
-}
-device_initcall(cache_sysfs_init);
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
deleted file mode 100644
index 53735b1d1be3..000000000000
--- a/arch/ia64/kernel/traps.c
+++ /dev/null
@@ -1,612 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific trap handling.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>		/* For unblank_screen() */
-#include <linux/export.h>
-#include <linux/extable.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/delay.h>		/* for ssleep() */
-#include <linux/kdebug.h>
-#include <linux/uaccess.h>
-
-#include <asm/fpswa.h>
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-#include <asm/exception.h>
-#include <asm/setup.h>
-
-fpswa_interface_t *fpswa_interface;
-EXPORT_SYMBOL(fpswa_interface);
-
-void __init
-trap_init (void)
-{
-	if (ia64_boot_param->fpswa)
-		/* FPSWA fixup: make the interface pointer a kernel virtual address: */
-		fpswa_interface = __va(ia64_boot_param->fpswa);
-}
-
-int
-die (const char *str, struct pt_regs *regs, long err)
-{
-	static struct {
-		spinlock_t lock;
-		u32 lock_owner;
-		int lock_owner_depth;
-	} die = {
-		.lock =	__SPIN_LOCK_UNLOCKED(die.lock),
-		.lock_owner = -1,
-		.lock_owner_depth = 0
-	};
-	static int die_counter;
-	int cpu = get_cpu();
-
-	if (die.lock_owner != cpu) {
-		console_verbose();
-		spin_lock_irq(&die.lock);
-		die.lock_owner = cpu;
-		die.lock_owner_depth = 0;
-		bust_spinlocks(1);
-	}
-	put_cpu();
-
-	if (++die.lock_owner_depth < 3) {
-		printk("%s[%d]: %s %ld [%d]\n",
-		current->comm, task_pid_nr(current), str, err, ++die_counter);
-		if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV)
-	            != NOTIFY_STOP)
-			show_regs(regs);
-		else
-			regs = NULL;
-  	} else
-		printk(KERN_ERR "Recursive die() failure, output suppressed\n");
-
-	bust_spinlocks(0);
-	die.lock_owner = -1;
-	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	spin_unlock_irq(&die.lock);
-
-	if (!regs)
-		return 1;
-
-	if (panic_on_oops)
-		panic("Fatal exception");
-
-	make_task_dead(SIGSEGV);
-	return 0;
-}
-
-int
-die_if_kernel (char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode(regs))
-		return die(str, regs, err);
-	return 0;
-}
-
-void
-__kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
-{
-	int sig, code;
-
-	switch (break_num) {
-	      case 0: /* unknown error (used by GCC for __builtin_abort()) */
-		if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP)
-			       	== NOTIFY_STOP)
-			return;
-		if (die_if_kernel("bugcheck!", regs, break_num))
-			return;
-		sig = SIGILL; code = ILL_ILLOPC;
-		break;
-
-	      case 1: /* integer divide by zero */
-		sig = SIGFPE; code = FPE_INTDIV;
-		break;
-
-	      case 2: /* integer overflow */
-		sig = SIGFPE; code = FPE_INTOVF;
-		break;
-
-	      case 3: /* range check/bounds check */
-		sig = SIGFPE; code = FPE_FLTSUB;
-		break;
-
-	      case 4: /* null pointer dereference */
-		sig = SIGSEGV; code = SEGV_MAPERR;
-		break;
-
-	      case 5: /* misaligned data */
-		sig = SIGSEGV; code = BUS_ADRALN;
-		break;
-
-	      case 6: /* decimal overflow */
-		sig = SIGFPE; code = __FPE_DECOVF;
-		break;
-
-	      case 7: /* decimal divide by zero */
-		sig = SIGFPE; code = __FPE_DECDIV;
-		break;
-
-	      case 8: /* packed decimal error */
-		sig = SIGFPE; code = __FPE_DECERR;
-		break;
-
-	      case 9: /* invalid ASCII digit */
-		sig = SIGFPE; code = __FPE_INVASC;
-		break;
-
-	      case 10: /* invalid decimal digit */
-		sig = SIGFPE; code = __FPE_INVDEC;
-		break;
-
-	      case 11: /* paragraph stack overflow */
-		sig = SIGSEGV; code = __SEGV_PSTKOVF;
-		break;
-
-	      case 0x3f000 ... 0x3ffff:	/* bundle-update in progress */
-		sig = SIGILL; code = __ILL_BNDMOD;
-		break;
-
-	      default:
-		if ((break_num < 0x40000 || break_num > 0x100000)
-		    && die_if_kernel("Bad break", regs, break_num))
-			return;
-
-		if (break_num < 0x80000) {
-			sig = SIGILL; code = __ILL_BREAK;
-		} else {
-			if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP)
-					== NOTIFY_STOP)
-				return;
-			sig = SIGTRAP; code = TRAP_BRKPT;
-		}
-	}
-	force_sig_fault(sig, code,
-			(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-			break_num, 0 /* clear __ISR_VALID */, 0);
-}
-
-/*
- * disabled_fph_fault() is called when a user-level process attempts to access f32..f127
- * and it doesn't own the fp-high register partition.  When this happens, we save the
- * current fph partition in the task_struct of the fpu-owner (if necessary) and then load
- * the fp-high partition of the current task (if necessary).  Note that the kernel has
- * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes
- * care of clearing psr.dfh.
- */
-static inline void
-disabled_fph_fault (struct pt_regs *regs)
-{
-	struct ia64_psr *psr = ia64_psr(regs);
-
-	/* first, grant user-level access to fph partition: */
-	psr->dfh = 0;
-
-	/*
-	 * Make sure that no other task gets in on this processor
-	 * while we're claiming the FPU
-	 */
-	preempt_disable();
-#ifndef CONFIG_SMP
-	{
-		struct task_struct *fpu_owner
-			= (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER);
-
-		if (ia64_is_local_fpu_owner(current)) {
-			preempt_enable_no_resched();
-			return;
-		}
-
-		if (fpu_owner)
-			ia64_flush_fph(fpu_owner);
-	}
-#endif /* !CONFIG_SMP */
-	ia64_set_local_fpu_owner(current);
-	if ((current->thread.flags & IA64_THREAD_FPH_VALID) != 0) {
-		__ia64_load_fpu(current->thread.fph);
-		psr->mfh = 0;
-	} else {
-		__ia64_init_fpu();
-		/*
-		 * Set mfh because the state in thread.fph does not match the state in
-		 * the fph partition.
-		 */
-		psr->mfh = 1;
-	}
-	preempt_enable_no_resched();
-}
-
-static inline int
-fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long *pr, long *ifs,
-	    struct pt_regs *regs)
-{
-	fp_state_t fp_state;
-	fpswa_ret_t ret;
-
-	if (!fpswa_interface)
-		return -1;
-
-	memset(&fp_state, 0, sizeof(fp_state_t));
-
-	/*
-	 * compute fp_state.  only FP registers f6 - f11 are used by the
-	 * kernel, so set those bits in the mask and set the low volatile
-	 * pointer to point to these registers.
-	 */
-	fp_state.bitmask_low64 = 0xfc0;  /* bit6..bit11 */
-
-	fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) &regs->f6;
-	/*
-	 * unsigned long (*EFI_FPSWA) (
-	 *      unsigned long    trap_type,
-	 *	void             *Bundle,
-	 *	unsigned long    *pipsr,
-	 *	unsigned long    *pfsr,
-	 *	unsigned long    *pisr,
-	 *	unsigned long    *ppreds,
-	 *	unsigned long    *pifs,
-	 *	void             *fp_state);
-	 */
-	ret = (*fpswa_interface->fpswa)((unsigned long) fp_fault, bundle,
-					(unsigned long *) ipsr, (unsigned long *) fpsr,
-					(unsigned long *) isr, (unsigned long *) pr,
-					(unsigned long *) ifs, &fp_state);
-
-	return ret.status;
-}
-
-struct fpu_swa_msg {
-	unsigned long count;
-	unsigned long time;
-};
-static DEFINE_PER_CPU(struct fpu_swa_msg, cpulast);
-DECLARE_PER_CPU(struct fpu_swa_msg, cpulast);
-static struct fpu_swa_msg last __cacheline_aligned;
-
-
-/*
- * Handle floating-point assist faults and traps.
- */
-static int
-handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
-{
-	long exception, bundle[2];
-	unsigned long fault_ip;
-
-	fault_ip = regs->cr_iip;
-	if (!fp_fault && (ia64_psr(regs)->ri == 0))
-		fault_ip -= 16;
-	if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle)))
-		return -1;
-
-	if (!(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT))  {
-		unsigned long count, current_jiffies = jiffies;
-		struct fpu_swa_msg *cp = this_cpu_ptr(&cpulast);
-
-		if (unlikely(current_jiffies > cp->time))
-			cp->count = 0;
-		if (unlikely(cp->count < 5)) {
-			cp->count++;
-			cp->time = current_jiffies + 5 * HZ;
-
-			/* minimize races by grabbing a copy of count BEFORE checking last.time. */
-			count = last.count;
-			barrier();
-
-			/*
-			 * Lower 4 bits are used as a count. Upper bits are a sequence
-			 * number that is updated when count is reset. The cmpxchg will
-			 * fail is seqno has changed. This minimizes multiple cpus
-			 * resetting the count.
-			 */
-			if (current_jiffies > last.time)
-				(void) cmpxchg_acq(&last.count, count, 16 + (count & ~15));
-
-			/* used fetchadd to atomically update the count */
-			if ((last.count & 15) < 5 && (ia64_fetchadd(1, &last.count, acq) & 15) < 5) {
-				last.time = current_jiffies + 5 * HZ;
-				printk(KERN_WARNING
-		       			"%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n",
-		       			current->comm, task_pid_nr(current), regs->cr_iip + ia64_psr(regs)->ri, isr);
-			}
-		}
-	}
-
-	exception = fp_emulate(fp_fault, bundle, &regs->cr_ipsr, &regs->ar_fpsr, &isr, &regs->pr,
-			       &regs->cr_ifs, regs);
-	if (fp_fault) {
-		if (exception == 0) {
-			/* emulation was successful */
-			ia64_increment_ip(regs);
-		} else if (exception == -1) {
-			printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
-			return -1;
-		} else {
-			/* is next instruction a trap? */
-			int si_code;
-
-			if (exception & 2) {
-				ia64_increment_ip(regs);
-			}
-			si_code = FPE_FLTUNK;	/* default code */
-			if (isr & 0x11) {
-				si_code = FPE_FLTINV;
-			} else if (isr & 0x22) {
-				/* denormal operand gets the same si_code as underflow 
-				* see arch/i386/kernel/traps.c:math_error()  */
-				si_code = FPE_FLTUND;
-			} else if (isr & 0x44) {
-				si_code = FPE_FLTDIV;
-			}
-			force_sig_fault(SIGFPE, si_code,
-					(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-					0, __ISR_VALID, isr);
-		}
-	} else {
-		if (exception == -1) {
-			printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n");
-			return -1;
-		} else if (exception != 0) {
-			/* raise exception */
-			int si_code;
-
-			si_code = FPE_FLTUNK;	/* default code */
-			if (isr & 0x880) {
-				si_code = FPE_FLTOVF;
-			} else if (isr & 0x1100) {
-				si_code = FPE_FLTUND;
-			} else if (isr & 0x2200) {
-				si_code = FPE_FLTRES;
-			}
-			force_sig_fault(SIGFPE, si_code,
-					(void __user *) (regs->cr_iip + ia64_psr(regs)->ri),
-					0, __ISR_VALID, isr);
-		}
-	}
-	return 0;
-}
-
-struct illegal_op_return {
-	unsigned long fkt, arg1, arg2, arg3;
-};
-
-struct illegal_op_return
-ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
-		       long arg4, long arg5, long arg6, long arg7,
-		       struct pt_regs regs)
-{
-	struct illegal_op_return rv;
-	char buf[128];
-
-#ifdef CONFIG_IA64_BRL_EMU
-	{
-		extern struct illegal_op_return ia64_emulate_brl (struct pt_regs *, unsigned long);
-
-		rv = ia64_emulate_brl(&regs, ec);
-		if (rv.fkt != (unsigned long) -1)
-			return rv;
-	}
-#endif
-
-	sprintf(buf, "IA-64 Illegal operation fault");
-	rv.fkt = 0;
-	if (die_if_kernel(buf, &regs, 0))
-		return rv;
-
-	force_sig_fault(SIGILL, ILL_ILLOPC,
-			(void __user *) (regs.cr_iip + ia64_psr(&regs)->ri),
-			0, 0, 0);
-	return rv;
-}
-
-void __kprobes
-ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
-	    unsigned long iim, unsigned long itir, long arg5, long arg6,
-	    long arg7, struct pt_regs regs)
-{
-	unsigned long code, error = isr, iip;
-	char buf[128];
-	int result, sig, si_code;
-	static const char *reason[] = {
-		"IA-64 Illegal Operation fault",
-		"IA-64 Privileged Operation fault",
-		"IA-64 Privileged Register fault",
-		"IA-64 Reserved Register/Field fault",
-		"Disabled Instruction Set Transition fault",
-		"Unknown fault 5", "Unknown fault 6", "Unknown fault 7", "Illegal Hazard fault",
-		"Unknown fault 9", "Unknown fault 10", "Unknown fault 11", "Unknown fault 12",
-		"Unknown fault 13", "Unknown fault 14", "Unknown fault 15"
-	};
-
-	if ((isr & IA64_ISR_NA) && ((isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) {
-		/*
-		 * This fault was due to lfetch.fault, set "ed" bit in the psr to cancel
-		 * the lfetch.
-		 */
-		ia64_psr(&regs)->ed = 1;
-		return;
-	}
-
-	iip = regs.cr_iip + ia64_psr(&regs)->ri;
-
-	switch (vector) {
-	      case 24: /* General Exception */
-		code = (isr >> 4) & 0xf;
-		sprintf(buf, "General Exception: %s%s", reason[code],
-			(code == 3) ? ((isr & (1UL << 37))
-				       ? " (RSE access)" : " (data access)") : "");
-		if (code == 8) {
-# ifdef CONFIG_IA64_PRINT_HAZARDS
-			printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n",
-			       current->comm, task_pid_nr(current),
-			       regs.cr_iip + ia64_psr(&regs)->ri, regs.pr);
-# endif
-			return;
-		}
-		break;
-
-	      case 25: /* Disabled FP-Register */
-		if (isr & 2) {
-			disabled_fph_fault(&regs);
-			return;
-		}
-		sprintf(buf, "Disabled FPL fault---not supposed to happen!");
-		break;
-
-	      case 26: /* NaT Consumption */
-		if (user_mode(&regs)) {
-			void __user *addr;
-
-			if (((isr >> 4) & 0xf) == 2) {
-				/* NaT page consumption */
-				sig = SIGSEGV;
-				code = SEGV_ACCERR;
-				addr = (void __user *) ifa;
-			} else {
-				/* register NaT consumption */
-				sig = SIGILL;
-				code = ILL_ILLOPN;
-				addr = (void __user *) (regs.cr_iip
-							+ ia64_psr(&regs)->ri);
-			}
-			force_sig_fault(sig, code, addr,
-					vector, __ISR_VALID, isr);
-			return;
-		} else if (ia64_done_with_exception(&regs))
-			return;
-		sprintf(buf, "NaT consumption");
-		break;
-
-	      case 31: /* Unsupported Data Reference */
-		if (user_mode(&regs)) {
-			force_sig_fault(SIGILL, ILL_ILLOPN, (void __user *) iip,
-					vector, __ISR_VALID, isr);
-			return;
-		}
-		sprintf(buf, "Unsupported data reference");
-		break;
-
-	      case 29: /* Debug */
-	      case 35: /* Taken Branch Trap */
-	      case 36: /* Single Step Trap */
-		if (fsys_mode(current, &regs)) {
-			extern char __kernel_syscall_via_break[];
-			/*
-			 * Got a trap in fsys-mode: Taken Branch Trap
-			 * and Single Step trap need special handling;
-			 * Debug trap is ignored (we disable it here
-			 * and re-enable it in the lower-privilege trap).
-			 */
-			if (unlikely(vector == 29)) {
-				set_thread_flag(TIF_DB_DISABLED);
-				ia64_psr(&regs)->db = 0;
-				ia64_psr(&regs)->lp = 1;
-				return;
-			}
-			/* re-do the system call via break 0x100000: */
-			regs.cr_iip = (unsigned long) __kernel_syscall_via_break;
-			ia64_psr(&regs)->ri = 0;
-			ia64_psr(&regs)->cpl = 3;
-			return;
-		}
-		switch (vector) {
-		      default:
-		      case 29:
-			si_code = TRAP_HWBKPT;
-#ifdef CONFIG_ITANIUM
-			/*
-			 * Erratum 10 (IFA may contain incorrect address) now has
-			 * "NoFix" status.  There are no plans for fixing this.
-			 */
-			if (ia64_psr(&regs)->is == 0)
-			  ifa = regs.cr_iip;
-#endif
-			break;
-		      case 35: si_code = TRAP_BRANCH; ifa = 0; break;
-		      case 36: si_code = TRAP_TRACE; ifa = 0; break;
-		}
-		if (notify_die(DIE_FAULT, "ia64_fault", &regs, vector, si_code, SIGTRAP)
-			       	== NOTIFY_STOP)
-			return;
-		force_sig_fault(SIGTRAP, si_code, (void __user *) ifa,
-				0, __ISR_VALID, isr);
-		return;
-
-	      case 32: /* fp fault */
-	      case 33: /* fp trap */
-		result = handle_fpu_swa((vector == 32) ? 1 : 0, &regs, isr);
-		if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) {
-			force_sig_fault(SIGFPE, FPE_FLTINV, (void __user *) iip,
-					0, __ISR_VALID, isr);
-		}
-		return;
-
-	      case 34:
-		if (isr & 0x2) {
-			/* Lower-Privilege Transfer Trap */
-
-			/* If we disabled debug traps during an fsyscall,
-			 * re-enable them here.
-			 */
-			if (test_thread_flag(TIF_DB_DISABLED)) {
-				clear_thread_flag(TIF_DB_DISABLED);
-				ia64_psr(&regs)->db = 1;
-			}
-
-			/*
-			 * Just clear PSR.lp and then return immediately:
-			 * all the interesting work (e.g., signal delivery)
-			 * is done in the kernel exit path.
-			 */
-			ia64_psr(&regs)->lp = 0;
-			return;
-		} else {
-			/* Unimplemented Instr. Address Trap */
-			if (user_mode(&regs)) {
-				force_sig_fault(SIGILL, ILL_BADIADDR,
-						(void __user *) iip,
-						0, 0, 0);
-				return;
-			}
-			sprintf(buf, "Unimplemented Instruction Address fault");
-		}
-		break;
-
-	      case 45:
-		printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n");
-		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n",
-		       iip, ifa, isr);
-		force_sig(SIGSEGV);
-		return;
-
-	      case 46:
-		printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n");
-		printk(KERN_ERR "  iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n",
-		       iip, ifa, isr, iim);
-		force_sig(SIGSEGV);
-		return;
-
-	      case 47:
-		sprintf(buf, "IA-32 Interruption Fault (int 0x%lx)", isr >> 16);
-		break;
-
-	      default:
-		sprintf(buf, "Fault %lu", vector);
-		break;
-	}
-	if (!die_if_kernel(buf, &regs, error))
-		force_sig(SIGILL);
-}
diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
deleted file mode 100644
index 0acb5a0cd7ab..000000000000
--- a/arch/ia64/kernel/unaligned.c
+++ /dev/null
@@ -1,1560 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Architecture-specific unaligned trap handling.
- *
- * Copyright (C) 1999-2002, 2004 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 2002/12/09   Fix rotating register handling (off-by-1 error, missing fr-rotation).  Fix
- *		get_rse_reg() to not leak kernel bits to user-level (reading an out-of-frame
- *		stacked register returns an undefined value; it does NOT trigger a
- *		"rsvd register fault").
- * 2001/10/11	Fix unaligned access to rotating registers in s/w pipelined loops.
- * 2001/08/13	Correct size of extended floats (float_fsz) from 16 to 10 bytes.
- * 2001/01/17	Add support emulation of unaligned kernel accesses.
- */
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/sched/signal.h>
-#include <linux/tty.h>
-#include <linux/extable.h>
-#include <linux/ratelimit.h>
-#include <linux/uaccess.h>
-
-#include <asm/intrinsics.h>
-#include <asm/processor.h>
-#include <asm/rse.h>
-#include <asm/exception.h>
-#include <asm/unaligned.h>
-
-extern int die_if_kernel(char *str, struct pt_regs *regs, long err);
-
-#undef DEBUG_UNALIGNED_TRAP
-
-#ifdef DEBUG_UNALIGNED_TRAP
-# define DPRINT(a...)	do { printk("%s %u: ", __func__, __LINE__); printk (a); } while (0)
-# define DDUMP(str,vp,len)	dump(str, vp, len)
-
-static void
-dump (const char *str, void *vp, size_t len)
-{
-	unsigned char *cp = vp;
-	int i;
-
-	printk("%s", str);
-	for (i = 0; i < len; ++i)
-		printk (" %02x", *cp++);
-	printk("\n");
-}
-#else
-# define DPRINT(a...)
-# define DDUMP(str,vp,len)
-#endif
-
-#define IA64_FIRST_STACKED_GR	32
-#define IA64_FIRST_ROTATING_FR	32
-#define SIGN_EXT9		0xffffffffffffff00ul
-
-/*
- *  sysctl settable hook which tells the kernel whether to honor the
- *  IA64_THREAD_UAC_NOPRINT prctl.  Because this is user settable, we want
- *  to allow the super user to enable/disable this for security reasons
- *  (i.e. don't allow attacker to fill up logs with unaligned accesses).
- */
-int no_unaligned_warning;
-int unaligned_dump_stack;
-
-/*
- * For M-unit:
- *
- *  opcode |   m  |   x6    |
- * --------|------|---------|
- * [40-37] | [36] | [35:30] |
- * --------|------|---------|
- *     4   |   1  |    6    | = 11 bits
- * --------------------------
- * However bits [31:30] are not directly useful to distinguish between
- * load/store so we can use [35:32] instead, which gives the following
- * mask ([40:32]) using 9 bits. The 'e' comes from the fact that we defer
- * checking the m-bit until later in the load/store emulation.
- */
-#define IA64_OPCODE_MASK	0x1ef
-#define IA64_OPCODE_SHIFT	32
-
-/*
- * Table C-28 Integer Load/Store
- *
- * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF
- *
- * ld8.fill, st8.fill  MUST be aligned because the RNATs are based on
- * the address (bits [8:3]), so we must failed.
- */
-#define LD_OP            0x080
-#define LDS_OP           0x081
-#define LDA_OP           0x082
-#define LDSA_OP          0x083
-#define LDBIAS_OP        0x084
-#define LDACQ_OP         0x085
-/* 0x086, 0x087 are not relevant */
-#define LDCCLR_OP        0x088
-#define LDCNC_OP         0x089
-#define LDCCLRACQ_OP     0x08a
-#define ST_OP            0x08c
-#define STREL_OP         0x08d
-/* 0x08e,0x8f are not relevant */
-
-/*
- * Table C-29 Integer Load +Reg
- *
- * we use the ld->m (bit [36:36]) field to determine whether or not we have
- * a load/store of this form.
- */
-
-/*
- * Table C-30 Integer Load/Store +Imm
- *
- * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF
- *
- * ld8.fill, st8.fill  must be aligned because the Nat register are based on
- * the address, so we must fail and the program must be fixed.
- */
-#define LD_IMM_OP            0x0a0
-#define LDS_IMM_OP           0x0a1
-#define LDA_IMM_OP           0x0a2
-#define LDSA_IMM_OP          0x0a3
-#define LDBIAS_IMM_OP        0x0a4
-#define LDACQ_IMM_OP         0x0a5
-/* 0x0a6, 0xa7 are not relevant */
-#define LDCCLR_IMM_OP        0x0a8
-#define LDCNC_IMM_OP         0x0a9
-#define LDCCLRACQ_IMM_OP     0x0aa
-#define ST_IMM_OP            0x0ac
-#define STREL_IMM_OP         0x0ad
-/* 0x0ae,0xaf are not relevant */
-
-/*
- * Table C-32 Floating-point Load/Store
- */
-#define LDF_OP           0x0c0
-#define LDFS_OP          0x0c1
-#define LDFA_OP          0x0c2
-#define LDFSA_OP         0x0c3
-/* 0x0c6 is irrelevant */
-#define LDFCCLR_OP       0x0c8
-#define LDFCNC_OP        0x0c9
-/* 0x0cb is irrelevant  */
-#define STF_OP           0x0cc
-
-/*
- * Table C-33 Floating-point Load +Reg
- *
- * we use the ld->m (bit [36:36]) field to determine whether or not we have
- * a load/store of this form.
- */
-
-/*
- * Table C-34 Floating-point Load/Store +Imm
- */
-#define LDF_IMM_OP       0x0e0
-#define LDFS_IMM_OP      0x0e1
-#define LDFA_IMM_OP      0x0e2
-#define LDFSA_IMM_OP     0x0e3
-/* 0x0e6 is irrelevant */
-#define LDFCCLR_IMM_OP   0x0e8
-#define LDFCNC_IMM_OP    0x0e9
-#define STF_IMM_OP       0x0ec
-
-typedef struct {
-	unsigned long	 qp:6;	/* [0:5]   */
-	unsigned long    r1:7;	/* [6:12]  */
-	unsigned long   imm:7;	/* [13:19] */
-	unsigned long    r3:7;	/* [20:26] */
-	unsigned long     x:1;  /* [27:27] */
-	unsigned long  hint:2;	/* [28:29] */
-	unsigned long x6_sz:2;	/* [30:31] */
-	unsigned long x6_op:4;	/* [32:35], x6 = x6_sz|x6_op */
-	unsigned long     m:1;	/* [36:36] */
-	unsigned long    op:4;	/* [37:40] */
-	unsigned long   pad:23; /* [41:63] */
-} load_store_t;
-
-
-typedef enum {
-	UPD_IMMEDIATE,	/* ldXZ r1=[r3],imm(9) */
-	UPD_REG		/* ldXZ r1=[r3],r2     */
-} update_t;
-
-/*
- * We use tables to keep track of the offsets of registers in the saved state.
- * This way we save having big switch/case statements.
- *
- * We use bit 0 to indicate switch_stack or pt_regs.
- * The offset is simply shifted by 1 bit.
- * A 2-byte value should be enough to hold any kind of offset
- *
- * In case the calling convention changes (and thus pt_regs/switch_stack)
- * simply use RSW instead of RPT or vice-versa.
- */
-
-#define RPO(x)	((size_t) &((struct pt_regs *)0)->x)
-#define RSO(x)	((size_t) &((struct switch_stack *)0)->x)
-
-#define RPT(x)		(RPO(x) << 1)
-#define RSW(x)		(1| RSO(x)<<1)
-
-#define GR_OFFS(x)	(gr_info[x]>>1)
-#define GR_IN_SW(x)	(gr_info[x] & 0x1)
-
-#define FR_OFFS(x)	(fr_info[x]>>1)
-#define FR_IN_SW(x)	(fr_info[x] & 0x1)
-
-static u16 gr_info[32]={
-	0,			/* r0 is read-only : WE SHOULD NEVER GET THIS */
-
-	RPT(r1), RPT(r2), RPT(r3),
-
-	RSW(r4), RSW(r5), RSW(r6), RSW(r7),
-
-	RPT(r8), RPT(r9), RPT(r10), RPT(r11),
-	RPT(r12), RPT(r13), RPT(r14), RPT(r15),
-
-	RPT(r16), RPT(r17), RPT(r18), RPT(r19),
-	RPT(r20), RPT(r21), RPT(r22), RPT(r23),
-	RPT(r24), RPT(r25), RPT(r26), RPT(r27),
-	RPT(r28), RPT(r29), RPT(r30), RPT(r31)
-};
-
-static u16 fr_info[32]={
-	0,			/* constant : WE SHOULD NEVER GET THIS */
-	0,			/* constant : WE SHOULD NEVER GET THIS */
-
-	RSW(f2), RSW(f3), RSW(f4), RSW(f5),
-
-	RPT(f6), RPT(f7), RPT(f8), RPT(f9),
-	RPT(f10), RPT(f11),
-
-	RSW(f12), RSW(f13), RSW(f14),
-	RSW(f15), RSW(f16), RSW(f17), RSW(f18), RSW(f19),
-	RSW(f20), RSW(f21), RSW(f22), RSW(f23), RSW(f24),
-	RSW(f25), RSW(f26), RSW(f27), RSW(f28), RSW(f29),
-	RSW(f30), RSW(f31)
-};
-
-/* Invalidate ALAT entry for integer register REGNO.  */
-static void
-invala_gr (int regno)
-{
-#	define F(reg)	case reg: ia64_invala_gr(reg); break
-
-	switch (regno) {
-		F(  0); F(  1); F(  2); F(  3); F(  4); F(  5); F(  6); F(  7);
-		F(  8); F(  9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15);
-		F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23);
-		F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31);
-		F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39);
-		F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47);
-		F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55);
-		F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63);
-		F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71);
-		F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79);
-		F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87);
-		F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95);
-		F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103);
-		F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111);
-		F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119);
-		F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127);
-	}
-#	undef F
-}
-
-/* Invalidate ALAT entry for floating-point register REGNO.  */
-static void
-invala_fr (int regno)
-{
-#	define F(reg)	case reg: ia64_invala_fr(reg); break
-
-	switch (regno) {
-		F(  0); F(  1); F(  2); F(  3); F(  4); F(  5); F(  6); F(  7);
-		F(  8); F(  9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15);
-		F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23);
-		F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31);
-		F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39);
-		F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47);
-		F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55);
-		F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63);
-		F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71);
-		F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79);
-		F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87);
-		F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95);
-		F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103);
-		F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111);
-		F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119);
-		F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127);
-	}
-#	undef F
-}
-
-static inline unsigned long
-rotate_reg (unsigned long sor, unsigned long rrb, unsigned long reg)
-{
-	reg += rrb;
-	if (reg >= sor)
-		reg -= sor;
-	return reg;
-}
-
-static void
-set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, int nat)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long *bsp, *bspstore, *addr, *rnat_addr, *ubs_end;
-	unsigned long *kbs = (void *) current + IA64_RBS_OFFSET;
-	unsigned long rnats, nat_mask;
-	unsigned long on_kbs;
-	long sof = (regs->cr_ifs) & 0x7f;
-	long sor = 8 * ((regs->cr_ifs >> 14) & 0xf);
-	long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
-	long ridx = r1 - 32;
-
-	if (ridx >= sof) {
-		/* this should never happen, as the "rsvd register fault" has higher priority */
-		DPRINT("ignoring write to r%lu; only %lu registers are allocated!\n", r1, sof);
-		return;
-	}
-
-	if (ridx < sor)
-		ridx = rotate_reg(sor, rrb_gr, ridx);
-
-	DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n",
-	       r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx);
-
-	on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore);
-	addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx);
-	if (addr >= kbs) {
-		/* the register is on the kernel backing store: easy... */
-		rnat_addr = ia64_rse_rnat_addr(addr);
-		if ((unsigned long) rnat_addr >= sw->ar_bspstore)
-			rnat_addr = &sw->ar_rnat;
-		nat_mask = 1UL << ia64_rse_slot_num(addr);
-
-		*addr = val;
-		if (nat)
-			*rnat_addr |=  nat_mask;
-		else
-			*rnat_addr &= ~nat_mask;
-		return;
-	}
-
-	if (!user_stack(current, regs)) {
-		DPRINT("ignoring kernel write to r%lu; register isn't on the kernel RBS!", r1);
-		return;
-	}
-
-	bspstore = (unsigned long *)regs->ar_bspstore;
-	ubs_end = ia64_rse_skip_regs(bspstore, on_kbs);
-	bsp     = ia64_rse_skip_regs(ubs_end, -sof);
-	addr    = ia64_rse_skip_regs(bsp, ridx);
-
-	DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr);
-
-	ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val);
-
-	rnat_addr = ia64_rse_rnat_addr(addr);
-
-	ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats);
-	DPRINT("rnat @%p = 0x%lx nat=%d old nat=%ld\n",
-	       (void *) rnat_addr, rnats, nat, (rnats >> ia64_rse_slot_num(addr)) & 1);
-
-	nat_mask = 1UL << ia64_rse_slot_num(addr);
-	if (nat)
-		rnats |=  nat_mask;
-	else
-		rnats &= ~nat_mask;
-	ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, rnats);
-
-	DPRINT("rnat changed to @%p = 0x%lx\n", (void *) rnat_addr, rnats);
-}
-
-
-static void
-get_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long *val, int *nat)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long *bsp, *addr, *rnat_addr, *ubs_end, *bspstore;
-	unsigned long *kbs = (void *) current + IA64_RBS_OFFSET;
-	unsigned long rnats, nat_mask;
-	unsigned long on_kbs;
-	long sof = (regs->cr_ifs) & 0x7f;
-	long sor = 8 * ((regs->cr_ifs >> 14) & 0xf);
-	long rrb_gr = (regs->cr_ifs >> 18) & 0x7f;
-	long ridx = r1 - 32;
-
-	if (ridx >= sof) {
-		/* read of out-of-frame register returns an undefined value; 0 in our case.  */
-		DPRINT("ignoring read from r%lu; only %lu registers are allocated!\n", r1, sof);
-		goto fail;
-	}
-
-	if (ridx < sor)
-		ridx = rotate_reg(sor, rrb_gr, ridx);
-
-	DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n",
-	       r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx);
-
-	on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore);
-	addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx);
-	if (addr >= kbs) {
-		/* the register is on the kernel backing store: easy... */
-		*val = *addr;
-		if (nat) {
-			rnat_addr = ia64_rse_rnat_addr(addr);
-			if ((unsigned long) rnat_addr >= sw->ar_bspstore)
-				rnat_addr = &sw->ar_rnat;
-			nat_mask = 1UL << ia64_rse_slot_num(addr);
-			*nat = (*rnat_addr & nat_mask) != 0;
-		}
-		return;
-	}
-
-	if (!user_stack(current, regs)) {
-		DPRINT("ignoring kernel read of r%lu; register isn't on the RBS!", r1);
-		goto fail;
-	}
-
-	bspstore = (unsigned long *)regs->ar_bspstore;
-	ubs_end = ia64_rse_skip_regs(bspstore, on_kbs);
-	bsp     = ia64_rse_skip_regs(ubs_end, -sof);
-	addr    = ia64_rse_skip_regs(bsp, ridx);
-
-	DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr);
-
-	ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val);
-
-	if (nat) {
-		rnat_addr = ia64_rse_rnat_addr(addr);
-		nat_mask = 1UL << ia64_rse_slot_num(addr);
-
-		DPRINT("rnat @%p = 0x%lx\n", (void *) rnat_addr, rnats);
-
-		ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats);
-		*nat = (rnats & nat_mask) != 0;
-	}
-	return;
-
-  fail:
-	*val = 0;
-	if (nat)
-		*nat = 0;
-	return;
-}
-
-
-static void
-setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long addr;
-	unsigned long bitmask;
-	unsigned long *unat;
-
-	/*
-	 * First takes care of stacked registers
-	 */
-	if (regnum >= IA64_FIRST_STACKED_GR) {
-		set_rse_reg(regs, regnum, val, nat);
-		return;
-	}
-
-	/*
-	 * Using r0 as a target raises a General Exception fault which has higher priority
-	 * than the Unaligned Reference fault.
-	 */
-
-	/*
-	 * Now look at registers in [0-31] range and init correct UNAT
-	 */
-	if (GR_IN_SW(regnum)) {
-		addr = (unsigned long)sw;
-		unat = &sw->ar_unat;
-	} else {
-		addr = (unsigned long)regs;
-		unat = &sw->caller_unat;
-	}
-	DPRINT("tmp_base=%lx switch_stack=%s offset=%d\n",
-	       addr, unat==&sw->ar_unat ? "yes":"no", GR_OFFS(regnum));
-	/*
-	 * add offset from base of struct
-	 * and do it !
-	 */
-	addr += GR_OFFS(regnum);
-
-	*(unsigned long *)addr = val;
-
-	/*
-	 * We need to clear the corresponding UNAT bit to fully emulate the load
-	 * UNAT bit_pos = GR[r3]{8:3} form EAS-2.4
-	 */
-	bitmask   = 1UL << (addr >> 3 & 0x3f);
-	DPRINT("*0x%lx=0x%lx NaT=%d prev_unat @%p=%lx\n", addr, val, nat, (void *) unat, *unat);
-	if (nat) {
-		*unat |= bitmask;
-	} else {
-		*unat &= ~bitmask;
-	}
-	DPRINT("*0x%lx=0x%lx NaT=%d new unat: %p=%lx\n", addr, val, nat, (void *) unat,*unat);
-}
-
-/*
- * Return the (rotated) index for floating point register REGNUM (REGNUM must be in the
- * range from 32-127, result is in the range from 0-95.
- */
-static inline unsigned long
-fph_index (struct pt_regs *regs, long regnum)
-{
-	unsigned long rrb_fr = (regs->cr_ifs >> 25) & 0x7f;
-	return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR));
-}
-
-static void
-setfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *)regs - 1;
-	unsigned long addr;
-
-	/*
-	 * From EAS-2.5: FPDisableFault has higher priority than Unaligned
-	 * Fault. Thus, when we get here, we know the partition is enabled.
-	 * To update f32-f127, there are three choices:
-	 *
-	 *	(1) save f32-f127 to thread.fph and update the values there
-	 *	(2) use a gigantic switch statement to directly access the registers
-	 *	(3) generate code on the fly to update the desired register
-	 *
-	 * For now, we are using approach (1).
-	 */
-	if (regnum >= IA64_FIRST_ROTATING_FR) {
-		ia64_sync_fph(current);
-		current->thread.fph[fph_index(regs, regnum)] = *fpval;
-	} else {
-		/*
-		 * pt_regs or switch_stack ?
-		 */
-		if (FR_IN_SW(regnum)) {
-			addr = (unsigned long)sw;
-		} else {
-			addr = (unsigned long)regs;
-		}
-
-		DPRINT("tmp_base=%lx offset=%d\n", addr, FR_OFFS(regnum));
-
-		addr += FR_OFFS(regnum);
-		*(struct ia64_fpreg *)addr = *fpval;
-
-		/*
-		 * mark the low partition as being used now
-		 *
-		 * It is highly unlikely that this bit is not already set, but
-		 * let's do it for safety.
-		 */
-		regs->cr_ipsr |= IA64_PSR_MFL;
-	}
-}
-
-/*
- * Those 2 inline functions generate the spilled versions of the constant floating point
- * registers which can be used with stfX
- */
-static inline void
-float_spill_f0 (struct ia64_fpreg *final)
-{
-	ia64_stf_spill(final, 0);
-}
-
-static inline void
-float_spill_f1 (struct ia64_fpreg *final)
-{
-	ia64_stf_spill(final, 1);
-}
-
-static void
-getfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long addr;
-
-	/*
-	 * From EAS-2.5: FPDisableFault has higher priority than
-	 * Unaligned Fault. Thus, when we get here, we know the partition is
-	 * enabled.
-	 *
-	 * When regnum > 31, the register is still live and we need to force a save
-	 * to current->thread.fph to get access to it.  See discussion in setfpreg()
-	 * for reasons and other ways of doing this.
-	 */
-	if (regnum >= IA64_FIRST_ROTATING_FR) {
-		ia64_flush_fph(current);
-		*fpval = current->thread.fph[fph_index(regs, regnum)];
-	} else {
-		/*
-		 * f0 = 0.0, f1= 1.0. Those registers are constant and are thus
-		 * not saved, we must generate their spilled form on the fly
-		 */
-		switch(regnum) {
-		case 0:
-			float_spill_f0(fpval);
-			break;
-		case 1:
-			float_spill_f1(fpval);
-			break;
-		default:
-			/*
-			 * pt_regs or switch_stack ?
-			 */
-			addr =  FR_IN_SW(regnum) ? (unsigned long)sw
-						 : (unsigned long)regs;
-
-			DPRINT("is_sw=%d tmp_base=%lx offset=0x%x\n",
-			       FR_IN_SW(regnum), addr, FR_OFFS(regnum));
-
-			addr  += FR_OFFS(regnum);
-			*fpval = *(struct ia64_fpreg *)addr;
-		}
-	}
-}
-
-
-static void
-getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs)
-{
-	struct switch_stack *sw = (struct switch_stack *) regs - 1;
-	unsigned long addr, *unat;
-
-	if (regnum >= IA64_FIRST_STACKED_GR) {
-		get_rse_reg(regs, regnum, val, nat);
-		return;
-	}
-
-	/*
-	 * take care of r0 (read-only always evaluate to 0)
-	 */
-	if (regnum == 0) {
-		*val = 0;
-		if (nat)
-			*nat = 0;
-		return;
-	}
-
-	/*
-	 * Now look at registers in [0-31] range and init correct UNAT
-	 */
-	if (GR_IN_SW(regnum)) {
-		addr = (unsigned long)sw;
-		unat = &sw->ar_unat;
-	} else {
-		addr = (unsigned long)regs;
-		unat = &sw->caller_unat;
-	}
-
-	DPRINT("addr_base=%lx offset=0x%x\n", addr,  GR_OFFS(regnum));
-
-	addr += GR_OFFS(regnum);
-
-	*val  = *(unsigned long *)addr;
-
-	/*
-	 * do it only when requested
-	 */
-	if (nat)
-		*nat  = (*unat >> (addr >> 3 & 0x3f)) & 0x1UL;
-}
-
-static void
-emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsigned long ifa)
-{
-	/*
-	 * IMPORTANT:
-	 * Given the way we handle unaligned speculative loads, we should
-	 * not get to this point in the code but we keep this sanity check,
-	 * just in case.
-	 */
-	if (ld.x6_op == 1 || ld.x6_op == 3) {
-		printk(KERN_ERR "%s: register update on speculative load, error\n", __func__);
-		if (die_if_kernel("unaligned reference on speculative load with register update\n",
-				  regs, 30))
-			return;
-	}
-
-
-	/*
-	 * at this point, we know that the base register to update is valid i.e.,
-	 * it's not r0
-	 */
-	if (type == UPD_IMMEDIATE) {
-		unsigned long imm;
-
-		/*
-		 * Load +Imm: ldXZ r1=[r3],imm(9)
-		 *
-		 *
-		 * form imm9: [13:19] contain the first 7 bits
-		 */
-		imm = ld.x << 7 | ld.imm;
-
-		/*
-		 * sign extend (1+8bits) if m set
-		 */
-		if (ld.m) imm |= SIGN_EXT9;
-
-		/*
-		 * ifa == r3 and we know that the NaT bit on r3 was clear so
-		 * we can directly use ifa.
-		 */
-		ifa += imm;
-
-		setreg(ld.r3, ifa, 0, regs);
-
-		DPRINT("ld.x=%d ld.m=%d imm=%ld r3=0x%lx\n", ld.x, ld.m, imm, ifa);
-
-	} else if (ld.m) {
-		unsigned long r2;
-		int nat_r2;
-
-		/*
-		 * Load +Reg Opcode: ldXZ r1=[r3],r2
-		 *
-		 * Note: that we update r3 even in the case of ldfX.a
-		 * (where the load does not happen)
-		 *
-		 * The way the load algorithm works, we know that r3 does not
-		 * have its NaT bit set (would have gotten NaT consumption
-		 * before getting the unaligned fault). So we can use ifa
-		 * which equals r3 at this point.
-		 *
-		 * IMPORTANT:
-		 * The above statement holds ONLY because we know that we
-		 * never reach this code when trying to do a ldX.s.
-		 * If we ever make it to here on an ldfX.s then
-		 */
-		getreg(ld.imm, &r2, &nat_r2, regs);
-
-		ifa += r2;
-
-		/*
-		 * propagate Nat r2 -> r3
-		 */
-		setreg(ld.r3, ifa, nat_r2, regs);
-
-		DPRINT("imm=%d r2=%ld r3=0x%lx nat_r2=%d\n",ld.imm, r2, ifa, nat_r2);
-	}
-}
-
-static int emulate_store(unsigned long ifa, void *val, int len, bool kernel_mode)
-{
-	if (kernel_mode)
-		return copy_to_kernel_nofault((void *)ifa, val, len);
-
-	return copy_to_user((void __user *)ifa, val, len);
-}
-
-static int emulate_load(void *val, unsigned long ifa, int len, bool kernel_mode)
-{
-	if (kernel_mode)
-	       return copy_from_kernel_nofault(val, (void *)ifa, len);
-
-	return copy_from_user(val, (void __user *)ifa, len);
-}
-
-static int
-emulate_load_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-		  bool kernel_mode)
-{
-	unsigned int len = 1 << ld.x6_sz;
-	unsigned long val = 0;
-
-	/*
-	 * r0, as target, doesn't need to be checked because Illegal Instruction
-	 * faults have higher priority than unaligned faults.
-	 *
-	 * r0 cannot be found as the base as it would never generate an
-	 * unaligned reference.
-	 */
-
-	/*
-	 * ldX.a we will emulate load and also invalidate the ALAT entry.
-	 * See comment below for explanation on how we handle ldX.a
-	 */
-
-	if (len != 2 && len != 4 && len != 8) {
-		DPRINT("unknown size: x6=%d\n", ld.x6_sz);
-		return -1;
-	}
-	/* this assumes little-endian byte-order: */
-	if (emulate_load(&val, ifa, len, kernel_mode))
-		return -1;
-	setreg(ld.r1, val, 0, regs);
-
-	/*
-	 * check for updates on any kind of loads
-	 */
-	if (ld.op == 0x5 || ld.m)
-		emulate_load_updates(ld.op == 0x5 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa);
-
-	/*
-	 * handling of various loads (based on EAS2.4):
-	 *
-	 * ldX.acq (ordered load):
-	 *	- acquire semantics would have been used, so force fence instead.
-	 *
-	 * ldX.c.clr (check load and clear):
-	 *	- if we get to this handler, it's because the entry was not in the ALAT.
-	 *	  Therefore the operation reverts to a normal load
-	 *
-	 * ldX.c.nc (check load no clear):
-	 *	- same as previous one
-	 *
-	 * ldX.c.clr.acq (ordered check load and clear):
-	 *	- same as above for c.clr part. The load needs to have acquire semantics. So
-	 *	  we use the fence semantics which is stronger and thus ensures correctness.
-	 *
-	 * ldX.a (advanced load):
-	 *	- suppose ldX.a r1=[r3]. If we get to the unaligned trap it's because the
-	 *	  address doesn't match requested size alignment. This means that we would
-	 *	  possibly need more than one load to get the result.
-	 *
-	 *	  The load part can be handled just like a normal load, however the difficult
-	 *	  part is to get the right thing into the ALAT. The critical piece of information
-	 *	  in the base address of the load & size. To do that, a ld.a must be executed,
-	 *	  clearly any address can be pushed into the table by using ld1.a r1=[r3]. Now
-	 *	  if we use the same target register, we will be okay for the check.a instruction.
-	 *	  If we look at the store, basically a stX [r3]=r1 checks the ALAT  for any entry
-	 *	  which would overlap within [r3,r3+X] (the size of the load was store in the
-	 *	  ALAT). If such an entry is found the entry is invalidated. But this is not good
-	 *	  enough, take the following example:
-	 *		r3=3
-	 *		ld4.a r1=[r3]
-	 *
-	 *	  Could be emulated by doing:
-	 *		ld1.a r1=[r3],1
-	 *		store to temporary;
-	 *		ld1.a r1=[r3],1
-	 *		store & shift to temporary;
-	 *		ld1.a r1=[r3],1
-	 *		store & shift to temporary;
-	 *		ld1.a r1=[r3]
-	 *		store & shift to temporary;
-	 *		r1=temporary
-	 *
-	 *	  So in this case, you would get the right value is r1 but the wrong info in
-	 *	  the ALAT.  Notice that you could do it in reverse to finish with address 3
-	 *	  but you would still get the size wrong.  To get the size right, one needs to
-	 *	  execute exactly the same kind of load. You could do it from a aligned
-	 *	  temporary location, but you would get the address wrong.
-	 *
-	 *	  So no matter what, it is not possible to emulate an advanced load
-	 *	  correctly. But is that really critical ?
-	 *
-	 *	  We will always convert ld.a into a normal load with ALAT invalidated.  This
-	 *	  will enable compiler to do optimization where certain code path after ld.a
-	 *	  is not required to have ld.c/chk.a, e.g., code path with no intervening stores.
-	 *
-	 *	  If there is a store after the advanced load, one must either do a ld.c.* or
-	 *	  chk.a.* to reuse the value stored in the ALAT. Both can "fail" (meaning no
-	 *	  entry found in ALAT), and that's perfectly ok because:
-	 *
-	 *		- ld.c.*, if the entry is not present a  normal load is executed
-	 *		- chk.a.*, if the entry is not present, execution jumps to recovery code
-	 *
-	 *	  In either case, the load can be potentially retried in another form.
-	 *
-	 *	  ALAT must be invalidated for the register (so that chk.a or ld.c don't pick
-	 *	  up a stale entry later). The register base update MUST also be performed.
-	 */
-
-	/*
-	 * when the load has the .acq completer then
-	 * use ordering fence.
-	 */
-	if (ld.x6_op == 0x5 || ld.x6_op == 0xa)
-		mb();
-
-	/*
-	 * invalidate ALAT entry in case of advanced load
-	 */
-	if (ld.x6_op == 0x2)
-		invala_gr(ld.r1);
-
-	return 0;
-}
-
-static int
-emulate_store_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-		   bool kernel_mode)
-{
-	unsigned long r2;
-	unsigned int len = 1 << ld.x6_sz;
-
-	/*
-	 * if we get to this handler, Nat bits on both r3 and r2 have already
-	 * been checked. so we don't need to do it
-	 *
-	 * extract the value to be stored
-	 */
-	getreg(ld.imm, &r2, NULL, regs);
-
-	/*
-	 * we rely on the macros in unaligned.h for now i.e.,
-	 * we let the compiler figure out how to read memory gracefully.
-	 *
-	 * We need this switch/case because the way the inline function
-	 * works. The code is optimized by the compiler and looks like
-	 * a single switch/case.
-	 */
-	DPRINT("st%d [%lx]=%lx\n", len, ifa, r2);
-
-	if (len != 2 && len != 4 && len != 8) {
-		DPRINT("unknown size: x6=%d\n", ld.x6_sz);
-		return -1;
-	}
-
-	/* this assumes little-endian byte-order: */
-	if (emulate_store(ifa, &r2, len, kernel_mode))
-		return -1;
-
-	/*
-	 * stX [r3]=r2,imm(9)
-	 *
-	 * NOTE:
-	 * ld.r3 can never be r0, because r0 would not generate an
-	 * unaligned access.
-	 */
-	if (ld.op == 0x5) {
-		unsigned long imm;
-
-		/*
-		 * form imm9: [12:6] contain first 7bits
-		 */
-		imm = ld.x << 7 | ld.r1;
-		/*
-		 * sign extend (8bits) if m set
-		 */
-		if (ld.m) imm |= SIGN_EXT9;
-		/*
-		 * ifa == r3 (NaT is necessarily cleared)
-		 */
-		ifa += imm;
-
-		DPRINT("imm=%lx r3=%lx\n", imm, ifa);
-
-		setreg(ld.r3, ifa, 0, regs);
-	}
-	/*
-	 * we don't have alat_invalidate_multiple() so we need
-	 * to do the complete flush :-<<
-	 */
-	ia64_invala();
-
-	/*
-	 * stX.rel: use fence instead of release
-	 */
-	if (ld.x6_op == 0xd)
-		mb();
-
-	return 0;
-}
-
-/*
- * floating point operations sizes in bytes
- */
-static const unsigned char float_fsz[4]={
-	10, /* extended precision (e) */
-	8,  /* integer (8)            */
-	4,  /* single precision (s)   */
-	8   /* double precision (d)   */
-};
-
-static inline void
-mem2float_extended (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldfe(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-mem2float_integer (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf8(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-mem2float_single (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldfs(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-mem2float_double (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldfd(6, init);
-	ia64_stop();
-	ia64_stf_spill(final, 6);
-}
-
-static inline void
-float2mem_extended (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stfe(final, 6);
-}
-
-static inline void
-float2mem_integer (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stf8(final, 6);
-}
-
-static inline void
-float2mem_single (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stfs(final, 6);
-}
-
-static inline void
-float2mem_double (struct ia64_fpreg *init, struct ia64_fpreg *final)
-{
-	ia64_ldf_fill(6, init);
-	ia64_stop();
-	ia64_stfd(final, 6);
-}
-
-static int
-emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs, bool kernel_mode)
-{
-	struct ia64_fpreg fpr_init[2];
-	struct ia64_fpreg fpr_final[2];
-	unsigned long len = float_fsz[ld.x6_sz];
-
-	/*
-	 * fr0 & fr1 don't need to be checked because Illegal Instruction faults have
-	 * higher priority than unaligned faults.
-	 *
-	 * r0 cannot be found as the base as it would never generate an unaligned
-	 * reference.
-	 */
-
-	/*
-	 * make sure we get clean buffers
-	 */
-	memset(&fpr_init, 0, sizeof(fpr_init));
-	memset(&fpr_final, 0, sizeof(fpr_final));
-
-	/*
-	 * ldfpX.a: we don't try to emulate anything but we must
-	 * invalidate the ALAT entry and execute updates, if any.
-	 */
-	if (ld.x6_op != 0x2) {
-		/*
-		 * This assumes little-endian byte-order.  Note that there is no "ldfpe"
-		 * instruction:
-		 */
-		if (emulate_load(&fpr_init[0], ifa, len, kernel_mode)
-		    || emulate_load(&fpr_init[1], (ifa + len), len, kernel_mode))
-			return -1;
-
-		DPRINT("ld.r1=%d ld.imm=%d x6_sz=%d\n", ld.r1, ld.imm, ld.x6_sz);
-		DDUMP("frp_init =", &fpr_init, 2*len);
-		/*
-		 * XXX fixme
-		 * Could optimize inlines by using ldfpX & 2 spills
-		 */
-		switch( ld.x6_sz ) {
-			case 0:
-				mem2float_extended(&fpr_init[0], &fpr_final[0]);
-				mem2float_extended(&fpr_init[1], &fpr_final[1]);
-				break;
-			case 1:
-				mem2float_integer(&fpr_init[0], &fpr_final[0]);
-				mem2float_integer(&fpr_init[1], &fpr_final[1]);
-				break;
-			case 2:
-				mem2float_single(&fpr_init[0], &fpr_final[0]);
-				mem2float_single(&fpr_init[1], &fpr_final[1]);
-				break;
-			case 3:
-				mem2float_double(&fpr_init[0], &fpr_final[0]);
-				mem2float_double(&fpr_init[1], &fpr_final[1]);
-				break;
-		}
-		DDUMP("fpr_final =", &fpr_final, 2*len);
-		/*
-		 * XXX fixme
-		 *
-		 * A possible optimization would be to drop fpr_final and directly
-		 * use the storage from the saved context i.e., the actual final
-		 * destination (pt_regs, switch_stack or thread structure).
-		 */
-		setfpreg(ld.r1, &fpr_final[0], regs);
-		setfpreg(ld.imm, &fpr_final[1], regs);
-	}
-
-	/*
-	 * Check for updates: only immediate updates are available for this
-	 * instruction.
-	 */
-	if (ld.m) {
-		/*
-		 * the immediate is implicit given the ldsz of the operation:
-		 * single: 8 (2x4) and for  all others it's 16 (2x8)
-		 */
-		ifa += len<<1;
-
-		/*
-		 * IMPORTANT:
-		 * the fact that we force the NaT of r3 to zero is ONLY valid
-		 * as long as we don't come here with a ldfpX.s.
-		 * For this reason we keep this sanity check
-		 */
-		if (ld.x6_op == 1 || ld.x6_op == 3)
-			printk(KERN_ERR "%s: register update on speculative load pair, error\n",
-			       __func__);
-
-		setreg(ld.r3, ifa, 0, regs);
-	}
-
-	/*
-	 * Invalidate ALAT entries, if any, for both registers.
-	 */
-	if (ld.x6_op == 0x2) {
-		invala_fr(ld.r1);
-		invala_fr(ld.imm);
-	}
-	return 0;
-}
-
-
-static int
-emulate_load_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-	            bool kernel_mode)
-{
-	struct ia64_fpreg fpr_init;
-	struct ia64_fpreg fpr_final;
-	unsigned long len = float_fsz[ld.x6_sz];
-
-	/*
-	 * fr0 & fr1 don't need to be checked because Illegal Instruction
-	 * faults have higher priority than unaligned faults.
-	 *
-	 * r0 cannot be found as the base as it would never generate an
-	 * unaligned reference.
-	 */
-
-	/*
-	 * make sure we get clean buffers
-	 */
-	memset(&fpr_init,0, sizeof(fpr_init));
-	memset(&fpr_final,0, sizeof(fpr_final));
-
-	/*
-	 * ldfX.a we don't try to emulate anything but we must
-	 * invalidate the ALAT entry.
-	 * See comments in ldX for descriptions on how the various loads are handled.
-	 */
-	if (ld.x6_op != 0x2) {
-		if (emulate_load(&fpr_init, ifa, len, kernel_mode))
-			return -1;
-
-		DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz);
-		DDUMP("fpr_init =", &fpr_init, len);
-		/*
-		 * we only do something for x6_op={0,8,9}
-		 */
-		switch( ld.x6_sz ) {
-			case 0:
-				mem2float_extended(&fpr_init, &fpr_final);
-				break;
-			case 1:
-				mem2float_integer(&fpr_init, &fpr_final);
-				break;
-			case 2:
-				mem2float_single(&fpr_init, &fpr_final);
-				break;
-			case 3:
-				mem2float_double(&fpr_init, &fpr_final);
-				break;
-		}
-		DDUMP("fpr_final =", &fpr_final, len);
-		/*
-		 * XXX fixme
-		 *
-		 * A possible optimization would be to drop fpr_final and directly
-		 * use the storage from the saved context i.e., the actual final
-		 * destination (pt_regs, switch_stack or thread structure).
-		 */
-		setfpreg(ld.r1, &fpr_final, regs);
-	}
-
-	/*
-	 * check for updates on any loads
-	 */
-	if (ld.op == 0x7 || ld.m)
-		emulate_load_updates(ld.op == 0x7 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa);
-
-	/*
-	 * invalidate ALAT entry in case of advanced floating point loads
-	 */
-	if (ld.x6_op == 0x2)
-		invala_fr(ld.r1);
-
-	return 0;
-}
-
-
-static int
-emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs,
-		     bool kernel_mode)
-{
-	struct ia64_fpreg fpr_init;
-	struct ia64_fpreg fpr_final;
-	unsigned long len = float_fsz[ld.x6_sz];
-
-	/*
-	 * make sure we get clean buffers
-	 */
-	memset(&fpr_init,0, sizeof(fpr_init));
-	memset(&fpr_final,0, sizeof(fpr_final));
-
-	/*
-	 * if we get to this handler, Nat bits on both r3 and r2 have already
-	 * been checked. so we don't need to do it
-	 *
-	 * extract the value to be stored
-	 */
-	getfpreg(ld.imm, &fpr_init, regs);
-	/*
-	 * during this step, we extract the spilled registers from the saved
-	 * context i.e., we refill. Then we store (no spill) to temporary
-	 * aligned location
-	 */
-	switch( ld.x6_sz ) {
-		case 0:
-			float2mem_extended(&fpr_init, &fpr_final);
-			break;
-		case 1:
-			float2mem_integer(&fpr_init, &fpr_final);
-			break;
-		case 2:
-			float2mem_single(&fpr_init, &fpr_final);
-			break;
-		case 3:
-			float2mem_double(&fpr_init, &fpr_final);
-			break;
-	}
-	DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz);
-	DDUMP("fpr_init =", &fpr_init, len);
-	DDUMP("fpr_final =", &fpr_final, len);
-
-	if (emulate_store(ifa, &fpr_final, len, kernel_mode))
-		return -1;
-
-	/*
-	 * stfX [r3]=r2,imm(9)
-	 *
-	 * NOTE:
-	 * ld.r3 can never be r0, because r0 would not generate an
-	 * unaligned access.
-	 */
-	if (ld.op == 0x7) {
-		unsigned long imm;
-
-		/*
-		 * form imm9: [12:6] contain first 7bits
-		 */
-		imm = ld.x << 7 | ld.r1;
-		/*
-		 * sign extend (8bits) if m set
-		 */
-		if (ld.m)
-			imm |= SIGN_EXT9;
-		/*
-		 * ifa == r3 (NaT is necessarily cleared)
-		 */
-		ifa += imm;
-
-		DPRINT("imm=%lx r3=%lx\n", imm, ifa);
-
-		setreg(ld.r3, ifa, 0, regs);
-	}
-	/*
-	 * we don't have alat_invalidate_multiple() so we need
-	 * to do the complete flush :-<<
-	 */
-	ia64_invala();
-
-	return 0;
-}
-
-/*
- * Make sure we log the unaligned access, so that user/sysadmin can notice it and
- * eventually fix the program.  However, we don't want to do that for every access so we
- * pace it with jiffies.
- */
-static DEFINE_RATELIMIT_STATE(logging_rate_limit, 5 * HZ, 5);
-
-void
-ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
-{
-	struct ia64_psr *ipsr = ia64_psr(regs);
-	unsigned long bundle[2];
-	unsigned long opcode;
-	const struct exception_table_entry *eh = NULL;
-	union {
-		unsigned long l;
-		load_store_t insn;
-	} u;
-	int ret = -1;
-	bool kernel_mode = false;
-
-	if (ia64_psr(regs)->be) {
-		/* we don't support big-endian accesses */
-		if (die_if_kernel("big-endian unaligned accesses are not supported", regs, 0))
-			return;
-		goto force_sigbus;
-	}
-
-	/*
-	 * Treat kernel accesses for which there is an exception handler entry the same as
-	 * user-level unaligned accesses.  Otherwise, a clever program could trick this
-	 * handler into reading an arbitrary kernel addresses...
-	 */
-	if (!user_mode(regs))
-		eh = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
-	if (user_mode(regs) || eh) {
-		if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0)
-			goto force_sigbus;
-
-		if (!no_unaligned_warning &&
-		    !(current->thread.flags & IA64_THREAD_UAC_NOPRINT) &&
-		    __ratelimit(&logging_rate_limit))
-		{
-			char buf[200];	/* comm[] is at most 16 bytes... */
-			size_t len;
-
-			len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, "
-				      "ip=0x%016lx\n\r", current->comm,
-				      task_pid_nr(current),
-				      ifa, regs->cr_iip + ipsr->ri);
-			/*
-			 * Don't call tty_write_message() if we're in the kernel; we might
-			 * be holding locks...
-			 */
-			if (user_mode(regs)) {
-				struct tty_struct *tty = get_current_tty();
-				tty_write_message(tty, buf);
-				tty_kref_put(tty);
-			}
-			buf[len-1] = '\0';	/* drop '\r' */
-			/* watch for command names containing %s */
-			printk(KERN_WARNING "%s", buf);
-		} else {
-			if (no_unaligned_warning) {
-				printk_once(KERN_WARNING "%s(%d) encountered an "
-				       "unaligned exception which required\n"
-				       "kernel assistance, which degrades "
-				       "the performance of the application.\n"
-				       "Unaligned exception warnings have "
-				       "been disabled by the system "
-				       "administrator\n"
-				       "echo 0 > /proc/sys/kernel/ignore-"
-				       "unaligned-usertrap to re-enable\n",
-				       current->comm, task_pid_nr(current));
-			}
-		}
-	} else {
-		if (__ratelimit(&logging_rate_limit)) {
-			printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n",
-			       ifa, regs->cr_iip + ipsr->ri);
-			if (unaligned_dump_stack)
-				dump_stack();
-		}
-		kernel_mode = true;
-	}
-
-	DPRINT("iip=%lx ifa=%lx isr=%lx (ei=%d, sp=%d)\n",
-	       regs->cr_iip, ifa, regs->cr_ipsr, ipsr->ri, ipsr->it);
-
-	if (emulate_load(bundle, regs->cr_iip, 16, kernel_mode))
-		goto failure;
-
-	/*
-	 * extract the instruction from the bundle given the slot number
-	 */
-	switch (ipsr->ri) {
-	      default:
-	      case 0: u.l = (bundle[0] >>  5); break;
-	      case 1: u.l = (bundle[0] >> 46) | (bundle[1] << 18); break;
-	      case 2: u.l = (bundle[1] >> 23); break;
-	}
-	opcode = (u.l >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK;
-
-	DPRINT("opcode=%lx ld.qp=%d ld.r1=%d ld.imm=%d ld.r3=%d ld.x=%d ld.hint=%d "
-	       "ld.x6=0x%x ld.m=%d ld.op=%d\n", opcode, u.insn.qp, u.insn.r1, u.insn.imm,
-	       u.insn.r3, u.insn.x, u.insn.hint, u.insn.x6_sz, u.insn.m, u.insn.op);
-
-	/*
-	 * IMPORTANT:
-	 * Notice that the switch statement DOES not cover all possible instructions
-	 * that DO generate unaligned references. This is made on purpose because for some
-	 * instructions it DOES NOT make sense to try and emulate the access. Sometimes it
-	 * is WRONG to try and emulate. Here is a list of instruction we don't emulate i.e.,
-	 * the program will get a signal and die:
-	 *
-	 *	load/store:
-	 *		- ldX.spill
-	 *		- stX.spill
-	 *	Reason: RNATs are based on addresses
-	 *		- ld16
-	 *		- st16
-	 *	Reason: ld16 and st16 are supposed to occur in a single
-	 *		memory op
-	 *
-	 *	synchronization:
-	 *		- cmpxchg
-	 *		- fetchadd
-	 *		- xchg
-	 *	Reason: ATOMIC operations cannot be emulated properly using multiple
-	 *	        instructions.
-	 *
-	 *	speculative loads:
-	 *		- ldX.sZ
-	 *	Reason: side effects, code must be ready to deal with failure so simpler
-	 *		to let the load fail.
-	 * ---------------------------------------------------------------------------------
-	 * XXX fixme
-	 *
-	 * I would like to get rid of this switch case and do something
-	 * more elegant.
-	 */
-	switch (opcode) {
-	      case LDS_OP:
-	      case LDSA_OP:
-		if (u.insn.x)
-			/* oops, really a semaphore op (cmpxchg, etc) */
-			goto failure;
-		fallthrough;
-	      case LDS_IMM_OP:
-	      case LDSA_IMM_OP:
-	      case LDFS_OP:
-	      case LDFSA_OP:
-	      case LDFS_IMM_OP:
-		/*
-		 * The instruction will be retried with deferred exceptions turned on, and
-		 * we should get Nat bit installed
-		 *
-		 * IMPORTANT: When PSR_ED is set, the register & immediate update forms
-		 * are actually executed even though the operation failed. So we don't
-		 * need to take care of this.
-		 */
-		DPRINT("forcing PSR_ED\n");
-		regs->cr_ipsr |= IA64_PSR_ED;
-		goto done;
-
-	      case LD_OP:
-	      case LDA_OP:
-	      case LDBIAS_OP:
-	      case LDACQ_OP:
-	      case LDCCLR_OP:
-	      case LDCNC_OP:
-	      case LDCCLRACQ_OP:
-		if (u.insn.x)
-			/* oops, really a semaphore op (cmpxchg, etc) */
-			goto failure;
-		fallthrough;
-	      case LD_IMM_OP:
-	      case LDA_IMM_OP:
-	      case LDBIAS_IMM_OP:
-	      case LDACQ_IMM_OP:
-	      case LDCCLR_IMM_OP:
-	      case LDCNC_IMM_OP:
-	      case LDCCLRACQ_IMM_OP:
-		ret = emulate_load_int(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case ST_OP:
-	      case STREL_OP:
-		if (u.insn.x)
-			/* oops, really a semaphore op (cmpxchg, etc) */
-			goto failure;
-		fallthrough;
-	      case ST_IMM_OP:
-	      case STREL_IMM_OP:
-		ret = emulate_store_int(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case LDF_OP:
-	      case LDFA_OP:
-	      case LDFCCLR_OP:
-	      case LDFCNC_OP:
-		if (u.insn.x)
-			ret = emulate_load_floatpair(ifa, u.insn, regs, kernel_mode);
-		else
-			ret = emulate_load_float(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case LDF_IMM_OP:
-	      case LDFA_IMM_OP:
-	      case LDFCCLR_IMM_OP:
-	      case LDFCNC_IMM_OP:
-		ret = emulate_load_float(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      case STF_OP:
-	      case STF_IMM_OP:
-		ret = emulate_store_float(ifa, u.insn, regs, kernel_mode);
-		break;
-
-	      default:
-		goto failure;
-	}
-	DPRINT("ret=%d\n", ret);
-	if (ret)
-		goto failure;
-
-	if (ipsr->ri == 2)
-		/*
-		 * given today's architecture this case is not likely to happen because a
-		 * memory access instruction (M) can never be in the last slot of a
-		 * bundle. But let's keep it for now.
-		 */
-		regs->cr_iip += 16;
-	ipsr->ri = (ipsr->ri + 1) & 0x3;
-
-	DPRINT("ipsr->ri=%d iip=%lx\n", ipsr->ri, regs->cr_iip);
-  done:
-	return;
-
-  failure:
-	/* something went wrong... */
-	if (!user_mode(regs)) {
-		if (eh) {
-			ia64_handle_exception(regs, eh);
-			goto done;
-		}
-		if (die_if_kernel("error during unaligned kernel access\n", regs, ret))
-			return;
-		/* NOT_REACHED */
-	}
-  force_sigbus:
-	force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *) ifa,
-			0, 0, 0);
-	goto done;
-}
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
deleted file mode 100644
index a0fec82c56b8..000000000000
--- a/arch/ia64/kernel/uncached.c
+++ /dev/null
@@ -1,273 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2001-2008 Silicon Graphics, Inc.  All rights reserved.
- *
- * A simple uncached page allocator using the generic allocator. This
- * allocator first utilizes the spare (spill) pages found in the EFI
- * memmap and will then start converting cached pages to uncached ones
- * at a granule at a time. Node awareness is implemented by having a
- * pool of pages per node.
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/efi.h>
-#include <linux/nmi.h>
-#include <linux/genalloc.h>
-#include <linux/gfp.h>
-#include <linux/pgtable.h>
-#include <asm/efi.h>
-#include <asm/page.h>
-#include <asm/pal.h>
-#include <linux/atomic.h>
-#include <asm/tlbflush.h>
-
-struct uncached_pool {
-	struct gen_pool *pool;
-	struct mutex add_chunk_mutex;	/* serialize adding a converted chunk */
-	int nchunks_added;		/* #of converted chunks added to pool */
-	atomic_t status;		/* smp called function's return status*/
-};
-
-#define MAX_CONVERTED_CHUNKS_PER_NODE	2
-
-struct uncached_pool uncached_pools[MAX_NUMNODES];
-
-
-static void uncached_ipi_visibility(void *data)
-{
-	int status;
-	struct uncached_pool *uc_pool = (struct uncached_pool *)data;
-
-	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
-	if ((status != PAL_VISIBILITY_OK) &&
-	    (status != PAL_VISIBILITY_OK_REMOTE_NEEDED))
-		atomic_inc(&uc_pool->status);
-}
-
-
-static void uncached_ipi_mc_drain(void *data)
-{
-	int status;
-	struct uncached_pool *uc_pool = (struct uncached_pool *)data;
-
-	status = ia64_pal_mc_drain();
-	if (status != PAL_STATUS_SUCCESS)
-		atomic_inc(&uc_pool->status);
-}
-
-
-/*
- * Add a new chunk of uncached memory pages to the specified pool.
- *
- * @pool: pool to add new chunk of uncached memory to
- * @nid: node id of node to allocate memory from, or -1
- *
- * This is accomplished by first allocating a granule of cached memory pages
- * and then converting them to uncached memory pages.
- */
-static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
-{
-	struct page *page;
-	int status, i, nchunks_added = uc_pool->nchunks_added;
-	unsigned long c_addr, uc_addr;
-
-	if (mutex_lock_interruptible(&uc_pool->add_chunk_mutex) != 0)
-		return -1;	/* interrupted by a signal */
-
-	if (uc_pool->nchunks_added > nchunks_added) {
-		/* someone added a new chunk while we were waiting */
-		mutex_unlock(&uc_pool->add_chunk_mutex);
-		return 0;
-	}
-
-	if (uc_pool->nchunks_added >= MAX_CONVERTED_CHUNKS_PER_NODE) {
-		mutex_unlock(&uc_pool->add_chunk_mutex);
-		return -1;
-	}
-
-	/* attempt to allocate a granule's worth of cached memory pages */
-
-	page = __alloc_pages_node(nid,
-				GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
-				IA64_GRANULE_SHIFT-PAGE_SHIFT);
-	if (!page) {
-		mutex_unlock(&uc_pool->add_chunk_mutex);
-		return -1;
-	}
-
-	/* convert the memory pages from cached to uncached */
-
-	c_addr = (unsigned long)page_address(page);
-	uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
-
-	/*
-	 * There's a small race here where it's possible for someone to
-	 * access the page through /dev/mem halfway through the conversion
-	 * to uncached - not sure it's really worth bothering about
-	 */
-	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
-		SetPageUncached(&page[i]);
-
-	flush_tlb_kernel_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
-
-	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
-	if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) {
-		atomic_set(&uc_pool->status, 0);
-		smp_call_function(uncached_ipi_visibility, uc_pool, 1);
-		if (atomic_read(&uc_pool->status))
-			goto failed;
-	} else if (status != PAL_VISIBILITY_OK)
-		goto failed;
-
-	preempt_disable();
-
-	flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
-
-	/* flush the just introduced uncached translation from the TLB */
-	local_flush_tlb_all();
-
-	preempt_enable();
-
-	status = ia64_pal_mc_drain();
-	if (status != PAL_STATUS_SUCCESS)
-		goto failed;
-	atomic_set(&uc_pool->status, 0);
-	smp_call_function(uncached_ipi_mc_drain, uc_pool, 1);
-	if (atomic_read(&uc_pool->status))
-		goto failed;
-
-	/*
-	 * The chunk of memory pages has been converted to uncached so now we
-	 * can add it to the pool.
-	 */
-	status = gen_pool_add(uc_pool->pool, uc_addr, IA64_GRANULE_SIZE, nid);
-	if (status)
-		goto failed;
-
-	uc_pool->nchunks_added++;
-	mutex_unlock(&uc_pool->add_chunk_mutex);
-	return 0;
-
-	/* failed to convert or add the chunk so give it back to the kernel */
-failed:
-	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
-		ClearPageUncached(&page[i]);
-
-	free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT);
-	mutex_unlock(&uc_pool->add_chunk_mutex);
-	return -1;
-}
-
-
-/*
- * uncached_alloc_page
- *
- * @starting_nid: node id of node to start with, or -1
- * @n_pages: number of contiguous pages to allocate
- *
- * Allocate the specified number of contiguous uncached pages on the
- * requested node. If not enough contiguous uncached pages are available
- * on the requested node, roundrobin starting with the next higher node.
- */
-unsigned long uncached_alloc_page(int starting_nid, int n_pages)
-{
-	unsigned long uc_addr;
-	struct uncached_pool *uc_pool;
-	int nid;
-
-	if (unlikely(starting_nid >= MAX_NUMNODES))
-		return 0;
-
-	if (starting_nid < 0)
-		starting_nid = numa_node_id();
-	nid = starting_nid;
-
-	do {
-		if (!node_state(nid, N_HIGH_MEMORY))
-			continue;
-		uc_pool = &uncached_pools[nid];
-		if (uc_pool->pool == NULL)
-			continue;
-		do {
-			uc_addr = gen_pool_alloc(uc_pool->pool,
-						 n_pages * PAGE_SIZE);
-			if (uc_addr != 0)
-				return uc_addr;
-		} while (uncached_add_chunk(uc_pool, nid) == 0);
-
-	} while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid);
-
-	return 0;
-}
-EXPORT_SYMBOL(uncached_alloc_page);
-
-
-/*
- * uncached_free_page
- *
- * @uc_addr: uncached address of first page to free
- * @n_pages: number of contiguous pages to free
- *
- * Free the specified number of uncached pages.
- */
-void uncached_free_page(unsigned long uc_addr, int n_pages)
-{
-	int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET);
-	struct gen_pool *pool = uncached_pools[nid].pool;
-
-	if (unlikely(pool == NULL))
-		return;
-
-	if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
-		panic("uncached_free_page invalid address %lx\n", uc_addr);
-
-	gen_pool_free(pool, uc_addr, n_pages * PAGE_SIZE);
-}
-EXPORT_SYMBOL(uncached_free_page);
-
-
-/*
- * uncached_build_memmap,
- *
- * @uc_start: uncached starting address of a chunk of uncached memory
- * @uc_end: uncached ending address of a chunk of uncached memory
- * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc())
- *
- * Called at boot time to build a map of pages that can be used for
- * memory special operations.
- */
-static int __init uncached_build_memmap(u64 uc_start, u64 uc_end, void *arg)
-{
-	int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET);
-	struct gen_pool *pool = uncached_pools[nid].pool;
-	size_t size = uc_end - uc_start;
-
-	touch_softlockup_watchdog();
-
-	if (pool != NULL) {
-		memset((char *)uc_start, 0, size);
-		(void) gen_pool_add(pool, uc_start, size, nid);
-	}
-	return 0;
-}
-
-
-static int __init uncached_init(void)
-{
-	int nid;
-
-	for_each_online_node(nid) {
-		uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid);
-		mutex_init(&uncached_pools[nid].add_chunk_mutex);
-	}
-
-	efi_memmap_walk_uc(uncached_build_memmap, NULL);
-	return 0;
-}
-
-__initcall(uncached_init);
diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c
deleted file mode 100644
index 6bd64c35e691..000000000000
--- a/arch/ia64/kernel/unwind.c
+++ /dev/null
@@ -1,2320 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 1999-2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com>
- * 	- Change pt_regs_off() to make it less dependent on pt_regs structure.
- */
-/*
- * This file implements call frame unwind support for the Linux
- * kernel.  Parsing and processing the unwind information is
- * time-consuming, so this implementation translates the unwind
- * descriptors into unwind scripts.  These scripts are very simple
- * (basically a sequence of assignments) and efficient to execute.
- * They are cached for later re-use.  Each script is specific for a
- * given instruction pointer address and the set of predicate values
- * that the script depends on (most unwind descriptors are
- * unconditional and scripts often do not depend on predicates at
- * all).  This code is based on the unwind conventions described in
- * the "IA-64 Software Conventions and Runtime Architecture" manual.
- *
- * SMP conventions:
- *	o updates to the global unwind data (in structure "unw") are serialized
- *	  by the unw.lock spinlock
- *	o each unwind script has its own read-write lock; a thread must acquire
- *	  a read lock before executing a script and must acquire a write lock
- *	  before modifying a script
- *	o if both the unw.lock spinlock and a script's read-write lock must be
- *	  acquired, then the read-write lock must be acquired first.
- */
-#include <linux/module.h>
-#include <linux/memblock.h>
-#include <linux/elf.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include <asm/unwind.h>
-
-#include <asm/delay.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/ptrace_offsets.h>
-#include <asm/rse.h>
-#include <asm/sections.h>
-#include <linux/uaccess.h>
-
-#include "entry.h"
-#include "unwind_i.h"
-
-#define UNW_LOG_CACHE_SIZE	7	/* each unw_script is ~256 bytes in size */
-#define UNW_CACHE_SIZE		(1 << UNW_LOG_CACHE_SIZE)
-
-#define UNW_LOG_HASH_SIZE	(UNW_LOG_CACHE_SIZE + 1)
-#define UNW_HASH_SIZE		(1 << UNW_LOG_HASH_SIZE)
-
-#define UNW_STATS	0	/* WARNING: this disabled interrupts for long time-spans!! */
-
-#ifdef UNW_DEBUG
-  static unsigned int unw_debug_level = UNW_DEBUG;
-#  define UNW_DEBUG_ON(n)	unw_debug_level >= n
-   /* Do not code a printk level, not all debug lines end in newline */
-#  define UNW_DPRINT(n, ...)  if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__)
-#  undef inline
-#  define inline
-#else /* !UNW_DEBUG */
-#  define UNW_DEBUG_ON(n)  0
-#  define UNW_DPRINT(n, ...)
-#endif /* UNW_DEBUG */
-
-#if UNW_STATS
-# define STAT(x...)	x
-#else
-# define STAT(x...)
-#endif
-
-#define alloc_reg_state()	kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)
-#define free_reg_state(usr)	kfree(usr)
-#define alloc_labeled_state()	kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)
-#define free_labeled_state(usr)	kfree(usr)
-
-typedef unsigned long unw_word;
-typedef unsigned char unw_hash_index_t;
-
-static struct {
-	spinlock_t lock;			/* spinlock for unwind data */
-
-	/* list of unwind tables (one per load-module) */
-	struct unw_table *tables;
-
-	unsigned long r0;			/* constant 0 for r0 */
-
-	/* table of registers that prologues can save (and order in which they're saved): */
-	const unsigned char save_order[8];
-
-	/* maps a preserved register index (preg_index) to corresponding switch_stack offset: */
-	unsigned short sw_off[sizeof(struct unw_frame_info) / 8];
-
-	unsigned short lru_head;		/* index of lead-recently used script */
-	unsigned short lru_tail;		/* index of most-recently used script */
-
-	/* index into unw_frame_info for preserved register i */
-	unsigned short preg_index[UNW_NUM_REGS];
-
-	short pt_regs_offsets[32];
-
-	/* unwind table for the kernel: */
-	struct unw_table kernel_table;
-
-	/* unwind table describing the gate page (kernel code that is mapped into user space): */
-	size_t gate_table_size;
-	unsigned long *gate_table;
-
-	/* hash table that maps instruction pointer to script index: */
-	unsigned short hash[UNW_HASH_SIZE];
-
-	/* script cache: */
-	struct unw_script cache[UNW_CACHE_SIZE];
-
-# ifdef UNW_DEBUG
-	const char *preg_name[UNW_NUM_REGS];
-# endif
-# if UNW_STATS
-	struct {
-		struct {
-			int lookups;
-			int hinted_hits;
-			int normal_hits;
-			int collision_chain_traversals;
-		} cache;
-		struct {
-			unsigned long build_time;
-			unsigned long run_time;
-			unsigned long parse_time;
-			int builds;
-			int news;
-			int collisions;
-			int runs;
-		} script;
-		struct {
-			unsigned long init_time;
-			unsigned long unwind_time;
-			int inits;
-			int unwinds;
-		} api;
-	} stat;
-# endif
-} unw = {
-	.tables = &unw.kernel_table,
-	.lock = __SPIN_LOCK_UNLOCKED(unw.lock),
-	.save_order = {
-		UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR,
-		UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR
-	},
-	.preg_index = {
-		offsetof(struct unw_frame_info, pri_unat_loc)/8,	/* PRI_UNAT_GR */
-		offsetof(struct unw_frame_info, pri_unat_loc)/8,	/* PRI_UNAT_MEM */
-		offsetof(struct unw_frame_info, bsp_loc)/8,
-		offsetof(struct unw_frame_info, bspstore_loc)/8,
-		offsetof(struct unw_frame_info, pfs_loc)/8,
-		offsetof(struct unw_frame_info, rnat_loc)/8,
-		offsetof(struct unw_frame_info, psp)/8,
-		offsetof(struct unw_frame_info, rp_loc)/8,
-		offsetof(struct unw_frame_info, r4)/8,
-		offsetof(struct unw_frame_info, r5)/8,
-		offsetof(struct unw_frame_info, r6)/8,
-		offsetof(struct unw_frame_info, r7)/8,
-		offsetof(struct unw_frame_info, unat_loc)/8,
-		offsetof(struct unw_frame_info, pr_loc)/8,
-		offsetof(struct unw_frame_info, lc_loc)/8,
-		offsetof(struct unw_frame_info, fpsr_loc)/8,
-		offsetof(struct unw_frame_info, b1_loc)/8,
-		offsetof(struct unw_frame_info, b2_loc)/8,
-		offsetof(struct unw_frame_info, b3_loc)/8,
-		offsetof(struct unw_frame_info, b4_loc)/8,
-		offsetof(struct unw_frame_info, b5_loc)/8,
-		offsetof(struct unw_frame_info, f2_loc)/8,
-		offsetof(struct unw_frame_info, f3_loc)/8,
-		offsetof(struct unw_frame_info, f4_loc)/8,
-		offsetof(struct unw_frame_info, f5_loc)/8,
-		offsetof(struct unw_frame_info, fr_loc[16 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[17 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[18 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[19 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[20 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[21 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[22 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[23 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[24 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[25 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[26 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[27 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[28 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[29 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[30 - 16])/8,
-		offsetof(struct unw_frame_info, fr_loc[31 - 16])/8,
-	},
-	.pt_regs_offsets = {
-		[0] = -1,
-		offsetof(struct pt_regs,  r1),
-		offsetof(struct pt_regs,  r2),
-		offsetof(struct pt_regs,  r3),
-		[4] = -1, [5] = -1, [6] = -1, [7] = -1,
-		offsetof(struct pt_regs,  r8),
-		offsetof(struct pt_regs,  r9),
-		offsetof(struct pt_regs, r10),
-		offsetof(struct pt_regs, r11),
-		offsetof(struct pt_regs, r12),
-		offsetof(struct pt_regs, r13),
-		offsetof(struct pt_regs, r14),
-		offsetof(struct pt_regs, r15),
-		offsetof(struct pt_regs, r16),
-		offsetof(struct pt_regs, r17),
-		offsetof(struct pt_regs, r18),
-		offsetof(struct pt_regs, r19),
-		offsetof(struct pt_regs, r20),
-		offsetof(struct pt_regs, r21),
-		offsetof(struct pt_regs, r22),
-		offsetof(struct pt_regs, r23),
-		offsetof(struct pt_regs, r24),
-		offsetof(struct pt_regs, r25),
-		offsetof(struct pt_regs, r26),
-		offsetof(struct pt_regs, r27),
-		offsetof(struct pt_regs, r28),
-		offsetof(struct pt_regs, r29),
-		offsetof(struct pt_regs, r30),
-		offsetof(struct pt_regs, r31),
-	},
-	.hash = { [0 ... UNW_HASH_SIZE - 1] = -1 },
-#ifdef UNW_DEBUG
-	.preg_name = {
-		"pri_unat_gr", "pri_unat_mem", "bsp", "bspstore", "ar.pfs", "ar.rnat", "psp", "rp",
-		"r4", "r5", "r6", "r7",
-		"ar.unat", "pr", "ar.lc", "ar.fpsr",
-		"b1", "b2", "b3", "b4", "b5",
-		"f2", "f3", "f4", "f5",
-		"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
-		"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"
-	}
-#endif
-};
-
-static inline int
-read_only (void *addr)
-{
-	return (unsigned long) ((char *) addr - (char *) &unw.r0) < sizeof(unw.r0);
-}
-
-/*
- * Returns offset of rREG in struct pt_regs.
- */
-static inline unsigned long
-pt_regs_off (unsigned long reg)
-{
-	short off = -1;
-
-	if (reg < ARRAY_SIZE(unw.pt_regs_offsets))
-		off = unw.pt_regs_offsets[reg];
-
-	if (off < 0) {
-		UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __func__, reg);
-		off = 0;
-	}
-	return (unsigned long) off;
-}
-
-static inline struct pt_regs *
-get_scratch_regs (struct unw_frame_info *info)
-{
-	if (!info->pt) {
-		/* This should not happen with valid unwind info.  */
-		UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __func__);
-		if (info->flags & UNW_FLAG_INTERRUPT_FRAME)
-			info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1);
-		else
-			info->pt = info->sp - 16;
-	}
-	UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __func__, info->sp, info->pt);
-	return (struct pt_regs *) info->pt;
-}
-
-/* Unwind accessors.  */
-
-int
-unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char *nat, int write)
-{
-	unsigned long *addr, *nat_addr, nat_mask = 0, dummy_nat;
-	struct unw_ireg *ireg;
-	struct pt_regs *pt;
-
-	if ((unsigned) regnum - 1 >= 127) {
-		if (regnum == 0 && !write) {
-			*val = 0;	/* read r0 always returns 0 */
-			*nat = 0;
-			return 0;
-		}
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-
-	if (regnum < 32) {
-		if (regnum >= 4 && regnum <= 7) {
-			/* access a preserved register */
-			ireg = &info->r4 + (regnum - 4);
-			addr = ireg->loc;
-			if (addr) {
-				nat_addr = addr + ireg->nat.off;
-				switch (ireg->nat.type) {
-				      case UNW_NAT_VAL:
-					/* simulate getf.sig/setf.sig */
-					if (write) {
-						if (*nat) {
-							/* write NaTVal and be done with it */
-							addr[0] = 0;
-							addr[1] = 0x1fffe;
-							return 0;
-						}
-						addr[1] = 0x1003e;
-					} else {
-						if (addr[0] == 0 && addr[1] == 0x1ffe) {
-							/* return NaT and be done with it */
-							*val = 0;
-							*nat = 1;
-							return 0;
-						}
-					}
-					fallthrough;
-				      case UNW_NAT_NONE:
-					dummy_nat = 0;
-					nat_addr = &dummy_nat;
-					break;
-
-				      case UNW_NAT_MEMSTK:
-					nat_mask = (1UL << ((long) addr & 0x1f8)/8);
-					break;
-
-				      case UNW_NAT_REGSTK:
-					nat_addr = ia64_rse_rnat_addr(addr);
-					if ((unsigned long) addr < info->regstk.limit
-					    || (unsigned long) addr >= info->regstk.top)
-					{
-						UNW_DPRINT(0, "unwind.%s: %p outside of regstk "
-							"[0x%lx-0x%lx)\n",
-							__func__, (void *) addr,
-							info->regstk.limit,
-							info->regstk.top);
-						return -1;
-					}
-					if ((unsigned long) nat_addr >= info->regstk.top)
-						nat_addr = &info->sw->ar_rnat;
-					nat_mask = (1UL << ia64_rse_slot_num(addr));
-					break;
-				}
-			} else {
-				addr = &info->sw->r4 + (regnum - 4);
-				nat_addr = &info->sw->ar_unat;
-				nat_mask = (1UL << ((long) addr & 0x1f8)/8);
-			}
-		} else {
-			/* access a scratch register */
-			pt = get_scratch_regs(info);
-			addr = (unsigned long *) ((unsigned long)pt + pt_regs_off(regnum));
-			if (info->pri_unat_loc)
-				nat_addr = info->pri_unat_loc;
-			else
-				nat_addr = &info->sw->caller_unat;
-			nat_mask = (1UL << ((long) addr & 0x1f8)/8);
-		}
-	} else {
-		/* access a stacked register */
-		addr = ia64_rse_skip_regs((unsigned long *) info->bsp, regnum - 32);
-		nat_addr = ia64_rse_rnat_addr(addr);
-		if ((unsigned long) addr < info->regstk.limit
-		    || (unsigned long) addr >= info->regstk.top)
-		{
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside "
-				   "of rbs\n",  __func__);
-			return -1;
-		}
-		if ((unsigned long) nat_addr >= info->regstk.top)
-			nat_addr = &info->sw->ar_rnat;
-		nat_mask = (1UL << ia64_rse_slot_num(addr));
-	}
-
-	if (write) {
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else {
-			*addr = *val;
-			if (*nat)
-				*nat_addr |= nat_mask;
-			else
-				*nat_addr &= ~nat_mask;
-		}
-	} else {
-		if ((*nat_addr & nat_mask) == 0) {
-			*val = *addr;
-			*nat = 0;
-		} else {
-			*val = 0;	/* if register is a NaT, *addr may contain kernel data! */
-			*nat = 1;
-		}
-	}
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_gr);
-
-int
-unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int write)
-{
-	unsigned long *addr;
-	struct pt_regs *pt;
-
-	switch (regnum) {
-		/* scratch: */
-	      case 0: pt = get_scratch_regs(info); addr = &pt->b0; break;
-	      case 6: pt = get_scratch_regs(info); addr = &pt->b6; break;
-	      case 7: pt = get_scratch_regs(info); addr = &pt->b7; break;
-
-		/* preserved: */
-	      case 1: case 2: case 3: case 4: case 5:
-		addr = *(&info->b1_loc + (regnum - 1));
-		if (!addr)
-			addr = &info->sw->b1 + (regnum - 1);
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-	if (write)
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_br);
-
-int
-unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, int write)
-{
-	struct ia64_fpreg *addr = NULL;
-	struct pt_regs *pt;
-
-	if ((unsigned) (regnum - 2) >= 126) {
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-
-	if (regnum <= 5) {
-		addr = *(&info->f2_loc + (regnum - 2));
-		if (!addr)
-			addr = &info->sw->f2 + (regnum - 2);
-	} else if (regnum <= 15) {
-		if (regnum <= 11) {
-			pt = get_scratch_regs(info);
-			addr = &pt->f6  + (regnum - 6);
-		}
-		else
-			addr = &info->sw->f12 + (regnum - 12);
-	} else if (regnum <= 31) {
-		addr = info->fr_loc[regnum - 16];
-		if (!addr)
-			addr = &info->sw->f16 + (regnum - 16);
-	} else {
-		struct task_struct *t = info->task;
-
-		if (write)
-			ia64_sync_fph(t);
-		else
-			ia64_flush_fph(t);
-		addr = t->thread.fph + (regnum - 32);
-	}
-
-	if (write)
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_fr);
-
-int
-unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int write)
-{
-	unsigned long *addr;
-	struct pt_regs *pt;
-
-	switch (regnum) {
-	      case UNW_AR_BSP:
-		addr = info->bsp_loc;
-		if (!addr)
-			addr = &info->sw->ar_bspstore;
-		break;
-
-	      case UNW_AR_BSPSTORE:
-		addr = info->bspstore_loc;
-		if (!addr)
-			addr = &info->sw->ar_bspstore;
-		break;
-
-	      case UNW_AR_PFS:
-		addr = info->pfs_loc;
-		if (!addr)
-			addr = &info->sw->ar_pfs;
-		break;
-
-	      case UNW_AR_RNAT:
-		addr = info->rnat_loc;
-		if (!addr)
-			addr = &info->sw->ar_rnat;
-		break;
-
-	      case UNW_AR_UNAT:
-		addr = info->unat_loc;
-		if (!addr)
-			addr = &info->sw->caller_unat;
-		break;
-
-	      case UNW_AR_LC:
-		addr = info->lc_loc;
-		if (!addr)
-			addr = &info->sw->ar_lc;
-		break;
-
-	      case UNW_AR_EC:
-		if (!info->cfm_loc)
-			return -1;
-		if (write)
-			*info->cfm_loc =
-				(*info->cfm_loc & ~(0x3fUL << 52)) | ((*val & 0x3f) << 52);
-		else
-			*val = (*info->cfm_loc >> 52) & 0x3f;
-		return 0;
-
-	      case UNW_AR_FPSR:
-		addr = info->fpsr_loc;
-		if (!addr)
-			addr = &info->sw->ar_fpsr;
-		break;
-
-	      case UNW_AR_RSC:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_rsc;
-		break;
-
-	      case UNW_AR_CCV:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_ccv;
-		break;
-
-	      case UNW_AR_CSD:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_csd;
-		break;
-
-	      case UNW_AR_SSD:
-		pt = get_scratch_regs(info);
-		addr = &pt->ar_ssd;
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n",
-			   __func__, regnum);
-		return -1;
-	}
-
-	if (write) {
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	} else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_ar);
-
-int
-unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write)
-{
-	unsigned long *addr;
-
-	addr = info->pr_loc;
-	if (!addr)
-		addr = &info->sw->pr;
-
-	if (write) {
-		if (read_only(addr)) {
-			UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n",
-				__func__);
-		} else
-			*addr = *val;
-	} else
-		*val = *addr;
-	return 0;
-}
-EXPORT_SYMBOL(unw_access_pr);
-
-
-/* Routines to manipulate the state stack.  */
-
-static inline void
-push (struct unw_state_record *sr)
-{
-	struct unw_reg_state *rs;
-
-	rs = alloc_reg_state();
-	if (!rs) {
-		printk(KERN_ERR "unwind: cannot stack reg state!\n");
-		return;
-	}
-	memcpy(rs, &sr->curr, sizeof(*rs));
-	sr->curr.next = rs;
-}
-
-static void
-pop (struct unw_state_record *sr)
-{
-	struct unw_reg_state *rs = sr->curr.next;
-
-	if (!rs) {
-		printk(KERN_ERR "unwind: stack underflow!\n");
-		return;
-	}
-	memcpy(&sr->curr, rs, sizeof(*rs));
-	free_reg_state(rs);
-}
-
-/* Make a copy of the state stack.  Non-recursive to avoid stack overflows.  */
-static struct unw_reg_state *
-dup_state_stack (struct unw_reg_state *rs)
-{
-	struct unw_reg_state *copy, *prev = NULL, *first = NULL;
-
-	while (rs) {
-		copy = alloc_reg_state();
-		if (!copy) {
-			printk(KERN_ERR "unwind.dup_state_stack: out of memory\n");
-			return NULL;
-		}
-		memcpy(copy, rs, sizeof(*copy));
-		if (first)
-			prev->next = copy;
-		else
-			first = copy;
-		rs = rs->next;
-		prev = copy;
-	}
-	return first;
-}
-
-/* Free all stacked register states (but not RS itself).  */
-static void
-free_state_stack (struct unw_reg_state *rs)
-{
-	struct unw_reg_state *p, *next;
-
-	for (p = rs->next; p != NULL; p = next) {
-		next = p->next;
-		free_reg_state(p);
-	}
-	rs->next = NULL;
-}
-
-/* Unwind decoder routines */
-
-static enum unw_register_index __attribute_const__
-decode_abreg (unsigned char abreg, int memory)
-{
-	switch (abreg) {
-	      case 0x04 ... 0x07: return UNW_REG_R4 + (abreg - 0x04);
-	      case 0x22 ... 0x25: return UNW_REG_F2 + (abreg - 0x22);
-	      case 0x30 ... 0x3f: return UNW_REG_F16 + (abreg - 0x30);
-	      case 0x41 ... 0x45: return UNW_REG_B1 + (abreg - 0x41);
-	      case 0x60: return UNW_REG_PR;
-	      case 0x61: return UNW_REG_PSP;
-	      case 0x62: return memory ? UNW_REG_PRI_UNAT_MEM : UNW_REG_PRI_UNAT_GR;
-	      case 0x63: return UNW_REG_RP;
-	      case 0x64: return UNW_REG_BSP;
-	      case 0x65: return UNW_REG_BSPSTORE;
-	      case 0x66: return UNW_REG_RNAT;
-	      case 0x67: return UNW_REG_UNAT;
-	      case 0x68: return UNW_REG_FPSR;
-	      case 0x69: return UNW_REG_PFS;
-	      case 0x6a: return UNW_REG_LC;
-	      default:
-		break;
-	}
-	UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __func__, abreg);
-	return UNW_REG_LC;
-}
-
-static void
-set_reg (struct unw_reg_info *reg, enum unw_where where, int when, unsigned long val)
-{
-	reg->val = val;
-	reg->where = where;
-	if (reg->when == UNW_WHEN_NEVER)
-		reg->when = when;
-}
-
-static void
-alloc_spill_area (unsigned long *offp, unsigned long regsize,
-		  struct unw_reg_info *lo, struct unw_reg_info *hi)
-{
-	struct unw_reg_info *reg;
-
-	for (reg = hi; reg >= lo; --reg) {
-		if (reg->where == UNW_WHERE_SPILL_HOME) {
-			reg->where = UNW_WHERE_PSPREL;
-			*offp -= regsize;
-			reg->val = *offp;
-		}
-	}
-}
-
-static inline void
-spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word t)
-{
-	struct unw_reg_info *reg;
-
-	for (reg = *regp; reg <= lim; ++reg) {
-		if (reg->where == UNW_WHERE_SPILL_HOME) {
-			reg->when = t;
-			*regp = reg + 1;
-			return;
-		}
-	}
-	UNW_DPRINT(0, "unwind.%s: excess spill!\n",  __func__);
-}
-
-static inline void
-finish_prologue (struct unw_state_record *sr)
-{
-	struct unw_reg_info *reg;
-	unsigned long off;
-	int i;
-
-	/*
-	 * First, resolve implicit register save locations (see Section "11.4.2.3 Rules
-	 * for Using Unwind Descriptors", rule 3):
-	 */
-	for (i = 0; i < (int) ARRAY_SIZE(unw.save_order); ++i) {
-		reg = sr->curr.reg + unw.save_order[i];
-		if (reg->where == UNW_WHERE_GR_SAVE) {
-			reg->where = UNW_WHERE_GR;
-			reg->val = sr->gr_save_loc++;
-		}
-	}
-
-	/*
-	 * Next, compute when the fp, general, and branch registers get
-	 * saved.  This must come before alloc_spill_area() because
-	 * we need to know which registers are spilled to their home
-	 * locations.
-	 */
-	if (sr->imask) {
-		unsigned char kind, mask = 0, *cp = sr->imask;
-		int t;
-		static const unsigned char limit[3] = {
-			UNW_REG_F31, UNW_REG_R7, UNW_REG_B5
-		};
-		struct unw_reg_info *(regs[3]);
-
-		regs[0] = sr->curr.reg + UNW_REG_F2;
-		regs[1] = sr->curr.reg + UNW_REG_R4;
-		regs[2] = sr->curr.reg + UNW_REG_B1;
-
-		for (t = 0; t < sr->region_len; ++t) {
-			if ((t & 3) == 0)
-				mask = *cp++;
-			kind = (mask >> 2*(3-(t & 3))) & 3;
-			if (kind > 0)
-				spill_next_when(&regs[kind - 1], sr->curr.reg + limit[kind - 1],
-						sr->region_start + t);
-		}
-	}
-	/*
-	 * Next, lay out the memory stack spill area:
-	 */
-	if (sr->any_spills) {
-		off = sr->spill_offset;
-		alloc_spill_area(&off, 16, sr->curr.reg + UNW_REG_F2, sr->curr.reg + UNW_REG_F31);
-		alloc_spill_area(&off,  8, sr->curr.reg + UNW_REG_B1, sr->curr.reg + UNW_REG_B5);
-		alloc_spill_area(&off,  8, sr->curr.reg + UNW_REG_R4, sr->curr.reg + UNW_REG_R7);
-	}
-}
-
-/*
- * Region header descriptors.
- */
-
-static void
-desc_prologue (int body, unw_word rlen, unsigned char mask, unsigned char grsave,
-	       struct unw_state_record *sr)
-{
-	int i, region_start;
-
-	if (!(sr->in_body || sr->first_region))
-		finish_prologue(sr);
-	sr->first_region = 0;
-
-	/* check if we're done: */
-	if (sr->when_target < sr->region_start + sr->region_len) {
-		sr->done = 1;
-		return;
-	}
-
-	region_start = sr->region_start + sr->region_len;
-
-	for (i = 0; i < sr->epilogue_count; ++i)
-		pop(sr);
-	sr->epilogue_count = 0;
-	sr->epilogue_start = UNW_WHEN_NEVER;
-
-	sr->region_start = region_start;
-	sr->region_len = rlen;
-	sr->in_body = body;
-
-	if (!body) {
-		push(sr);
-
-		for (i = 0; i < 4; ++i) {
-			if (mask & 0x8)
-				set_reg(sr->curr.reg + unw.save_order[i], UNW_WHERE_GR,
-					sr->region_start + sr->region_len - 1, grsave++);
-			mask <<= 1;
-		}
-		sr->gr_save_loc = grsave;
-		sr->any_spills = 0;
-		sr->imask = NULL;
-		sr->spill_offset = 0x10;	/* default to psp+16 */
-	}
-}
-
-/*
- * Prologue descriptors.
- */
-
-static inline void
-desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr)
-{
-	if (abi == 3 && context == 'i') {
-		sr->flags |= UNW_FLAG_INTERRUPT_FRAME;
-		UNW_DPRINT(3, "unwind.%s: interrupt frame\n",  __func__);
-	}
-	else
-		UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n",
-				__func__, abi, context);
-}
-
-static inline void
-desc_br_gr (unsigned char brmask, unsigned char gr, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 5; ++i) {
-		if (brmask & 1)
-			set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_GR,
-				sr->region_start + sr->region_len - 1, gr++);
-		brmask >>= 1;
-	}
-}
-
-static inline void
-desc_br_mem (unsigned char brmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 5; ++i) {
-		if (brmask & 1) {
-			set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		brmask >>= 1;
-	}
-}
-
-static inline void
-desc_frgr_mem (unsigned char grmask, unw_word frmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((grmask & 1) != 0) {
-			set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		grmask >>= 1;
-	}
-	for (i = 0; i < 20; ++i) {
-		if ((frmask & 1) != 0) {
-			int base = (i < 4) ? UNW_REG_F2 : UNW_REG_F16 - 4;
-			set_reg(sr->curr.reg + base + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		frmask >>= 1;
-	}
-}
-
-static inline void
-desc_fr_mem (unsigned char frmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((frmask & 1) != 0) {
-			set_reg(sr->curr.reg + UNW_REG_F2 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		frmask >>= 1;
-	}
-}
-
-static inline void
-desc_gr_gr (unsigned char grmask, unsigned char gr, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((grmask & 1) != 0)
-			set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_GR,
-				sr->region_start + sr->region_len - 1, gr++);
-		grmask >>= 1;
-	}
-}
-
-static inline void
-desc_gr_mem (unsigned char grmask, struct unw_state_record *sr)
-{
-	int i;
-
-	for (i = 0; i < 4; ++i) {
-		if ((grmask & 1) != 0) {
-			set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME,
-				sr->region_start + sr->region_len - 1, 0);
-			sr->any_spills = 1;
-		}
-		grmask >>= 1;
-	}
-}
-
-static inline void
-desc_mem_stack_f (unw_word t, unw_word size, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + UNW_REG_PSP, UNW_WHERE_NONE,
-		sr->region_start + min_t(int, t, sr->region_len - 1), 16*size);
-}
-
-static inline void
-desc_mem_stack_v (unw_word t, struct unw_state_record *sr)
-{
-	sr->curr.reg[UNW_REG_PSP].when = sr->region_start + min_t(int, t, sr->region_len - 1);
-}
-
-static inline void
-desc_reg_gr (unsigned char reg, unsigned char dst, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + reg, UNW_WHERE_GR, sr->region_start + sr->region_len - 1, dst);
-}
-
-static inline void
-desc_reg_psprel (unsigned char reg, unw_word pspoff, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + reg, UNW_WHERE_PSPREL, sr->region_start + sr->region_len - 1,
-		0x10 - 4*pspoff);
-}
-
-static inline void
-desc_reg_sprel (unsigned char reg, unw_word spoff, struct unw_state_record *sr)
-{
-	set_reg(sr->curr.reg + reg, UNW_WHERE_SPREL, sr->region_start + sr->region_len - 1,
-		4*spoff);
-}
-
-static inline void
-desc_rp_br (unsigned char dst, struct unw_state_record *sr)
-{
-	sr->return_link_reg = dst;
-}
-
-static inline void
-desc_reg_when (unsigned char regnum, unw_word t, struct unw_state_record *sr)
-{
-	struct unw_reg_info *reg = sr->curr.reg + regnum;
-
-	if (reg->where == UNW_WHERE_NONE)
-		reg->where = UNW_WHERE_GR_SAVE;
-	reg->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-}
-
-static inline void
-desc_spill_base (unw_word pspoff, struct unw_state_record *sr)
-{
-	sr->spill_offset = 0x10 - 4*pspoff;
-}
-
-static inline unsigned char *
-desc_spill_mask (unsigned char *imaskp, struct unw_state_record *sr)
-{
-	sr->imask = imaskp;
-	return imaskp + (2*sr->region_len + 7)/8;
-}
-
-/*
- * Body descriptors.
- */
-static inline void
-desc_epilogue (unw_word t, unw_word ecount, struct unw_state_record *sr)
-{
-	sr->epilogue_start = sr->region_start + sr->region_len - 1 - t;
-	sr->epilogue_count = ecount + 1;
-}
-
-static inline void
-desc_copy_state (unw_word label, struct unw_state_record *sr)
-{
-	struct unw_labeled_state *ls;
-
-	for (ls = sr->labeled_states; ls; ls = ls->next) {
-		if (ls->label == label) {
-			free_state_stack(&sr->curr);
-			memcpy(&sr->curr, &ls->saved_state, sizeof(sr->curr));
-			sr->curr.next = dup_state_stack(ls->saved_state.next);
-			return;
-		}
-	}
-	printk(KERN_ERR "unwind: failed to find state labeled 0x%lx\n", label);
-}
-
-static inline void
-desc_label_state (unw_word label, struct unw_state_record *sr)
-{
-	struct unw_labeled_state *ls;
-
-	ls = alloc_labeled_state();
-	if (!ls) {
-		printk(KERN_ERR "unwind.desc_label_state(): out of memory\n");
-		return;
-	}
-	ls->label = label;
-	memcpy(&ls->saved_state, &sr->curr, sizeof(ls->saved_state));
-	ls->saved_state.next = dup_state_stack(sr->curr.next);
-
-	/* insert into list of labeled states: */
-	ls->next = sr->labeled_states;
-	sr->labeled_states = ls;
-}
-
-/*
- * General descriptors.
- */
-
-static inline int
-desc_is_active (unsigned char qp, unw_word t, struct unw_state_record *sr)
-{
-	if (sr->when_target <= sr->region_start + min_t(int, t, sr->region_len - 1))
-		return 0;
-	if (qp > 0) {
-		if ((sr->pr_val & (1UL << qp)) == 0)
-			return 0;
-		sr->pr_mask |= (1UL << qp);
-	}
-	return 1;
-}
-
-static inline void
-desc_restore_p (unsigned char qp, unw_word t, unsigned char abreg, struct unw_state_record *sr)
-{
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	r = sr->curr.reg + decode_abreg(abreg, 0);
-	r->where = UNW_WHERE_NONE;
-	r->when = UNW_WHEN_NEVER;
-	r->val = 0;
-}
-
-static inline void
-desc_spill_reg_p (unsigned char qp, unw_word t, unsigned char abreg, unsigned char x,
-		     unsigned char ytreg, struct unw_state_record *sr)
-{
-	enum unw_where where = UNW_WHERE_GR;
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	if (x)
-		where = UNW_WHERE_BR;
-	else if (ytreg & 0x80)
-		where = UNW_WHERE_FR;
-
-	r = sr->curr.reg + decode_abreg(abreg, 0);
-	r->where = where;
-	r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-	r->val = (ytreg & 0x7f);
-}
-
-static inline void
-desc_spill_psprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word pspoff,
-		     struct unw_state_record *sr)
-{
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	r = sr->curr.reg + decode_abreg(abreg, 1);
-	r->where = UNW_WHERE_PSPREL;
-	r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-	r->val = 0x10 - 4*pspoff;
-}
-
-static inline void
-desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word spoff,
-		       struct unw_state_record *sr)
-{
-	struct unw_reg_info *r;
-
-	if (!desc_is_active(qp, t, sr))
-		return;
-
-	r = sr->curr.reg + decode_abreg(abreg, 1);
-	r->where = UNW_WHERE_SPREL;
-	r->when = sr->region_start + min_t(int, t, sr->region_len - 1);
-	r->val = 4*spoff;
-}
-
-#define UNW_DEC_BAD_CODE(code)			printk(KERN_ERR "unwind: unknown code 0x%02x\n", \
-						       code);
-
-/*
- * region headers:
- */
-#define UNW_DEC_PROLOGUE_GR(fmt,r,m,gr,arg)	desc_prologue(0,r,m,gr,arg)
-#define UNW_DEC_PROLOGUE(fmt,b,r,arg)		desc_prologue(b,r,0,32,arg)
-/*
- * prologue descriptors:
- */
-#define UNW_DEC_ABI(fmt,a,c,arg)		desc_abi(a,c,arg)
-#define UNW_DEC_BR_GR(fmt,b,g,arg)		desc_br_gr(b,g,arg)
-#define UNW_DEC_BR_MEM(fmt,b,arg)		desc_br_mem(b,arg)
-#define UNW_DEC_FRGR_MEM(fmt,g,f,arg)		desc_frgr_mem(g,f,arg)
-#define UNW_DEC_FR_MEM(fmt,f,arg)		desc_fr_mem(f,arg)
-#define UNW_DEC_GR_GR(fmt,m,g,arg)		desc_gr_gr(m,g,arg)
-#define UNW_DEC_GR_MEM(fmt,m,arg)		desc_gr_mem(m,arg)
-#define UNW_DEC_MEM_STACK_F(fmt,t,s,arg)	desc_mem_stack_f(t,s,arg)
-#define UNW_DEC_MEM_STACK_V(fmt,t,arg)		desc_mem_stack_v(t,arg)
-#define UNW_DEC_REG_GR(fmt,r,d,arg)		desc_reg_gr(r,d,arg)
-#define UNW_DEC_REG_PSPREL(fmt,r,o,arg)		desc_reg_psprel(r,o,arg)
-#define UNW_DEC_REG_SPREL(fmt,r,o,arg)		desc_reg_sprel(r,o,arg)
-#define UNW_DEC_REG_WHEN(fmt,r,t,arg)		desc_reg_when(r,t,arg)
-#define UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_GR,t,arg)
-#define UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_MEM,t,arg)
-#define UNW_DEC_PRIUNAT_GR(fmt,r,arg)		desc_reg_gr(UNW_REG_PRI_UNAT_GR,r,arg)
-#define UNW_DEC_PRIUNAT_PSPREL(fmt,o,arg)	desc_reg_psprel(UNW_REG_PRI_UNAT_MEM,o,arg)
-#define UNW_DEC_PRIUNAT_SPREL(fmt,o,arg)	desc_reg_sprel(UNW_REG_PRI_UNAT_MEM,o,arg)
-#define UNW_DEC_RP_BR(fmt,d,arg)		desc_rp_br(d,arg)
-#define UNW_DEC_SPILL_BASE(fmt,o,arg)		desc_spill_base(o,arg)
-#define UNW_DEC_SPILL_MASK(fmt,m,arg)		(m = desc_spill_mask(m,arg))
-/*
- * body descriptors:
- */
-#define UNW_DEC_EPILOGUE(fmt,t,c,arg)		desc_epilogue(t,c,arg)
-#define UNW_DEC_COPY_STATE(fmt,l,arg)		desc_copy_state(l,arg)
-#define UNW_DEC_LABEL_STATE(fmt,l,arg)		desc_label_state(l,arg)
-/*
- * general unwind descriptors:
- */
-#define UNW_DEC_SPILL_REG_P(f,p,t,a,x,y,arg)	desc_spill_reg_p(p,t,a,x,y,arg)
-#define UNW_DEC_SPILL_REG(f,t,a,x,y,arg)	desc_spill_reg_p(0,t,a,x,y,arg)
-#define UNW_DEC_SPILL_PSPREL_P(f,p,t,a,o,arg)	desc_spill_psprel_p(p,t,a,o,arg)
-#define UNW_DEC_SPILL_PSPREL(f,t,a,o,arg)	desc_spill_psprel_p(0,t,a,o,arg)
-#define UNW_DEC_SPILL_SPREL_P(f,p,t,a,o,arg)	desc_spill_sprel_p(p,t,a,o,arg)
-#define UNW_DEC_SPILL_SPREL(f,t,a,o,arg)	desc_spill_sprel_p(0,t,a,o,arg)
-#define UNW_DEC_RESTORE_P(f,p,t,a,arg)		desc_restore_p(p,t,a,arg)
-#define UNW_DEC_RESTORE(f,t,a,arg)		desc_restore_p(0,t,a,arg)
-
-#include "unwind_decoder.c"
-
-
-/* Unwind scripts. */
-
-static inline unw_hash_index_t
-hash (unsigned long ip)
-{
-	/* magic number = ((sqrt(5)-1)/2)*2^64 */
-	static const unsigned long hashmagic = 0x9e3779b97f4a7c16UL;
-
-	return (ip >> 4) * hashmagic >> (64 - UNW_LOG_HASH_SIZE);
-}
-
-static inline long
-cache_match (struct unw_script *script, unsigned long ip, unsigned long pr)
-{
-	read_lock(&script->lock);
-	if (ip == script->ip && ((pr ^ script->pr_val) & script->pr_mask) == 0)
-		/* keep the read lock... */
-		return 1;
-	read_unlock(&script->lock);
-	return 0;
-}
-
-static inline struct unw_script *
-script_lookup (struct unw_frame_info *info)
-{
-	struct unw_script *script = unw.cache + info->hint;
-	unsigned short index;
-	unsigned long ip, pr;
-
-	if (UNW_DEBUG_ON(0))
-		return NULL;	/* Always regenerate scripts in debug mode */
-
-	STAT(++unw.stat.cache.lookups);
-
-	ip = info->ip;
-	pr = info->pr;
-
-	if (cache_match(script, ip, pr)) {
-		STAT(++unw.stat.cache.hinted_hits);
-		return script;
-	}
-
-	index = unw.hash[hash(ip)];
-	if (index >= UNW_CACHE_SIZE)
-		return NULL;
-
-	script = unw.cache + index;
-	while (1) {
-		if (cache_match(script, ip, pr)) {
-			/* update hint; no locking required as single-word writes are atomic */
-			STAT(++unw.stat.cache.normal_hits);
-			unw.cache[info->prev_script].hint = script - unw.cache;
-			return script;
-		}
-		if (script->coll_chain >= UNW_HASH_SIZE)
-			return NULL;
-		script = unw.cache + script->coll_chain;
-		STAT(++unw.stat.cache.collision_chain_traversals);
-	}
-}
-
-/*
- * On returning, a write lock for the SCRIPT is still being held.
- */
-static inline struct unw_script *
-script_new (unsigned long ip)
-{
-	struct unw_script *script, *prev, *tmp;
-	unw_hash_index_t index;
-	unsigned short head;
-
-	STAT(++unw.stat.script.news);
-
-	/*
-	 * Can't (easily) use cmpxchg() here because of ABA problem
-	 * that is intrinsic in cmpxchg()...
-	 */
-	head = unw.lru_head;
-	script = unw.cache + head;
-	unw.lru_head = script->lru_chain;
-
-	/*
-	 * We'd deadlock here if we interrupted a thread that is holding a read lock on
-	 * script->lock.  Thus, if the write_trylock() fails, we simply bail out.  The
-	 * alternative would be to disable interrupts whenever we hold a read-lock, but
-	 * that seems silly.
-	 */
-	if (!write_trylock(&script->lock))
-		return NULL;
-
-	/* re-insert script at the tail of the LRU chain: */
-	unw.cache[unw.lru_tail].lru_chain = head;
-	unw.lru_tail = head;
-
-	/* remove the old script from the hash table (if it's there): */
-	if (script->ip) {
-		index = hash(script->ip);
-		tmp = unw.cache + unw.hash[index];
-		prev = NULL;
-		while (1) {
-			if (tmp == script) {
-				if (prev)
-					prev->coll_chain = tmp->coll_chain;
-				else
-					unw.hash[index] = tmp->coll_chain;
-				break;
-			} else
-				prev = tmp;
-			if (tmp->coll_chain >= UNW_CACHE_SIZE)
-			/* old script wasn't in the hash-table */
-				break;
-			tmp = unw.cache + tmp->coll_chain;
-		}
-	}
-
-	/* enter new script in the hash table */
-	index = hash(ip);
-	script->coll_chain = unw.hash[index];
-	unw.hash[index] = script - unw.cache;
-
-	script->ip = ip;	/* set new IP while we're holding the locks */
-
-	STAT(if (script->coll_chain < UNW_CACHE_SIZE) ++unw.stat.script.collisions);
-
-	script->flags = 0;
-	script->hint = 0;
-	script->count = 0;
-	return script;
-}
-
-static void
-script_finalize (struct unw_script *script, struct unw_state_record *sr)
-{
-	script->pr_mask = sr->pr_mask;
-	script->pr_val = sr->pr_val;
-	/*
-	 * We could down-grade our write-lock on script->lock here but
-	 * the rwlock API doesn't offer atomic lock downgrading, so
-	 * we'll just keep the write-lock and release it later when
-	 * we're done using the script.
-	 */
-}
-
-static inline void
-script_emit (struct unw_script *script, struct unw_insn insn)
-{
-	if (script->count >= UNW_MAX_SCRIPT_LEN) {
-		UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n",
-			__func__, UNW_MAX_SCRIPT_LEN);
-		return;
-	}
-	script->insn[script->count++] = insn;
-}
-
-static inline void
-emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script)
-{
-	struct unw_reg_info *r = sr->curr.reg + i;
-	enum unw_insn_opcode opc;
-	struct unw_insn insn;
-	unsigned long val = 0;
-
-	switch (r->where) {
-	      case UNW_WHERE_GR:
-		if (r->val >= 32) {
-			/* register got spilled to a stacked register */
-			opc = UNW_INSN_SETNAT_TYPE;
-			val = UNW_NAT_REGSTK;
-		} else
-			/* register got spilled to a scratch register */
-			opc = UNW_INSN_SETNAT_MEMSTK;
-		break;
-
-	      case UNW_WHERE_FR:
-		opc = UNW_INSN_SETNAT_TYPE;
-		val = UNW_NAT_VAL;
-		break;
-
-	      case UNW_WHERE_BR:
-		opc = UNW_INSN_SETNAT_TYPE;
-		val = UNW_NAT_NONE;
-		break;
-
-	      case UNW_WHERE_PSPREL:
-	      case UNW_WHERE_SPREL:
-		opc = UNW_INSN_SETNAT_MEMSTK;
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n",
-			   __func__, r->where);
-		return;
-	}
-	insn.opc = opc;
-	insn.dst = unw.preg_index[i];
-	insn.val = val;
-	script_emit(script, insn);
-}
-
-static void
-compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
-{
-	struct unw_reg_info *r = sr->curr.reg + i;
-	enum unw_insn_opcode opc;
-	unsigned long val, rval;
-	struct unw_insn insn;
-	long need_nat_info;
-
-	if (r->where == UNW_WHERE_NONE || r->when >= sr->when_target)
-		return;
-
-	opc = UNW_INSN_MOVE;
-	val = rval = r->val;
-	need_nat_info = (i >= UNW_REG_R4 && i <= UNW_REG_R7);
-
-	switch (r->where) {
-	      case UNW_WHERE_GR:
-		if (rval >= 32) {
-			opc = UNW_INSN_MOVE_STACKED;
-			val = rval - 32;
-		} else if (rval >= 4 && rval <= 7) {
-			if (need_nat_info) {
-				opc = UNW_INSN_MOVE2;
-				need_nat_info = 0;
-			}
-			val = unw.preg_index[UNW_REG_R4 + (rval - 4)];
-		} else if (rval == 0) {
-			opc = UNW_INSN_MOVE_CONST;
-			val = 0;
-		} else {
-			/* register got spilled to a scratch register */
-			opc = UNW_INSN_MOVE_SCRATCH;
-			val = pt_regs_off(rval);
-		}
-		break;
-
-	      case UNW_WHERE_FR:
-		if (rval <= 5)
-			val = unw.preg_index[UNW_REG_F2  + (rval -  2)];
-		else if (rval >= 16 && rval <= 31)
-			val = unw.preg_index[UNW_REG_F16 + (rval - 16)];
-		else {
-			opc = UNW_INSN_MOVE_SCRATCH;
-			if (rval <= 11)
-				val = offsetof(struct pt_regs, f6) + 16*(rval - 6);
-			else
-				UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n",
-					   __func__, rval);
-		}
-		break;
-
-	      case UNW_WHERE_BR:
-		if (rval >= 1 && rval <= 5)
-			val = unw.preg_index[UNW_REG_B1 + (rval - 1)];
-		else {
-			opc = UNW_INSN_MOVE_SCRATCH;
-			if (rval == 0)
-				val = offsetof(struct pt_regs, b0);
-			else if (rval == 6)
-				val = offsetof(struct pt_regs, b6);
-			else
-				val = offsetof(struct pt_regs, b7);
-		}
-		break;
-
-	      case UNW_WHERE_SPREL:
-		opc = UNW_INSN_ADD_SP;
-		break;
-
-	      case UNW_WHERE_PSPREL:
-		opc = UNW_INSN_ADD_PSP;
-		break;
-
-	      default:
-		UNW_DPRINT(0, "unwind%s: register %u has unexpected `where' value of %u\n",
-			   __func__, i, r->where);
-		break;
-	}
-	insn.opc = opc;
-	insn.dst = unw.preg_index[i];
-	insn.val = val;
-	script_emit(script, insn);
-	if (need_nat_info)
-		emit_nat_info(sr, i, script);
-
-	if (i == UNW_REG_PSP) {
-		/*
-		 * info->psp must contain the _value_ of the previous
-		 * sp, not it's save location.  We get this by
-		 * dereferencing the value we just stored in
-		 * info->psp:
-		 */
-		insn.opc = UNW_INSN_LOAD;
-		insn.dst = insn.val = unw.preg_index[UNW_REG_PSP];
-		script_emit(script, insn);
-	}
-}
-
-static inline const struct unw_table_entry *
-lookup (struct unw_table *table, unsigned long rel_ip)
-{
-	const struct unw_table_entry *e = NULL;
-	unsigned long lo, hi, mid;
-
-	/* do a binary search for right entry: */
-	for (lo = 0, hi = table->length; lo < hi; ) {
-		mid = (lo + hi) / 2;
-		e = &table->array[mid];
-		if (rel_ip < e->start_offset)
-			hi = mid;
-		else if (rel_ip >= e->end_offset)
-			lo = mid + 1;
-		else
-			break;
-	}
-	if (rel_ip < e->start_offset || rel_ip >= e->end_offset)
-		return NULL;
-	return e;
-}
-
-/*
- * Build an unwind script that unwinds from state OLD_STATE to the
- * entrypoint of the function that called OLD_STATE.
- */
-static inline struct unw_script *
-build_script (struct unw_frame_info *info)
-{
-	const struct unw_table_entry *e = NULL;
-	struct unw_script *script = NULL;
-	struct unw_labeled_state *ls, *next;
-	unsigned long ip = info->ip;
-	struct unw_state_record sr;
-	struct unw_table *table, *prev;
-	struct unw_reg_info *r;
-	struct unw_insn insn;
-	u8 *dp, *desc_end;
-	u64 hdr;
-	int i;
-	STAT(unsigned long start, parse_start;)
-
-	STAT(++unw.stat.script.builds; start = ia64_get_itc());
-
-	/* build state record */
-	memset(&sr, 0, sizeof(sr));
-	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
-		r->when = UNW_WHEN_NEVER;
-	sr.pr_val = info->pr;
-
-	UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __func__, ip);
-	script = script_new(ip);
-	if (!script) {
-		UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n",  __func__);
-		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
-		return NULL;
-	}
-	unw.cache[info->prev_script].hint = script - unw.cache;
-
-	/* search the kernels and the modules' unwind tables for IP: */
-
-	STAT(parse_start = ia64_get_itc());
-
-	prev = NULL;
-	for (table = unw.tables; table; table = table->next) {
-		if (ip >= table->start && ip < table->end) {
-			/*
-			 * Leave the kernel unwind table at the very front,
-			 * lest moving it breaks some assumption elsewhere.
-			 * Otherwise, move the matching table to the second
-			 * position in the list so that traversals can benefit
-			 * from commonality in backtrace paths.
-			 */
-			if (prev && prev != unw.tables) {
-				/* unw is safe - we're already spinlocked */
-				prev->next = table->next;
-				table->next = unw.tables->next;
-				unw.tables->next = table;
-			}
-			e = lookup(table, ip - table->segment_base);
-			break;
-		}
-		prev = table;
-	}
-	if (!e) {
-		/* no info, return default unwinder (leaf proc, no mem stack, no saved regs)  */
-		UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n",
-			__func__, ip, unw.cache[info->prev_script].ip);
-		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
-		sr.curr.reg[UNW_REG_RP].when = -1;
-		sr.curr.reg[UNW_REG_RP].val = 0;
-		compile_reg(&sr, UNW_REG_RP, script);
-		script_finalize(script, &sr);
-		STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
-		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
-		return script;
-	}
-
-	sr.when_target = (3*((ip & ~0xfUL) - (table->segment_base + e->start_offset))/16
-			  + (ip & 0xfUL));
-	hdr = *(u64 *) (table->segment_base + e->info_offset);
-	dp =   (u8 *)  (table->segment_base + e->info_offset + 8);
-	desc_end = dp + 8*UNW_LENGTH(hdr);
-
-	while (!sr.done && dp < desc_end)
-		dp = unw_decode(dp, sr.in_body, &sr);
-
-	if (sr.when_target > sr.epilogue_start) {
-		/*
-		 * sp has been restored and all values on the memory stack below
-		 * psp also have been restored.
-		 */
-		sr.curr.reg[UNW_REG_PSP].val = 0;
-		sr.curr.reg[UNW_REG_PSP].where = UNW_WHERE_NONE;
-		sr.curr.reg[UNW_REG_PSP].when = UNW_WHEN_NEVER;
-		for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
-			if ((r->where == UNW_WHERE_PSPREL && r->val <= 0x10)
-			    || r->where == UNW_WHERE_SPREL)
-			{
-				r->val = 0;
-				r->where = UNW_WHERE_NONE;
-				r->when = UNW_WHEN_NEVER;
-			}
-	}
-
-	script->flags = sr.flags;
-
-	/*
-	 * If RP did't get saved, generate entry for the return link
-	 * register.
-	 */
-	if (sr.curr.reg[UNW_REG_RP].when >= sr.when_target) {
-		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
-		sr.curr.reg[UNW_REG_RP].when = -1;
-		sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg;
-		UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n",
-			   __func__, ip, sr.curr.reg[UNW_REG_RP].where,
-			   sr.curr.reg[UNW_REG_RP].val);
-	}
-
-#ifdef UNW_DEBUG
-	UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n",
-		__func__, table->segment_base + e->start_offset, sr.when_target);
-	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) {
-		if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) {
-			UNW_DPRINT(1, "  %s <- ", unw.preg_name[r - sr.curr.reg]);
-			switch (r->where) {
-			      case UNW_WHERE_GR:     UNW_DPRINT(1, "r%lu", r->val); break;
-			      case UNW_WHERE_FR:     UNW_DPRINT(1, "f%lu", r->val); break;
-			      case UNW_WHERE_BR:     UNW_DPRINT(1, "b%lu", r->val); break;
-			      case UNW_WHERE_SPREL:  UNW_DPRINT(1, "[sp+0x%lx]", r->val); break;
-			      case UNW_WHERE_PSPREL: UNW_DPRINT(1, "[psp+0x%lx]", r->val); break;
-			      case UNW_WHERE_NONE:
-				UNW_DPRINT(1, "%s+0x%lx", unw.preg_name[r - sr.curr.reg], r->val);
-				break;
-
-			      default:
-				UNW_DPRINT(1, "BADWHERE(%d)", r->where);
-				break;
-			}
-			UNW_DPRINT(1, "\t\t%d\n", r->when);
-		}
-	}
-#endif
-
-	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
-
-	/* translate state record into unwinder instructions: */
-
-	/*
-	 * First, set psp if we're dealing with a fixed-size frame;
-	 * subsequent instructions may depend on this value.
-	 */
-	if (sr.when_target > sr.curr.reg[UNW_REG_PSP].when
-	    && (sr.curr.reg[UNW_REG_PSP].where == UNW_WHERE_NONE)
-	    && sr.curr.reg[UNW_REG_PSP].val != 0) {
-		/* new psp is sp plus frame size */
-		insn.opc = UNW_INSN_ADD;
-		insn.dst = offsetof(struct unw_frame_info, psp)/8;
-		insn.val = sr.curr.reg[UNW_REG_PSP].val;	/* frame size */
-		script_emit(script, insn);
-	}
-
-	/* determine where the primary UNaT is: */
-	if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
-		i = UNW_REG_PRI_UNAT_MEM;
-	else if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when)
-		i = UNW_REG_PRI_UNAT_GR;
-	else if (sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when > sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
-		i = UNW_REG_PRI_UNAT_MEM;
-	else
-		i = UNW_REG_PRI_UNAT_GR;
-
-	compile_reg(&sr, i, script);
-
-	for (i = UNW_REG_BSP; i < UNW_NUM_REGS; ++i)
-		compile_reg(&sr, i, script);
-
-	/* free labeled register states & stack: */
-
-	STAT(parse_start = ia64_get_itc());
-	for (ls = sr.labeled_states; ls; ls = next) {
-		next = ls->next;
-		free_state_stack(&ls->saved_state);
-		free_labeled_state(ls);
-	}
-	free_state_stack(&sr.curr);
-	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
-
-	script_finalize(script, &sr);
-	STAT(unw.stat.script.build_time += ia64_get_itc() - start);
-	return script;
-}
-
-/*
- * Apply the unwinding actions represented by OPS and update SR to
- * reflect the state that existed upon entry to the function that this
- * unwinder represents.
- */
-static inline void
-run_script (struct unw_script *script, struct unw_frame_info *state)
-{
-	struct unw_insn *ip, *limit, next_insn;
-	unsigned long opc, dst, val, off;
-	unsigned long *s = (unsigned long *) state;
-	STAT(unsigned long start;)
-
-	STAT(++unw.stat.script.runs; start = ia64_get_itc());
-	state->flags = script->flags;
-	ip = script->insn;
-	limit = script->insn + script->count;
-	next_insn = *ip;
-
-	while (ip++ < limit) {
-		opc = next_insn.opc;
-		dst = next_insn.dst;
-		val = next_insn.val;
-		next_insn = *ip;
-
-	  redo:
-		switch (opc) {
-		      case UNW_INSN_ADD:
-			s[dst] += val;
-			break;
-
-		      case UNW_INSN_MOVE2:
-			if (!s[val])
-				goto lazy_init;
-			s[dst+1] = s[val+1];
-			s[dst] = s[val];
-			break;
-
-		      case UNW_INSN_MOVE:
-			if (!s[val])
-				goto lazy_init;
-			s[dst] = s[val];
-			break;
-
-		      case UNW_INSN_MOVE_SCRATCH:
-			if (state->pt) {
-				s[dst] = (unsigned long) get_scratch_regs(state) + val;
-			} else {
-				s[dst] = 0;
-				UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n",
-					   __func__, dst, val);
-			}
-			break;
-
-		      case UNW_INSN_MOVE_CONST:
-			if (val == 0)
-				s[dst] = (unsigned long) &unw.r0;
-			else {
-				s[dst] = 0;
-				UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n",
-					   __func__, val);
-			}
-			break;
-
-
-		      case UNW_INSN_MOVE_STACKED:
-			s[dst] = (unsigned long) ia64_rse_skip_regs((unsigned long *)state->bsp,
-								    val);
-			break;
-
-		      case UNW_INSN_ADD_PSP:
-			s[dst] = state->psp + val;
-			break;
-
-		      case UNW_INSN_ADD_SP:
-			s[dst] = state->sp + val;
-			break;
-
-		      case UNW_INSN_SETNAT_MEMSTK:
-			if (!state->pri_unat_loc)
-				state->pri_unat_loc = &state->sw->caller_unat;
-			/* register off. is a multiple of 8, so the least 3 bits (type) are 0 */
-			s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK;
-			break;
-
-		      case UNW_INSN_SETNAT_TYPE:
-			s[dst+1] = val;
-			break;
-
-		      case UNW_INSN_LOAD:
-#ifdef UNW_DEBUG
-			if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0
-			    || s[val] < TASK_SIZE)
-			{
-				UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n",
-					   __func__, s[val]);
-				break;
-			}
-#endif
-			s[dst] = *(unsigned long *) s[val];
-			break;
-		}
-	}
-	STAT(unw.stat.script.run_time += ia64_get_itc() - start);
-	return;
-
-  lazy_init:
-	off = unw.sw_off[val];
-	s[val] = (unsigned long) state->sw + off;
-	if (off >= offsetof(struct switch_stack, r4) && off <= offsetof(struct switch_stack, r7))
-		/*
-		 * We're initializing a general register: init NaT info, too.  Note that
-		 * the offset is a multiple of 8 which gives us the 3 bits needed for
-		 * the type field.
-		 */
-		s[val+1] = (offsetof(struct switch_stack, ar_unat) - off) | UNW_NAT_MEMSTK;
-	goto redo;
-}
-
-static int
-find_save_locs (struct unw_frame_info *info)
-{
-	int have_write_lock = 0;
-	struct unw_script *scr;
-	unsigned long flags = 0;
-
-	if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) {
-		/* don't let obviously bad addresses pollute the cache */
-		/* FIXME: should really be level 0 but it occurs too often. KAO */
-		UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __func__, info->ip);
-		info->rp_loc = NULL;
-		return -1;
-	}
-
-	scr = script_lookup(info);
-	if (!scr) {
-		spin_lock_irqsave(&unw.lock, flags);
-		scr = build_script(info);
-		if (!scr) {
-			spin_unlock_irqrestore(&unw.lock, flags);
-			UNW_DPRINT(0,
-				   "unwind.%s: failed to locate/build unwind script for ip %lx\n",
-				   __func__, info->ip);
-			return -1;
-		}
-		have_write_lock = 1;
-	}
-	info->hint = scr->hint;
-	info->prev_script = scr - unw.cache;
-
-	run_script(scr, info);
-
-	if (have_write_lock) {
-		write_unlock(&scr->lock);
-		spin_unlock_irqrestore(&unw.lock, flags);
-	} else
-		read_unlock(&scr->lock);
-	return 0;
-}
-
-static int
-unw_valid(const struct unw_frame_info *info, unsigned long* p)
-{
-	unsigned long loc = (unsigned long)p;
-	return (loc >= info->regstk.limit && loc < info->regstk.top) ||
-	       (loc >= info->memstk.top && loc < info->memstk.limit);
-}
-
-int
-unw_unwind (struct unw_frame_info *info)
-{
-	unsigned long prev_ip, prev_sp, prev_bsp;
-	unsigned long ip, pr, num_regs;
-	STAT(unsigned long start, flags;)
-	int retval;
-
-	STAT(local_irq_save(flags); ++unw.stat.api.unwinds; start = ia64_get_itc());
-
-	prev_ip = info->ip;
-	prev_sp = info->sp;
-	prev_bsp = info->bsp;
-
-	/* validate the return IP pointer */
-	if (!unw_valid(info, info->rp_loc)) {
-		/* FIXME: should really be level 0 but it occurs too often. KAO */
-		UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n",
-			   __func__, info->ip);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-	/* restore the ip */
-	ip = info->ip = *info->rp_loc;
-	if (ip < GATE_ADDR) {
-		UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __func__, ip);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	/* validate the previous stack frame pointer */
-	if (!unw_valid(info, info->pfs_loc)) {
-		UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __func__);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-	/* restore the cfm: */
-	info->cfm_loc = info->pfs_loc;
-
-	/* restore the bsp: */
-	pr = info->pr;
-	num_regs = 0;
-	if ((info->flags & UNW_FLAG_INTERRUPT_FRAME)) {
-		info->pt = info->sp + 16;
-		if ((pr & (1UL << PRED_NON_SYSCALL)) != 0)
-			num_regs = *info->cfm_loc & 0x7f;		/* size of frame */
-		info->pfs_loc =
-			(unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs));
-		UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __func__, info->pt);
-	} else
-		num_regs = (*info->cfm_loc >> 7) & 0x7f;	/* size of locals */
-	info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs);
-	if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) {
-		UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n",
-			__func__, info->bsp, info->regstk.limit, info->regstk.top);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	/* restore the sp: */
-	info->sp = info->psp;
-	if (info->sp < info->memstk.top || info->sp > info->memstk.limit) {
-		UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n",
-			__func__, info->sp, info->memstk.top, info->memstk.limit);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) {
-		UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n",
-			   __func__, ip);
-		STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-		return -1;
-	}
-
-	/* as we unwind, the saved ar.unat becomes the primary unat: */
-	info->pri_unat_loc = info->unat_loc;
-
-	/* finally, restore the predicates: */
-	unw_get_pr(info, &info->pr);
-
-	retval = find_save_locs(info);
-	STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags));
-	return retval;
-}
-EXPORT_SYMBOL(unw_unwind);
-
-int
-unw_unwind_to_user (struct unw_frame_info *info)
-{
-	unsigned long ip, sp, pr = info->pr;
-
-	do {
-		unw_get_sp(info, &sp);
-		if ((long)((unsigned long)info->task + IA64_STK_OFFSET - sp)
-		    < IA64_PT_REGS_SIZE) {
-			UNW_DPRINT(0, "unwind.%s: ran off the top of the kernel stack\n",
-				   __func__);
-			break;
-		}
-		if (unw_is_intr_frame(info) &&
-		    (pr & (1UL << PRED_USER_STACK)))
-			return 0;
-		if (unw_get_pr (info, &pr) < 0) {
-			unw_get_rp(info, &ip);
-			UNW_DPRINT(0, "unwind.%s: failed to read "
-				   "predicate register (ip=0x%lx)\n",
-				__func__, ip);
-			return -1;
-		}
-	} while (unw_unwind(info) >= 0);
-	unw_get_ip(info, &ip);
-	UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n",
-		   __func__, ip);
-	return -1;
-}
-EXPORT_SYMBOL(unw_unwind_to_user);
-
-static void
-init_frame_info (struct unw_frame_info *info, struct task_struct *t,
-		 struct switch_stack *sw, unsigned long stktop)
-{
-	unsigned long rbslimit, rbstop, stklimit;
-	STAT(unsigned long start, flags;)
-
-	STAT(local_irq_save(flags); ++unw.stat.api.inits; start = ia64_get_itc());
-
-	/*
-	 * Subtle stuff here: we _could_ unwind through the switch_stack frame but we
-	 * don't want to do that because it would be slow as each preserved register would
-	 * have to be processed.  Instead, what we do here is zero out the frame info and
-	 * start the unwind process at the function that created the switch_stack frame.
-	 * When a preserved value in switch_stack needs to be accessed, run_script() will
-	 * initialize the appropriate pointer on demand.
-	 */
-	memset(info, 0, sizeof(*info));
-
-	rbslimit = (unsigned long) t + IA64_RBS_OFFSET;
-	stklimit = (unsigned long) t + IA64_STK_OFFSET;
-
-	rbstop   = sw->ar_bspstore;
-	if (rbstop > stklimit || rbstop < rbslimit)
-		rbstop = rbslimit;
-
-	if (stktop <= rbstop)
-		stktop = rbstop;
-	if (stktop > stklimit)
-		stktop = stklimit;
-
-	info->regstk.limit = rbslimit;
-	info->regstk.top   = rbstop;
-	info->memstk.limit = stklimit;
-	info->memstk.top   = stktop;
-	info->task = t;
-	info->sw  = sw;
-	info->sp = info->psp = stktop;
-	info->pr = sw->pr;
-	UNW_DPRINT(3, "unwind.%s:\n"
-		   "  task   0x%lx\n"
-		   "  rbs = [0x%lx-0x%lx)\n"
-		   "  stk = [0x%lx-0x%lx)\n"
-		   "  pr     0x%lx\n"
-		   "  sw     0x%lx\n"
-		   "  sp     0x%lx\n",
-		   __func__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit,
-		   info->pr, (unsigned long) info->sw, info->sp);
-	STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags));
-}
-
-void
-unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw)
-{
-	unsigned long sol;
-
-	init_frame_info(info, t, sw, (unsigned long) (sw + 1) - 16);
-	info->cfm_loc = &sw->ar_pfs;
-	sol = (*info->cfm_loc >> 7) & 0x7f;
-	info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sol);
-	info->ip = sw->b0;
-	UNW_DPRINT(3, "unwind.%s:\n"
-		   "  bsp    0x%lx\n"
-		   "  sol    0x%lx\n"
-		   "  ip     0x%lx\n",
-		   __func__, info->bsp, sol, info->ip);
-	find_save_locs(info);
-}
-
-EXPORT_SYMBOL(unw_init_frame_info);
-
-void
-unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t)
-{
-	struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16);
-
-	UNW_DPRINT(1, "unwind.%s\n", __func__);
-	unw_init_frame_info(info, t, sw);
-}
-EXPORT_SYMBOL(unw_init_from_blocked_task);
-
-static void
-init_unwind_table (struct unw_table *table, const char *name, unsigned long segment_base,
-		   unsigned long gp, const void *table_start, const void *table_end)
-{
-	const struct unw_table_entry *start = table_start, *end = table_end;
-
-	table->name = name;
-	table->segment_base = segment_base;
-	table->gp = gp;
-	table->start = segment_base + start[0].start_offset;
-	table->end = segment_base + end[-1].end_offset;
-	table->array = start;
-	table->length = end - start;
-}
-
-void *
-unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp,
-		      const void *table_start, const void *table_end)
-{
-	const struct unw_table_entry *start = table_start, *end = table_end;
-	struct unw_table *table;
-	unsigned long flags;
-
-	if (end - start <= 0) {
-		UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n",
-			   __func__);
-		return NULL;
-	}
-
-	table = kmalloc(sizeof(*table), GFP_USER);
-	if (!table)
-		return NULL;
-
-	init_unwind_table(table, name, segment_base, gp, table_start, table_end);
-
-	spin_lock_irqsave(&unw.lock, flags);
-	{
-		/* keep kernel unwind table at the front (it's searched most commonly): */
-		table->next = unw.tables->next;
-		unw.tables->next = table;
-	}
-	spin_unlock_irqrestore(&unw.lock, flags);
-
-	return table;
-}
-
-void
-unw_remove_unwind_table (void *handle)
-{
-	struct unw_table *table, *prev;
-	struct unw_script *tmp;
-	unsigned long flags;
-	long index;
-
-	if (!handle) {
-		UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n",
-			   __func__);
-		return;
-	}
-
-	table = handle;
-	if (table == &unw.kernel_table) {
-		UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a "
-			   "no-can-do!\n", __func__);
-		return;
-	}
-
-	spin_lock_irqsave(&unw.lock, flags);
-	{
-		/* first, delete the table: */
-
-		for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next)
-			if (prev->next == table)
-				break;
-		if (!prev) {
-			UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n",
-				   __func__, (void *) table);
-			spin_unlock_irqrestore(&unw.lock, flags);
-			return;
-		}
-		prev->next = table->next;
-	}
-	spin_unlock_irqrestore(&unw.lock, flags);
-
-	/* next, remove hash table entries for this table */
-
-	for (index = 0; index < UNW_HASH_SIZE; ++index) {
-		tmp = unw.cache + unw.hash[index];
-		if (unw.hash[index] >= UNW_CACHE_SIZE
-		    || tmp->ip < table->start || tmp->ip >= table->end)
-			continue;
-
-		write_lock(&tmp->lock);
-		{
-			if (tmp->ip >= table->start && tmp->ip < table->end) {
-				unw.hash[index] = tmp->coll_chain;
-				tmp->ip = 0;
-			}
-		}
-		write_unlock(&tmp->lock);
-	}
-
-	kfree(table);
-}
-
-static int __init
-create_gate_table (void)
-{
-	const struct unw_table_entry *entry, *start, *end;
-	unsigned long *lp, segbase = GATE_ADDR;
-	size_t info_size, size;
-	char *info;
-	Elf64_Phdr *punw = NULL, *phdr = (Elf64_Phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
-	int i;
-
-	for (i = 0; i < GATE_EHDR->e_phnum; ++i, ++phdr)
-		if (phdr->p_type == PT_IA_64_UNWIND) {
-			punw = phdr;
-			break;
-		}
-
-	if (!punw) {
-		printk("%s: failed to find gate DSO's unwind table!\n", __func__);
-		return 0;
-	}
-
-	start = (const struct unw_table_entry *) punw->p_vaddr;
-	end = (struct unw_table_entry *) ((char *) start + punw->p_memsz);
-	size  = 0;
-
-	unw_add_unwind_table("linux-gate.so", segbase, 0, start, end);
-
-	for (entry = start; entry < end; ++entry)
-		size += 3*8 + 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
-	size += 8;	/* reserve space for "end of table" marker */
-
-	unw.gate_table = kmalloc(size, GFP_KERNEL);
-	if (!unw.gate_table) {
-		unw.gate_table_size = 0;
-		printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __func__);
-		return 0;
-	}
-	unw.gate_table_size = size;
-
-	lp = unw.gate_table;
-	info = (char *) unw.gate_table + size;
-
-	for (entry = start; entry < end; ++entry, lp += 3) {
-		info_size = 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
-		info -= info_size;
-		memcpy(info, (char *) segbase + entry->info_offset, info_size);
-
-		lp[0] = segbase + entry->start_offset;		/* start */
-		lp[1] = segbase + entry->end_offset;		/* end */
-		lp[2] = info - (char *) unw.gate_table;		/* info */
-	}
-	*lp = 0;	/* end-of-table marker */
-	return 0;
-}
-
-__initcall(create_gate_table);
-
-void __init
-unw_init (void)
-{
-	extern char __gp[];
-	extern void unw_hash_index_t_is_too_narrow (void);
-	long i, off;
-
-	if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE)
-		unw_hash_index_t_is_too_narrow();
-
-	unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(CALLER_UNAT);
-	unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE);
-	unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS);
-	unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0);
-	unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(CALLER_UNAT);
-	unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR);
-	unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC);
-	unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR);
-	for (i = UNW_REG_R4, off = SW(R4); i <= UNW_REG_R7; ++i, off += 8)
-		unw.sw_off[unw.preg_index[i]] = off;
-	for (i = UNW_REG_B1, off = SW(B1); i <= UNW_REG_B5; ++i, off += 8)
-		unw.sw_off[unw.preg_index[i]] = off;
-	for (i = UNW_REG_F2, off = SW(F2); i <= UNW_REG_F5; ++i, off += 16)
-		unw.sw_off[unw.preg_index[i]] = off;
-	for (i = UNW_REG_F16, off = SW(F16); i <= UNW_REG_F31; ++i, off += 16)
-		unw.sw_off[unw.preg_index[i]] = off;
-
-	for (i = 0; i < UNW_CACHE_SIZE; ++i) {
-		if (i > 0)
-			unw.cache[i].lru_chain = (i - 1);
-		unw.cache[i].coll_chain = -1;
-		rwlock_init(&unw.cache[i].lock);
-	}
-	unw.lru_head = UNW_CACHE_SIZE - 1;
-	unw.lru_tail = 0;
-
-	init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp,
-			  __start_unwind, __end_unwind);
-}
-
-/*
- * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED
- *
- *	This system call has been deprecated.  The new and improved way to get
- *	at the kernel's unwind info is via the gate DSO.  The address of the
- *	ELF header for this DSO is passed to user-level via AT_SYSINFO_EHDR.
- *
- * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED
- *
- * This system call copies the unwind data into the buffer pointed to by BUF and returns
- * the size of the unwind data.  If BUF_SIZE is smaller than the size of the unwind data
- * or if BUF is NULL, nothing is copied, but the system call still returns the size of the
- * unwind data.
- *
- * The first portion of the unwind data contains an unwind table and rest contains the
- * associated unwind info (in no particular order).  The unwind table consists of a table
- * of entries of the form:
- *
- *	u64 start;	(64-bit address of start of function)
- *	u64 end;	(64-bit address of start of function)
- *	u64 info;	(BUF-relative offset to unwind info)
- *
- * The end of the unwind table is indicated by an entry with a START address of zero.
- *
- * Please see the IA-64 Software Conventions and Runtime Architecture manual for details
- * on the format of the unwind info.
- *
- * ERRORS
- *	EFAULT	BUF points outside your accessible address space.
- */
-asmlinkage long
-sys_getunwind (void __user *buf, size_t buf_size)
-{
-	if (buf && buf_size >= unw.gate_table_size)
-		if (copy_to_user(buf, unw.gate_table, unw.gate_table_size) != 0)
-			return -EFAULT;
-	return unw.gate_table_size;
-}
diff --git a/arch/ia64/kernel/unwind_decoder.c b/arch/ia64/kernel/unwind_decoder.c
deleted file mode 100644
index 83f54f7929b5..000000000000
--- a/arch/ia64/kernel/unwind_decoder.c
+++ /dev/null
@@ -1,460 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Generic IA-64 unwind info decoder.
- *
- * This file is used both by the Linux kernel and objdump.  Please keep
- * the two copies of this file in sync.
- *
- * You need to customize the decoder by defining the following
- * macros/constants before including this file:
- *
- *  Types:
- *	unw_word	Unsigned integer type with at least 64 bits 
- *
- *  Register names:
- *	UNW_REG_BSP
- *	UNW_REG_BSPSTORE
- *	UNW_REG_FPSR
- *	UNW_REG_LC
- *	UNW_REG_PFS
- *	UNW_REG_PR
- *	UNW_REG_RNAT
- *	UNW_REG_PSP
- *	UNW_REG_RP
- *	UNW_REG_UNAT
- *
- *  Decoder action macros:
- *	UNW_DEC_BAD_CODE(code)
- *	UNW_DEC_ABI(fmt,abi,context,arg)
- *	UNW_DEC_BR_GR(fmt,brmask,gr,arg)
- *	UNW_DEC_BR_MEM(fmt,brmask,arg)
- *	UNW_DEC_COPY_STATE(fmt,label,arg)
- *	UNW_DEC_EPILOGUE(fmt,t,ecount,arg)
- *	UNW_DEC_FRGR_MEM(fmt,grmask,frmask,arg)
- *	UNW_DEC_FR_MEM(fmt,frmask,arg)
- *	UNW_DEC_GR_GR(fmt,grmask,gr,arg)
- *	UNW_DEC_GR_MEM(fmt,grmask,arg)
- *	UNW_DEC_LABEL_STATE(fmt,label,arg)
- *	UNW_DEC_MEM_STACK_F(fmt,t,size,arg)
- *	UNW_DEC_MEM_STACK_V(fmt,t,arg)
- *	UNW_DEC_PRIUNAT_GR(fmt,r,arg)
- *	UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)
- *	UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)
- *	UNW_DEC_PRIUNAT_WHEN_PSPREL(fmt,pspoff,arg)
- *	UNW_DEC_PRIUNAT_WHEN_SPREL(fmt,spoff,arg)
- *	UNW_DEC_PROLOGUE(fmt,body,rlen,arg)
- *	UNW_DEC_PROLOGUE_GR(fmt,rlen,mask,grsave,arg)
- *	UNW_DEC_REG_PSPREL(fmt,reg,pspoff,arg)
- *	UNW_DEC_REG_REG(fmt,src,dst,arg)
- *	UNW_DEC_REG_SPREL(fmt,reg,spoff,arg)
- *	UNW_DEC_REG_WHEN(fmt,reg,t,arg)
- *	UNW_DEC_RESTORE(fmt,t,abreg,arg)
- *	UNW_DEC_RESTORE_P(fmt,qp,t,abreg,arg)
- *	UNW_DEC_SPILL_BASE(fmt,pspoff,arg)
- *	UNW_DEC_SPILL_MASK(fmt,imaskp,arg)
- *	UNW_DEC_SPILL_PSPREL(fmt,t,abreg,pspoff,arg)
- *	UNW_DEC_SPILL_PSPREL_P(fmt,qp,t,abreg,pspoff,arg)
- *	UNW_DEC_SPILL_REG(fmt,t,abreg,x,ytreg,arg)
- *	UNW_DEC_SPILL_REG_P(fmt,qp,t,abreg,x,ytreg,arg)
- *	UNW_DEC_SPILL_SPREL(fmt,t,abreg,spoff,arg)
- *	UNW_DEC_SPILL_SPREL_P(fmt,qp,t,abreg,pspoff,arg)
- */
-
-static unw_word
-unw_decode_uleb128 (unsigned char **dpp)
-{
-  unsigned shift = 0;
-  unw_word byte, result = 0;
-  unsigned char *bp = *dpp;
-
-  while (1)
-    {
-      byte = *bp++;
-      result |= (byte & 0x7f) << shift;
-      if ((byte & 0x80) == 0)
-	break;
-      shift += 7;
-    }
-  *dpp = bp;
-  return result;
-}
-
-static unsigned char *
-unw_decode_x1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, abreg;
-  unw_word t, off;
-
-  byte1 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-  off = unw_decode_uleb128 (&dp);
-  abreg = (byte1 & 0x7f);
-  if (byte1 & 0x80)
-	  UNW_DEC_SPILL_SPREL(X1, t, abreg, off, arg);
-  else
-	  UNW_DEC_SPILL_PSPREL(X1, t, abreg, off, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_x2 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, byte2, abreg, x, ytreg;
-  unw_word t;
-
-  byte1 = *dp++; byte2 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-  abreg = (byte1 & 0x7f);
-  ytreg = byte2;
-  x = (byte1 >> 7) & 1;
-  if ((byte1 & 0x80) == 0 && ytreg == 0)
-    UNW_DEC_RESTORE(X2, t, abreg, arg);
-  else
-    UNW_DEC_SPILL_REG(X2, t, abreg, x, ytreg, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_x3 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, byte2, abreg, qp;
-  unw_word t, off;
-
-  byte1 = *dp++; byte2 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-  off = unw_decode_uleb128 (&dp);
-
-  qp = (byte1 & 0x3f);
-  abreg = (byte2 & 0x7f);
-
-  if (byte1 & 0x80)
-    UNW_DEC_SPILL_SPREL_P(X3, qp, t, abreg, off, arg);
-  else
-    UNW_DEC_SPILL_PSPREL_P(X3, qp, t, abreg, off, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_x4 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, byte2, byte3, qp, abreg, x, ytreg;
-  unw_word t;
-
-  byte1 = *dp++; byte2 = *dp++; byte3 = *dp++;
-  t = unw_decode_uleb128 (&dp);
-
-  qp = (byte1 & 0x3f);
-  abreg = (byte2 & 0x7f);
-  x = (byte2 >> 7) & 1;
-  ytreg = byte3;
-
-  if ((byte2 & 0x80) == 0 && byte3 == 0)
-    UNW_DEC_RESTORE_P(X4, qp, t, abreg, arg);
-  else
-    UNW_DEC_SPILL_REG_P(X4, qp, t, abreg, x, ytreg, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_r1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  int body = (code & 0x20) != 0;
-  unw_word rlen;
-
-  rlen = (code & 0x1f);
-  UNW_DEC_PROLOGUE(R1, body, rlen, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_r2 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char byte1, mask, grsave;
-  unw_word rlen;
-
-  byte1 = *dp++;
-
-  mask = ((code & 0x7) << 1) | ((byte1 >> 7) & 1);
-  grsave = (byte1 & 0x7f);
-  rlen = unw_decode_uleb128 (&dp);
-  UNW_DEC_PROLOGUE_GR(R2, rlen, mask, grsave, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_r3 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word rlen;
-
-  rlen = unw_decode_uleb128 (&dp);
-  UNW_DEC_PROLOGUE(R3, ((code & 0x3) == 1), rlen, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char brmask = (code & 0x1f);
-
-  UNW_DEC_BR_MEM(P1, brmask, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p2_p5 (unsigned char *dp, unsigned char code, void *arg)
-{
-  if ((code & 0x10) == 0)
-    {
-      unsigned char byte1 = *dp++;
-
-      UNW_DEC_BR_GR(P2, ((code & 0xf) << 1) | ((byte1 >> 7) & 1),
-		    (byte1 & 0x7f), arg);
-    }
-  else if ((code & 0x08) == 0)
-    {
-      unsigned char byte1 = *dp++, r, dst;
-
-      r = ((code & 0x7) << 1) | ((byte1 >> 7) & 1);
-      dst = (byte1 & 0x7f);
-      switch (r)
-	{
-	case 0: UNW_DEC_REG_GR(P3, UNW_REG_PSP, dst, arg); break;
-	case 1: UNW_DEC_REG_GR(P3, UNW_REG_RP, dst, arg); break;
-	case 2: UNW_DEC_REG_GR(P3, UNW_REG_PFS, dst, arg); break;
-	case 3: UNW_DEC_REG_GR(P3, UNW_REG_PR, dst, arg); break;
-	case 4: UNW_DEC_REG_GR(P3, UNW_REG_UNAT, dst, arg); break;
-	case 5: UNW_DEC_REG_GR(P3, UNW_REG_LC, dst, arg); break;
-	case 6: UNW_DEC_RP_BR(P3, dst, arg); break;
-	case 7: UNW_DEC_REG_GR(P3, UNW_REG_RNAT, dst, arg); break;
-	case 8: UNW_DEC_REG_GR(P3, UNW_REG_BSP, dst, arg); break;
-	case 9: UNW_DEC_REG_GR(P3, UNW_REG_BSPSTORE, dst, arg); break;
-	case 10: UNW_DEC_REG_GR(P3, UNW_REG_FPSR, dst, arg); break;
-	case 11: UNW_DEC_PRIUNAT_GR(P3, dst, arg); break;
-	default: UNW_DEC_BAD_CODE(r); break;
-	}
-    }
-  else if ((code & 0x7) == 0)
-    UNW_DEC_SPILL_MASK(P4, dp, arg);
-  else if ((code & 0x7) == 1)
-    {
-      unw_word grmask, frmask, byte1, byte2, byte3;
-
-      byte1 = *dp++; byte2 = *dp++; byte3 = *dp++;
-      grmask = ((byte1 >> 4) & 0xf);
-      frmask = ((byte1 & 0xf) << 16) | (byte2 << 8) | byte3;
-      UNW_DEC_FRGR_MEM(P5, grmask, frmask, arg);
-    }
-  else
-    UNW_DEC_BAD_CODE(code);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p6 (unsigned char *dp, unsigned char code, void *arg)
-{
-  int gregs = (code & 0x10) != 0;
-  unsigned char mask = (code & 0x0f);
-
-  if (gregs)
-    UNW_DEC_GR_MEM(P6, mask, arg);
-  else
-    UNW_DEC_FR_MEM(P6, mask, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_p7_p10 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unsigned char r, byte1, byte2;
-  unw_word t, size;
-
-  if ((code & 0x10) == 0)
-    {
-      r = (code & 0xf);
-      t = unw_decode_uleb128 (&dp);
-      switch (r)
-	{
-	case 0:
-	  size = unw_decode_uleb128 (&dp);
-	  UNW_DEC_MEM_STACK_F(P7, t, size, arg);
-	  break;
-
-	case 1: UNW_DEC_MEM_STACK_V(P7, t, arg); break;
-	case 2: UNW_DEC_SPILL_BASE(P7, t, arg); break;
-	case 3: UNW_DEC_REG_SPREL(P7, UNW_REG_PSP, t, arg); break;
-	case 4: UNW_DEC_REG_WHEN(P7, UNW_REG_RP, t, arg); break;
-	case 5: UNW_DEC_REG_PSPREL(P7, UNW_REG_RP, t, arg); break;
-	case 6: UNW_DEC_REG_WHEN(P7, UNW_REG_PFS, t, arg); break;
-	case 7: UNW_DEC_REG_PSPREL(P7, UNW_REG_PFS, t, arg); break;
-	case 8: UNW_DEC_REG_WHEN(P7, UNW_REG_PR, t, arg); break;
-	case 9: UNW_DEC_REG_PSPREL(P7, UNW_REG_PR, t, arg); break;
-	case 10: UNW_DEC_REG_WHEN(P7, UNW_REG_LC, t, arg); break;
-	case 11: UNW_DEC_REG_PSPREL(P7, UNW_REG_LC, t, arg); break;
-	case 12: UNW_DEC_REG_WHEN(P7, UNW_REG_UNAT, t, arg); break;
-	case 13: UNW_DEC_REG_PSPREL(P7, UNW_REG_UNAT, t, arg); break;
-	case 14: UNW_DEC_REG_WHEN(P7, UNW_REG_FPSR, t, arg); break;
-	case 15: UNW_DEC_REG_PSPREL(P7, UNW_REG_FPSR, t, arg); break;
-	default: UNW_DEC_BAD_CODE(r); break;
-	}
-    }
-  else
-    {
-      switch (code & 0xf)
-	{
-	case 0x0: /* p8 */
-	  {
-	    r = *dp++;
-	    t = unw_decode_uleb128 (&dp);
-	    switch (r)
-	      {
-	      case  1: UNW_DEC_REG_SPREL(P8, UNW_REG_RP, t, arg); break;
-	      case  2: UNW_DEC_REG_SPREL(P8, UNW_REG_PFS, t, arg); break;
-	      case  3: UNW_DEC_REG_SPREL(P8, UNW_REG_PR, t, arg); break;
-	      case  4: UNW_DEC_REG_SPREL(P8, UNW_REG_LC, t, arg); break;
-	      case  5: UNW_DEC_REG_SPREL(P8, UNW_REG_UNAT, t, arg); break;
-	      case  6: UNW_DEC_REG_SPREL(P8, UNW_REG_FPSR, t, arg); break;
-	      case  7: UNW_DEC_REG_WHEN(P8, UNW_REG_BSP, t, arg); break;
-	      case  8: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSP, t, arg); break;
-	      case  9: UNW_DEC_REG_SPREL(P8, UNW_REG_BSP, t, arg); break;
-	      case 10: UNW_DEC_REG_WHEN(P8, UNW_REG_BSPSTORE, t, arg); break;
-	      case 11: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSPSTORE, t, arg); break;
-	      case 12: UNW_DEC_REG_SPREL(P8, UNW_REG_BSPSTORE, t, arg); break;
-	      case 13: UNW_DEC_REG_WHEN(P8, UNW_REG_RNAT, t, arg); break;
-	      case 14: UNW_DEC_REG_PSPREL(P8, UNW_REG_RNAT, t, arg); break;
-	      case 15: UNW_DEC_REG_SPREL(P8, UNW_REG_RNAT, t, arg); break;
-	      case 16: UNW_DEC_PRIUNAT_WHEN_GR(P8, t, arg); break;
-	      case 17: UNW_DEC_PRIUNAT_PSPREL(P8, t, arg); break;
-	      case 18: UNW_DEC_PRIUNAT_SPREL(P8, t, arg); break;
-	      case 19: UNW_DEC_PRIUNAT_WHEN_MEM(P8, t, arg); break;
-	      default: UNW_DEC_BAD_CODE(r); break;
-	    }
-	  }
-	  break;
-
-	case 0x1:
-	  byte1 = *dp++; byte2 = *dp++;
-	  UNW_DEC_GR_GR(P9, (byte1 & 0xf), (byte2 & 0x7f), arg);
-	  break;
-
-	case 0xf: /* p10 */
-	  byte1 = *dp++; byte2 = *dp++;
-	  UNW_DEC_ABI(P10, byte1, byte2, arg);
-	  break;
-
-	case 0x9:
-	  return unw_decode_x1 (dp, code, arg);
-
-	case 0xa:
-	  return unw_decode_x2 (dp, code, arg);
-
-	case 0xb:
-	  return unw_decode_x3 (dp, code, arg);
-
-	case 0xc:
-	  return unw_decode_x4 (dp, code, arg);
-
-	default:
-	  UNW_DEC_BAD_CODE(code);
-	  break;
-	}
-    }
-  return dp;
-}
-
-static unsigned char *
-unw_decode_b1 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word label = (code & 0x1f);
-
-  if ((code & 0x20) != 0)
-    UNW_DEC_COPY_STATE(B1, label, arg);
-  else
-    UNW_DEC_LABEL_STATE(B1, label, arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_b2 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word t;
-
-  t = unw_decode_uleb128 (&dp);
-  UNW_DEC_EPILOGUE(B2, t, (code & 0x1f), arg);
-  return dp;
-}
-
-static unsigned char *
-unw_decode_b3_x4 (unsigned char *dp, unsigned char code, void *arg)
-{
-  unw_word t, ecount, label;
-
-  if ((code & 0x10) == 0)
-    {
-      t = unw_decode_uleb128 (&dp);
-      ecount = unw_decode_uleb128 (&dp);
-      UNW_DEC_EPILOGUE(B3, t, ecount, arg);
-    }
-  else if ((code & 0x07) == 0)
-    {
-      label = unw_decode_uleb128 (&dp);
-      if ((code & 0x08) != 0)
-	UNW_DEC_COPY_STATE(B4, label, arg);
-      else
-	UNW_DEC_LABEL_STATE(B4, label, arg);
-    }
-  else
-    switch (code & 0x7)
-      {
-      case 1: return unw_decode_x1 (dp, code, arg);
-      case 2: return unw_decode_x2 (dp, code, arg);
-      case 3: return unw_decode_x3 (dp, code, arg);
-      case 4: return unw_decode_x4 (dp, code, arg);
-      default: UNW_DEC_BAD_CODE(code); break;
-      }
-  return dp;
-}
-
-typedef unsigned char *(*unw_decoder) (unsigned char *, unsigned char, void *);
-
-static unw_decoder unw_decode_table[2][8] =
-{
-  /* prologue table: */
-  {
-    unw_decode_r1,	/* 0 */
-    unw_decode_r1,
-    unw_decode_r2,
-    unw_decode_r3,
-    unw_decode_p1,	/* 4 */
-    unw_decode_p2_p5,
-    unw_decode_p6,
-    unw_decode_p7_p10
-  },
-  {
-    unw_decode_r1,	/* 0 */
-    unw_decode_r1,
-    unw_decode_r2,
-    unw_decode_r3,
-    unw_decode_b1,	/* 4 */
-    unw_decode_b1,
-    unw_decode_b2,
-    unw_decode_b3_x4
-  }
-};
-
-/*
- * Decode one descriptor and return address of next descriptor.
- */
-static inline unsigned char *
-unw_decode (unsigned char *dp, int inside_body, void *arg)
-{
-  unw_decoder decoder;
-  unsigned char code;
-
-  code = *dp++;
-  decoder = unw_decode_table[inside_body][code >> 5];
-  dp = (*decoder) (dp, code, arg);
-  return dp;
-}
diff --git a/arch/ia64/kernel/unwind_i.h b/arch/ia64/kernel/unwind_i.h
deleted file mode 100644
index 1dd57ba44327..000000000000
--- a/arch/ia64/kernel/unwind_i.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * Kernel unwind support.
- */
-
-#define UNW_VER(x)		((x) >> 48)
-#define UNW_FLAG_MASK		0x0000ffff00000000
-#define UNW_FLAG_OSMASK		0x0000f00000000000
-#define UNW_FLAG_EHANDLER(x)	((x) & 0x0000000100000000L)
-#define UNW_FLAG_UHANDLER(x)	((x) & 0x0000000200000000L)
-#define UNW_LENGTH(x)		((x) & 0x00000000ffffffffL)
-
-enum unw_register_index {
-	/* primary unat: */
-	UNW_REG_PRI_UNAT_GR,
-	UNW_REG_PRI_UNAT_MEM,
-
-	/* register stack */
-	UNW_REG_BSP,					/* register stack pointer */
-	UNW_REG_BSPSTORE,
-	UNW_REG_PFS,					/* previous function state */
-	UNW_REG_RNAT,
-	/* memory stack */
-	UNW_REG_PSP,					/* previous memory stack pointer */
-	/* return pointer: */
-	UNW_REG_RP,
-
-	/* preserved registers: */
-	UNW_REG_R4, UNW_REG_R5, UNW_REG_R6, UNW_REG_R7,
-	UNW_REG_UNAT, UNW_REG_PR, UNW_REG_LC, UNW_REG_FPSR,
-	UNW_REG_B1, UNW_REG_B2, UNW_REG_B3, UNW_REG_B4, UNW_REG_B5,
-	UNW_REG_F2, UNW_REG_F3, UNW_REG_F4, UNW_REG_F5,
-	UNW_REG_F16, UNW_REG_F17, UNW_REG_F18, UNW_REG_F19,
-	UNW_REG_F20, UNW_REG_F21, UNW_REG_F22, UNW_REG_F23,
-	UNW_REG_F24, UNW_REG_F25, UNW_REG_F26, UNW_REG_F27,
-	UNW_REG_F28, UNW_REG_F29, UNW_REG_F30, UNW_REG_F31,
-	UNW_NUM_REGS
-};
-
-struct unw_info_block {
-	u64 header;
-	u64 desc[];		/* unwind descriptors */
-	/* personality routine and language-specific data follow behind descriptors */
-};
-
-struct unw_table {
-	struct unw_table *next;		/* must be first member! */
-	const char *name;
-	unsigned long gp;		/* global pointer for this load-module */
-	unsigned long segment_base;	/* base for offsets in the unwind table entries */
-	unsigned long start;
-	unsigned long end;
-	const struct unw_table_entry *array;
-	unsigned long length;
-};
-
-enum unw_where {
-	UNW_WHERE_NONE,			/* register isn't saved at all */
-	UNW_WHERE_GR,			/* register is saved in a general register */
-	UNW_WHERE_FR,			/* register is saved in a floating-point register */
-	UNW_WHERE_BR,			/* register is saved in a branch register */
-	UNW_WHERE_SPREL,		/* register is saved on memstack (sp-relative) */
-	UNW_WHERE_PSPREL,		/* register is saved on memstack (psp-relative) */
-	/*
-	 * At the end of each prologue these locations get resolved to
-	 * UNW_WHERE_PSPREL and UNW_WHERE_GR, respectively:
-	 */
-	UNW_WHERE_SPILL_HOME,		/* register is saved in its spill home */
-	UNW_WHERE_GR_SAVE		/* register is saved in next general register */
-};
-
-#define UNW_WHEN_NEVER	0x7fffffff
-
-struct unw_reg_info {
-	unsigned long val;		/* save location: register number or offset */
-	enum unw_where where;		/* where the register gets saved */
-	int when;			/* when the register gets saved */
-};
-
-struct unw_reg_state {
-	struct unw_reg_state *next;		/* next (outer) element on state stack */
-	struct unw_reg_info reg[UNW_NUM_REGS];	/* register save locations */
-};
-
-struct unw_labeled_state {
-	struct unw_labeled_state *next;		/* next labeled state (or NULL) */
-	unsigned long label;			/* label for this state */
-	struct unw_reg_state saved_state;
-};
-
-struct unw_state_record {
-	unsigned int first_region : 1;	/* is this the first region? */
-	unsigned int done : 1;		/* are we done scanning descriptors? */
-	unsigned int any_spills : 1;	/* got any register spills? */
-	unsigned int in_body : 1;	/* are we inside a body (as opposed to a prologue)? */
-	unsigned long flags;		/* see UNW_FLAG_* in unwind.h */
-
-	u8 *imask;			/* imask of spill_mask record or NULL */
-	unsigned long pr_val;		/* predicate values */
-	unsigned long pr_mask;		/* predicate mask */
-	long spill_offset;		/* psp-relative offset for spill base */
-	int region_start;
-	int region_len;
-	int epilogue_start;
-	int epilogue_count;
-	int when_target;
-
-	u8 gr_save_loc;			/* next general register to use for saving a register */
-	u8 return_link_reg;		/* branch register in which the return link is passed */
-
-	struct unw_labeled_state *labeled_states;	/* list of all labeled states */
-	struct unw_reg_state curr;	/* current state */
-};
-
-enum unw_nat_type {
-	UNW_NAT_NONE,		/* NaT not represented */
-	UNW_NAT_VAL,		/* NaT represented by NaT value (fp reg) */
-	UNW_NAT_MEMSTK,		/* NaT value is in unat word at offset OFF  */
-	UNW_NAT_REGSTK		/* NaT is in rnat */
-};
-
-enum unw_insn_opcode {
-	UNW_INSN_ADD,			/* s[dst] += val */
-	UNW_INSN_ADD_PSP,		/* s[dst] = (s.psp + val) */
-	UNW_INSN_ADD_SP,		/* s[dst] = (s.sp + val) */
-	UNW_INSN_MOVE,			/* s[dst] = s[val] */
-	UNW_INSN_MOVE2,			/* s[dst] = s[val]; s[dst+1] = s[val+1] */
-	UNW_INSN_MOVE_STACKED,		/* s[dst] = ia64_rse_skip(*s.bsp, val) */
-	UNW_INSN_SETNAT_MEMSTK,		/* s[dst+1].nat.type = MEMSTK;
-					   s[dst+1].nat.off = *s.pri_unat - s[dst] */
-	UNW_INSN_SETNAT_TYPE,		/* s[dst+1].nat.type = val */
-	UNW_INSN_LOAD,			/* s[dst] = *s[val] */
-	UNW_INSN_MOVE_SCRATCH,		/* s[dst] = scratch reg "val" */
-	UNW_INSN_MOVE_CONST,            /* s[dst] = constant reg "val" */
-};
-
-struct unw_insn {
-	unsigned int opc	:  4;
-	unsigned int dst	:  9;
-	signed int val		: 19;
-};
-
-/*
- * Preserved general static registers (r4-r7) give rise to two script
- * instructions; everything else yields at most one instruction; at
- * the end of the script, the psp gets popped, accounting for one more
- * instruction.
- */
-#define UNW_MAX_SCRIPT_LEN	(UNW_NUM_REGS + 5)
-
-struct unw_script {
-	unsigned long ip;		/* ip this script is for */
-	unsigned long pr_mask;		/* mask of predicates script depends on */
-	unsigned long pr_val;		/* predicate values this script is for */
-	rwlock_t lock;
-	unsigned int flags;		/* see UNW_FLAG_* in unwind.h */
-	unsigned short lru_chain;	/* used for least-recently-used chain */
-	unsigned short coll_chain;	/* used for hash collisions */
-	unsigned short hint;		/* hint for next script to try (or -1) */
-	unsigned short count;		/* number of instructions in script */
-	struct unw_insn insn[UNW_MAX_SCRIPT_LEN];
-};
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
deleted file mode 100644
index 53dfde161c8a..000000000000
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ /dev/null
@@ -1,224 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#include <linux/pgtable.h>
-#include <asm/cache.h>
-#include <asm/ptrace.h>
-#include <asm/thread_info.h>
-
-#define EMITS_PT_NOTE
-#define RO_EXCEPTION_TABLE_ALIGN	16
-
-#include <asm-generic/vmlinux.lds.h>
-
-OUTPUT_FORMAT("elf64-ia64-little")
-OUTPUT_ARCH(ia64)
-ENTRY(phys_start)
-jiffies = jiffies_64;
-
-PHDRS {
-	text   PT_LOAD;
-	percpu PT_LOAD;
-	data   PT_LOAD;
-	note   PT_NOTE;
-	unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */
-}
-
-SECTIONS {
-	/*
-	 * unwind exit sections must be discarded before
-	 * the rest of the sections get included.
-	 */
-	/DISCARD/ : {
-		*(.IA_64.unwind.exit.text)
-		*(.IA_64.unwind_info.exit.text)
-		*(.comment)
-		*(.note)
-	}
-
-	v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */
-	phys_start = _start - LOAD_OFFSET;
-
-	code : {
-	} :text
-	. = KERNEL_START;
-
-	_text = .;
-	_stext = .;
-
-	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		__start_ivt_text = .;
-		*(.text..ivt)
-		__end_ivt_text = .;
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		SOFTIRQENTRY_TEXT
-		*(.gnu.linkonce.t*)
-	}
-
-	.text2 : AT(ADDR(.text2) - LOAD_OFFSET)	{
-		*(.text2)
-	}
-
-#ifdef CONFIG_SMP
-	.text..lock : AT(ADDR(.text..lock) - LOAD_OFFSET) {
-		*(.text..lock)
-	}
-#endif
-	_etext = .;
-
-	/*
-	 * Read-only data
-	 */
-
-	/* MCA table */
-	. = ALIGN(16);
-	__mca_table : AT(ADDR(__mca_table) - LOAD_OFFSET) {
-		__start___mca_table = .;
-		*(__mca_table)
-		__stop___mca_table = .;
-	}
-
-	.data..patch.phys_stack_reg : AT(ADDR(.data..patch.phys_stack_reg) - LOAD_OFFSET) {
-		__start___phys_stack_reg_patchlist = .;
-		*(.data..patch.phys_stack_reg)
-		__end___phys_stack_reg_patchlist = .;
-	}
-
-	/*
-	 * Global data
-	 */
-	_data = .;
-
-	/* Unwind info & table: */
-	. = ALIGN(8);
-	.IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) {
-		*(.IA_64.unwind_info*)
-	}
-	.IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) {
-		__start_unwind = .;
-		*(.IA_64.unwind*)
-		__end_unwind = .;
-	} :text :unwind
-	code_continues2 : {
-	} :text
-
-	RO_DATA(4096)
-
-	.opd : AT(ADDR(.opd) - LOAD_OFFSET) {
-		__start_opd = .;
-		*(.opd)
-		__end_opd = .;
-	}
-
-	/*
-	 * Initialization code and data:
-	 */
-	. = ALIGN(PAGE_SIZE);
-	__init_begin = .;
-
-	INIT_TEXT_SECTION(PAGE_SIZE)
-	INIT_DATA_SECTION(16)
-
-	.data..patch.vtop : AT(ADDR(.data..patch.vtop) - LOAD_OFFSET) {
-		__start___vtop_patchlist = .;
-		*(.data..patch.vtop)
-		__end___vtop_patchlist = .;
-	}
-
-	.data..patch.rse : AT(ADDR(.data..patch.rse) - LOAD_OFFSET) {
-		__start___rse_patchlist = .;
-		*(.data..patch.rse)
-		__end___rse_patchlist = .;
-	}
-
-	.data..patch.mckinley_e9 : AT(ADDR(.data..patch.mckinley_e9) - LOAD_OFFSET) {
-		__start___mckinley_e9_bundles = .;
-		*(.data..patch.mckinley_e9)
-		__end___mckinley_e9_bundles = .;
-	}
-
-#ifdef	CONFIG_SMP
-	. = ALIGN(PERCPU_PAGE_SIZE);
-	__cpu0_per_cpu = .;
-	. = . + PERCPU_PAGE_SIZE;   /* cpu0 per-cpu space */
-#endif
-
-	. = ALIGN(PAGE_SIZE);
-	__init_end = .;
-
-	.data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) {
-		PAGE_ALIGNED_DATA(PAGE_SIZE)
-		. = ALIGN(PAGE_SIZE);
-		__start_gate_section = .;
-		*(.data..gate)
-		__stop_gate_section = .;
-	}
-	/*
-	 * make sure the gate page doesn't expose
-	 * kernel data
-	 */
-	. = ALIGN(PAGE_SIZE);
-
-	/* Per-cpu data: */
-	. = ALIGN(PERCPU_PAGE_SIZE);
-	PERCPU_VADDR(SMP_CACHE_BYTES, PERCPU_ADDR, :percpu)
-	__phys_per_cpu_start = __per_cpu_load;
-	/*
-	 * ensure percpu data fits
-	 * into percpu page size
-	 */
-	. = __phys_per_cpu_start + PERCPU_PAGE_SIZE;
-
-	data : {
-	} :data
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-		_sdata  =  .;
-		INIT_TASK_DATA(PAGE_SIZE)
-		CACHELINE_ALIGNED_DATA(SMP_CACHE_BYTES)
-		READ_MOSTLY_DATA(SMP_CACHE_BYTES)
-		DATA_DATA
-		*(.data1)
-		*(.gnu.linkonce.d*)
-		CONSTRUCTORS
-	}
-
-	BUG_TABLE
-
-	. = ALIGN(16);	/* gp must be 16-byte aligned for exc. table */
-	.got : AT(ADDR(.got) - LOAD_OFFSET) {
-		*(.got.plt)
-		*(.got)
-	}
-	__gp = ADDR(.got) + 0x200000;
-
-	/*
-	 * We want the small data sections together,
-	 * so single-instruction offsets can access
-	 * them all, and initialized data all before
-	 * uninitialized, so we can shorten the
-	 * on-disk segment size.
-	 */
-	.sdata : AT(ADDR(.sdata) - LOAD_OFFSET) {
-		*(.sdata)
-		*(.sdata1)
-		*(.srdata)
-	}
-	_edata  =  .;
-
-	BSS_SECTION(0, 0, 0)
-
-	_end = .;
-
-	code : {
-	} :text
-
-	STABS_DEBUG
-	DWARF_DEBUG
-	ELF_DETAILS
-
-	/* Default discards */
-	DISCARDS
-}
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
deleted file mode 100644
index 081fcba01dc0..000000000000
--- a/arch/ia64/lib/Makefile
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for ia64-specific library routines..
-#
-
-lib-y := io.o __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o		\
-	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
-	checksum.o clear_page.o csum_partial_copy.o			\
-	clear_user.o strncpy_from_user.o strnlen_user.o			\
-	flush.o ip_fast_csum.o do_csum.o				\
-	memset.o strlen.o xor.o
-
-lib-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
-lib-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
-
-AFLAGS___divdi3.o	=
-AFLAGS___udivdi3.o	= -DUNSIGNED
-AFLAGS___moddi3.o	= 	     -DMODULO
-AFLAGS___umoddi3.o	= -DUNSIGNED -DMODULO
-
-AFLAGS___divsi3.o	=
-AFLAGS___udivsi3.o	= -DUNSIGNED
-AFLAGS___modsi3.o	=	     -DMODULO
-AFLAGS___umodsi3.o	= -DUNSIGNED -DMODULO
-
-$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
-
-$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
-	$(call if_changed_rule,as_o_S)
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
deleted file mode 100644
index d26517fe3500..000000000000
--- a/arch/ia64/lib/checksum.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
-	/* add up 32-bit words for 33 bits */
-	x = (x & 0xffffffff) + (x >> 32);
-	/* add up 16-bit and 17-bit words for 17+c bits */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up 16-bit and 2-bit for 16+c bit */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up carry.. */
-	x = (x & 0xffff) + (x >> 16);
-	return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-__sum16
-csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
-		  __u8 proto, __wsum sum)
-{
-	return (__force __sum16)~from64to16(
-		(__force u64)saddr + (__force u64)daddr +
-		(__force u64)sum + ((len + proto) << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-__wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
-		   __u8 proto, __wsum sum)
-{
-	unsigned long result;
-
-	result = (__force u64)saddr + (__force u64)daddr +
-		 (__force u64)sum + ((len + proto) << 8);
-
-	/* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
-	/* 64 to 33 */
-	result = (result & 0xffffffff) + (result >> 32);
-	/* 33 to 32 */
-	result = (result & 0xffffffff) + (result >> 32);
-	return (__force __wsum)result;
-}
-EXPORT_SYMBOL(csum_tcpudp_nofold);
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	u64 result = do_csum(buff, len);
-
-	/* add in old sum, and carry.. */
-	result += (__force u32)sum;
-	/* 32+c bits -> 32 bits */
-	result = (result & 0xffffffff) + (result >> 32);
-	return (__force __wsum)result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum (const void *buff, int len)
-{
-	return (__force __sum16)~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
deleted file mode 100644
index ba0dd2538fa5..000000000000
--- a/arch/ia64/lib/clear_page.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2002 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- *
- * 1/06/01 davidm	Tuned for Itanium.
- * 2/12/02 kchen	Tuned for both Itanium and McKinley
- * 3/08/02 davidm	Some more tweaking
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#ifdef CONFIG_ITANIUM
-# define L3_LINE_SIZE	64	// Itanium L3 line size
-# define PREFETCH_LINES	9	// magic number
-#else
-# define L3_LINE_SIZE	128	// McKinley L3 line size
-# define PREFETCH_LINES	12	// magic number
-#endif
-
-#define saved_lc	r2
-#define dst_fetch	r3
-#define dst1		r8
-#define dst2		r9
-#define dst3		r10
-#define dst4		r11
-
-#define dst_last	r31
-
-GLOBAL_ENTRY(clear_page)
-	.prologue
-	.regstk 1,0,0,0
-	mov r16 = PAGE_SIZE/L3_LINE_SIZE-1	// main loop count, -1=repeat/until
-	.save ar.lc, saved_lc
-	mov saved_lc = ar.lc
-
-	.body
-	mov ar.lc = (PREFETCH_LINES - 1)
-	mov dst_fetch = in0
-	adds dst1 = 16, in0
-	adds dst2 = 32, in0
-	;;
-.fetch:	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
-	adds dst3 = 48, in0		// executing this multiple times is harmless
-	br.cloop.sptk.few .fetch
-	;;
-	addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
-	mov ar.lc = r16			// one L3 line per iteration
-	adds dst4 = 64, in0
-	;;
-#ifdef CONFIG_ITANIUM
-	// Optimized for Itanium
-1:	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-	cmp.lt p8,p0=dst_fetch, dst_last
-	;;
-#else
-	// Optimized for McKinley
-1:	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-	stf.spill.nta [dst3] = f0, 64
-	stf.spill.nta [dst4] = f0, 128
-	cmp.lt p8,p0=dst_fetch, dst_last
-	;;
-	stf.spill.nta [dst1] = f0, 64
-	stf.spill.nta [dst2] = f0, 64
-#endif
-	stf.spill.nta [dst3] = f0, 64
-(p8)	stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
-	br.cloop.sptk.few 1b
-	;;
-	mov ar.lc = saved_lc		// restore lc
-	br.ret.sptk.many rp
-END(clear_page)
-EXPORT_SYMBOL(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
deleted file mode 100644
index 1d9e45ccf8e5..000000000000
--- a/arch/ia64/lib/clear_user.S
+++ /dev/null
@@ -1,212 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- *	in0:	address of buffer
- *	in1:	length of buffer in bytes
- * Outputs:
- *	r8:	number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf		r32
-#define len		r33
-
-//
-// local registers
-//
-#define cnt		r16
-#define buf2		r17
-#define saved_lc	r18
-#define saved_pfs	r19
-#define tmp		r20
-#define len2		r21
-#define len3		r22
-
-//
-// Theory of operations:
-//	- we check whether or not the buffer is small, i.e., less than 17
-//	  in which case we do the byte by byte loop.
-//
-//	- Otherwise we go progressively from 1 byte store to 8byte store in
-//	  the head part, the body is a 16byte store loop and we finish we the
-//	  tail for the last 15 bytes.
-//	  The good point about this breakdown is that the long buffer handling
-//	  contains only 2 branches.
-//
-//	The reason for not using shifting & masking for both the head and the
-//	tail is to stay semantically correct. This routine is not supposed
-//	to write bytes outside of the buffer. While most of the time this would
-//	be ok, we can't tolerate a mistake. A classical example is the case
-//	of multithreaded code were to the extra bytes touched is actually owned
-//	by another thread which runs concurrently to ours. Another, less likely,
-//	example is with device drivers where reading an I/O mapped location may
-//	have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,2,0,0,0
-	cmp.eq p6,p0=r0,len		// check for zero length
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc		// preserve ar.lc (slow)
-	.body
-	;;				// avoid WAW on CFM
-	adds tmp=-1,len			// br.ctop is repeat/until
-	mov ret0=len			// return value is length at this point
-(p6)	br.ret.spnt.many rp
-	;;
-	cmp.lt p6,p0=16,len		// if len > 16 then long memset
-	mov ar.lc=tmp			// initialize lc for small count
-(p6)	br.cond.dptk .long_do_clear
-	;;				// WAR on ar.lc
-	//
-	// worst case 16 iterations, avg 8 iterations
-	//
-	// We could have played with the predicates to use the extra
-	// M slot for 2 stores/iteration but the cost the initialization
-	// the various counters compared to how long the loop is supposed
-	// to last on average does not make this solution viable.
-	//
-1:
-	EX( .Lexit1, st1 [buf]=r0,1 )
-	adds len=-1,len			// countdown length using len
-	br.cloop.dptk 1b
-	;;				// avoid RAW on ar.lc
-	//
-	// .Lexit4: comes from byte by byte loop
-	//	    len contains bytes left
-.Lexit1:
-	mov ret0=len			// faster than using ar.lc
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp		// end of short clear_user
-
-
-	//
-	// At this point we know we have more than 16 bytes to copy
-	// so we focus on alignment (no branches required)
-	//
-	// The use of len/len2 for countdown of the number of bytes left
-	// instead of ret0 is due to the fact that the exception code
-	// changes the values of r8.
-	//
-.long_do_clear:
-	tbit.nz p6,p0=buf,0		// odd alignment (for long_do_clear)
-	;;
-	EX( .Lexit3, (p6) st1 [buf]=r0,1 )	// 1-byte aligned
-(p6)	adds len=-1,len;;		// sync because buf is modified
-	tbit.nz p6,p0=buf,1
-	;;
-	EX( .Lexit3, (p6) st2 [buf]=r0,2 )	// 2-byte aligned
-(p6)	adds len=-2,len;;
-	tbit.nz p6,p0=buf,2
-	;;
-	EX( .Lexit3, (p6) st4 [buf]=r0,4 )	// 4-byte aligned
-(p6)	adds len=-4,len;;
-	tbit.nz p6,p0=buf,3
-	;;
-	EX( .Lexit3, (p6) st8 [buf]=r0,8 )	// 8-byte aligned
-(p6)	adds len=-8,len;;
-	shr.u cnt=len,4		// number of 128-bit (2x64bit) words
-	;;
-	cmp.eq p6,p0=r0,cnt
-	adds tmp=-1,cnt
-(p6)	br.cond.dpnt .dotail		// we have less than 16 bytes left
-	;;
-	adds buf2=8,buf			// setup second base pointer
-	mov ar.lc=tmp
-	;;
-
-	//
-	// 16bytes/iteration core loop
-	//
-	// The second store can never generate a fault because
-	// we come into the loop only when we are 16-byte aligned.
-	// This means that if we cross a page then it will always be
-	// in the first store and never in the second.
-	//
-	//
-	// We need to keep track of the remaining length. A possible (optimistic)
-	// way would be to use ar.lc and derive how many byte were left by
-	// doing : left= 16*ar.lc + 16.  this would avoid the addition at
-	// every iteration.
-	// However we need to keep the synchronization point. A template
-	// M;;MB does not exist and thus we can keep the addition at no
-	// extra cycle cost (use a nop slot anyway). It also simplifies the
-	// (unlikely)  error recovery code
-	//
-
-2:	EX(.Lexit3, st8 [buf]=r0,16 )
-	;;				// needed to get len correct when error
-	st8 [buf2]=r0,16
-	adds len=-16,len
-	br.cloop.dptk 2b
-	;;
-	mov ar.lc=saved_lc
-	//
-	// tail correction based on len only
-	//
-	// We alternate the use of len3,len2 to allow parallelism and correct
-	// error handling. We also reuse p6/p7 to return correct value.
-	// The addition of len2/len3 does not cost anything more compared to
-	// the regular memset as we had empty slots.
-	//
-.dotail:
-	mov len2=len			// for parallelization of error handling
-	mov len3=len
-	tbit.nz p6,p0=len,3
-	;;
-	EX( .Lexit2, (p6) st8 [buf]=r0,8 )	// at least 8 bytes
-(p6)	adds len3=-8,len2
-	tbit.nz p7,p6=len,2
-	;;
-	EX( .Lexit2, (p7) st4 [buf]=r0,4 )	// at least 4 bytes
-(p7)	adds len2=-4,len3
-	tbit.nz p6,p7=len,1
-	;;
-	EX( .Lexit2, (p6) st2 [buf]=r0,2 )	// at least 2 bytes
-(p6)	adds len3=-2,len2
-	tbit.nz p7,p6=len,0
-	;;
-	EX( .Lexit2, (p7) st1 [buf]=r0 )	// only 1 byte left
-	mov ret0=r0				// success
-	br.ret.sptk.many rp			// end of most likely path
-
-	//
-	// Outlined error handling code
-	//
-
-	//
-	// .Lexit3: comes from core loop, need restore pr/lc
-	//	    len contains bytes left
-	//
-	//
-	// .Lexit2:
-	//	if p6 -> coming from st8 or st2 : len2 contains what's left
-	//	if p7 -> coming from st4 or st1 : len3 contains what's left
-	// We must restore lc/pr even though might not have been used.
-.Lexit2:
-	.pred.rel "mutex", p6, p7
-(p6)	mov len=len2
-(p7)	mov len=len3
-	;;
-	//
-	// .Lexit4: comes from head, need not restore pr/lc
-	//	    len contains bytes left
-	//
-.Lexit3:
-	mov ret0=len
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp
-END(__do_clear_user)
-EXPORT_SYMBOL(__do_clear_user)
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
deleted file mode 100644
index c0a0e6b2af00..000000000000
--- a/arch/ia64/lib/copy_page.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- *	in0:	address of target page
- *	in1:	address of source page
- * Output:
- *	no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * 4/06/01 davidm	Tuned to make it perform well both for cached and uncached copies.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH	3
-#define EPI		p[PIPE_DEPTH-1]
-
-#define lcount		r16
-#define saved_pr	r17
-#define saved_lc	r18
-#define saved_pfs	r19
-#define src1		r20
-#define src2		r21
-#define tgt1		r22
-#define tgt2		r23
-#define srcf		r24
-#define tgtf		r25
-#define tgt_last	r26
-
-#define Nrot		((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
-	.rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
-	      t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
-	.rotp p[PIPE_DEPTH]
-
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc
-	mov ar.ec=PIPE_DEPTH
-
-	mov lcount=PAGE_SIZE/64-1
-	.save pr, saved_pr
-	mov saved_pr=pr
-	mov pr.rot=1<<16
-
-	.body
-
-	mov src1=in1
-	adds src2=8,in1
-	mov tgt_last = PAGE_SIZE
-	;;
-	adds tgt2=8,in0
-	add srcf=512,in1
-	mov ar.lc=lcount
-	mov tgt1=in0
-	add tgtf=512,in0
-	add tgt_last = tgt_last, in0
-	;;
-1:
-(p[0])	ld8 t1[0]=[src1],16
-(EPI)	st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0])	ld8 t2[0]=[src2],16
-(EPI)	st8 [tgt2]=t2[PIPE_DEPTH-1],16
-	cmp.ltu p6,p0 = tgtf, tgt_last
-	;;
-(p[0])	ld8 t3[0]=[src1],16
-(EPI)	st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0])	ld8 t4[0]=[src2],16
-(EPI)	st8 [tgt2]=t4[PIPE_DEPTH-1],16
-	;;
-(p[0])	ld8 t5[0]=[src1],16
-(EPI)	st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0])	ld8 t6[0]=[src2],16
-(EPI)	st8 [tgt2]=t6[PIPE_DEPTH-1],16
-	;;
-(p[0])	ld8 t7[0]=[src1],16
-(EPI)	st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0])	ld8 t8[0]=[src2],16
-(EPI)	st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6)	lfetch [srcf], 64
-(p6)	lfetch [tgtf], 64
-	br.ctop.sptk.few 1b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000	// restore predicates
-	mov ar.pfs=saved_pfs
-	mov ar.lc=saved_lc
-	br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
deleted file mode 100644
index 5e8bb4b4b535..000000000000
--- a/arch/ia64/lib/copy_page_mck.S
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * McKinley-optimized version of copy_page().
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- *	David Mosberger <davidm@hpl.hp.com>
- *
- * Inputs:
- *	in0:	address of target page
- *	in1:	address of source page
- * Output:
- *	no return value
- *
- * General idea:
- *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
- *	  lfetches => good for in-cache performance
- *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
- *	  cycle
- *
- * Principle of operation:
- *	First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- *	To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
- *	source line is in L1, we start copying the remaining words.  The second half of the
- *	source line is prefetched in an earlier iteration, so that by the time we start
- *	accessing it, it's also present in the L1.
- *
- *	We use a software-pipelined loop to control the overall operation.  The pipeline
- *	has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
- *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
- *	cache-lines, the last K stages are used to copy the cache-line words not copied by
- *	the prefetches.  The four relevant points in the pipelined are called A, B, C, D:
- *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- *	should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
- *	into L1D and p[D] is TRUE if a cacheline needs to be copied.
- *
- *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
- *	the resulting code is very regular and quite easy to follow (once you get the idea).
- *
- *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
- *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
- *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
- *	so that each loop iteration is faster (again, good for cached case).
- *
- *	When reading the code, it helps to keep the following picture in mind:
- *
- *	       word 0 word 1
- *            +------+------+---
- *	      |	v[x] | 	t1  | ^
- *	      |	t2   |	t3  | |
- *	      |	t4   |	t5  | |
- *	      |	t6   |	t7  | | 128 bytes
- *     	      |	n[y] | 	t9  | |	(L2 cache line)
- *	      |	t10  | 	t11 | |
- *	      |	t12  | 	t13 | |
- *	      |	t14  | 	t15 | v
- *	      +------+------+---
- *
- *	Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
- *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
- *	an order that avoids bank conflicts.
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
-
-#define src0		r2
-#define src1		r3
-#define dst0		r9
-#define dst1		r10
-#define src_pre_mem	r11
-#define dst_pre_mem	r14
-#define src_pre_l2	r15
-#define dst_pre_l2	r16
-#define t1		r17
-#define t2		r18
-#define t3		r19
-#define t4		r20
-#define t5		t1	// alias!
-#define t6		t2	// alias!
-#define t7		t3	// alias!
-#define t9		t5	// alias!
-#define t10		t4	// alias!
-#define t11		t7	// alias!
-#define t12		t6	// alias!
-#define t14		t10	// alias!
-#define t13		r21
-#define t15		r22
-
-#define saved_lc	r23
-#define saved_pr	r24
-
-#define	A	0
-#define B	(PREFETCH_DIST)
-#define C	(B + PREFETCH_DIST)
-#define D	(C + 3)
-#define N	(D + 1)
-#define Nrot	((N + 7) & ~7)
-
-GLOBAL_ENTRY(copy_page)
-	.prologue
-	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-
-	.rotr v[2*PREFETCH_DIST], n[D-C+1]
-	.rotp p[N]
-
-	.save ar.lc, saved_lc
-	mov saved_lc = ar.lc
-	.save pr, saved_pr
-	mov saved_pr = pr
-	.body
-
-	mov src_pre_mem = in1
-	mov pr.rot = 0x10000
-	mov ar.ec = 1				// special unrolled loop
-
-	mov dst_pre_mem = in0
-	mov ar.lc = 2*PREFETCH_DIST - 1
-
-	add src_pre_l2 = 8*8, in1
-	add dst_pre_l2 = 8*8, in0
-	add src0 = 8, in1			// first t1 src
-	add src1 = 3*8, in1			// first t3 src
-	add dst0 = 8, in0			// first t1 dst
-	add dst1 = 3*8, in0			// first t3 dst
-	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
-	nop.m 0
-	nop.i 0
-	;;
-	// same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
-(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
-	br.ctop.sptk .prefetch_loop
-	;;
-	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
-	mov ar.lc = t1				// with 64KB pages, t1 is too big to fit in 8 bits!
-	mov ar.ec = N				// # of stages in pipeline
-	;;
-.line_copy:
-(p[D])	ld8 t2 = [src0], 3*8			// M0
-(p[D])	ld8 t4 = [src1], 3*8			// M1
-(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
-(p[D])	st8 [dst_pre_l2] = n[D-C], 128		// M3 prefetch dst from L2
-	;;
-(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
-(p[C])	ld8 n[0] = [src_pre_l2], 128		// M1 prefetch src from L2
-(p[D])	st8 [dst0] =  t1, 8			// M2
-(p[D])	st8 [dst1] =  t3, 8			// M3
-	;;
-(p[D])	ld8  t5 = [src0], 8
-(p[D])	ld8  t7 = [src1], 3*8
-(p[D])	st8 [dst0] =  t2, 3*8
-(p[D])	st8 [dst1] =  t4, 3*8
-	;;
-(p[D])	ld8  t6 = [src0], 3*8
-(p[D])	ld8 t10 = [src1], 8
-(p[D])	st8 [dst0] =  t5, 8
-(p[D])	st8 [dst1] =  t7, 3*8
-	;;
-(p[D])	ld8  t9 = [src0], 3*8
-(p[D])	ld8 t11 = [src1], 3*8
-(p[D])	st8 [dst0] =  t6, 3*8
-(p[D])	st8 [dst1] = t10, 8
-	;;
-(p[D])	ld8 t12 = [src0], 8
-(p[D])	ld8 t14 = [src1], 8
-(p[D])	st8 [dst0] =  t9, 3*8
-(p[D])	st8 [dst1] = t11, 3*8
-	;;
-(p[D])	ld8 t13 = [src0], 4*8
-(p[D])	ld8 t15 = [src1], 4*8
-(p[D])	st8 [dst0] = t12, 8
-(p[D])	st8 [dst1] = t14, 8
-	;;
-(p[D-1])ld8  t1 = [src0], 8
-(p[D-1])ld8  t3 = [src1], 8
-(p[D])	st8 [dst0] = t13, 4*8
-(p[D])	st8 [dst1] = t15, 4*8
-	br.ctop.sptk .line_copy
-	;;
-	mov ar.lc = saved_lc
-	mov pr = saved_pr, -1
-	br.ret.sptk.many rp
-END(copy_page)
-EXPORT_SYMBOL(copy_page)
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
deleted file mode 100644
index 8daab72cfe77..000000000000
--- a/arch/ia64/lib/copy_user.S
+++ /dev/null
@@ -1,613 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy date across the kernel/user boundary.
- *
- * The source and destination are always on opposite side of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- *	in0	address of source buffer
- *	in1	address of destination buffer
- *	in2	number of bytes to copy
- *
- * Outputs:
- *	ret0	0 in case of success. The number of bytes NOT copied in
- *		case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Fixme:
- *	- handle the case where we have more than 16 bytes and the alignment
- *	  are different.
- *	- more benchmarking
- *	- fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK	16	// we do byte copy below (must be >=16)
-#define PIPE_DEPTH	21	// pipe depth
-
-#define EPI		p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst		in0
-#define src		in1
-#define len		in2
-
-//
-// local registers
-//
-#define t1		r2	// rshift in bytes
-#define t2		r3	// lshift in bytes
-#define rshift		r14	// right shift in bits
-#define lshift		r15	// left shift in bits
-#define word1		r16
-#define word2		r17
-#define cnt		r18
-#define len2		r19
-#define saved_lc	r20
-#define saved_pr	r21
-#define tmp		r22
-#define val		r23
-#define src1		r24
-#define dst1		r25
-#define src2		r26
-#define dst2		r27
-#define len1		r28
-#define enddst		r29
-#define endsrc		r30
-#define saved_pfs	r31
-
-GLOBAL_ENTRY(__copy_user)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
-	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
-	.rotp p[PIPE_DEPTH]
-
-	adds len2=-1,len	// br.ctop is repeat/until
-	mov ret0=r0
-
-	;;			// RAW of cfm when len=0
-	cmp.eq p8,p0=r0,len	// check for zero length
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// preserve ar.lc (slow)
-(p8)	br.ret.spnt.many rp	// empty mempcy()
-	;;
-	add enddst=dst,len	// first byte after end of source
-	add endsrc=src,len	// first byte after end of destination
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates
-
-	.body
-
-	mov dst1=dst		// copy because of rotation
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-
-	mov src1=src		// copy because of rotation
-	mov ar.lc=len2		// initialize lc for small count
-	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
-
-	xor tmp=src,dst		// same alignment test prepare
-(p10)	br.cond.dptk .long_copy_user
-	;;			// RAW pr.rot/p16 ?
-	//
-	// Now we do the byte by byte loop with software pipeline
-	//
-	// p7 is necessarily false by now
-1:
-	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 1b
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.pfs=saved_pfs		// restore ar.ec
-	br.ret.sptk.many rp		// end of short memcpy
-
-	//
-	// Not 8-byte aligned
-	//
-.diff_align_copy_user:
-	// At this point we know we have more than 16 bytes to copy
-	// and also that src and dest do _not_ have the same alignment.
-	and src2=0x7,src1				// src offset
-	and dst2=0x7,dst1				// dst offset
-	;;
-	// The basic idea is that we copy byte-by-byte at the head so
-	// that we can reach 8-byte alignment for both src1 and dst1.
-	// Then copy the body using software pipelined 8-byte copy,
-	// shifting the two back-to-back words right and left, then copy
-	// the tail by copying byte-by-byte.
-	//
-	// Fault handling. If the byte-by-byte at the head fails on the
-	// load, then restart and finish the pipleline by copying zeros
-	// to the dst1. Then copy zeros for the rest of dst1.
-	// If 8-byte software pipeline fails on the load, do the same as
-	// failure_in3 does. If the byte-by-byte at the tail fails, it is
-	// handled simply by failure_in_pipe1.
-	//
-	// The case p14 represents the source has more bytes in the
-	// the first word (by the shifted part), whereas the p15 needs to
-	// copy some bytes from the 2nd word of the source that has the
-	// tail of the 1st of the destination.
-	//
-
-	//
-	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
-	// to copy the head to dst1, to start 8-byte copy software pipeline.
-	// We know src1 is not 8-byte aligned in this case.
-	//
-	cmp.eq p14,p15=r0,dst2
-(p15)	br.cond.spnt 1f
-	;;
-	sub t1=8,src2
-	mov t2=src2
-	;;
-	shl rshift=t2,3
-	sub len1=len,t1					// set len1
-	;;
-	sub lshift=64,rshift
-	;;
-	br.cond.spnt .word_copy_user
-	;;
-1:
-	cmp.leu	p14,p15=src2,dst2
-	sub t1=dst2,src2
-	;;
-	.pred.rel "mutex", p14, p15
-(p14)	sub word1=8,src2				// (8 - src offset)
-(p15)	sub t1=r0,t1					// absolute value
-(p15)	sub word1=8,dst2				// (8 - dst offset)
-	;;
-	// For the case p14, we don't need to copy the shifted part to
-	// the 1st word of destination.
-	sub t2=8,t1
-(p14)	sub word1=word1,t1
-	;;
-	sub len1=len,word1				// resulting len
-(p15)	shl rshift=t1,3					// in bits
-(p14)	shl rshift=t2,3
-	;;
-(p14)	sub len1=len1,t1
-	adds cnt=-1,word1
-	;;
-	sub lshift=64,rshift
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	mov ar.lc=cnt
-	;;
-2:
-	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 2b
-	;;
-	clrrrb
-	;;
-.word_copy_user:
-	cmp.gtu p9,p0=16,len1
-(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
-	;;
-	shr.u cnt=len1,3		// number of 64-bit words
-	;;
-	adds cnt=-1,cnt
-	;;
-	.pred.rel "mutex", p14, p15
-(p14)	sub src1=src1,t2
-(p15)	sub src1=src1,t1
-	//
-	// Now both src1 and dst1 point to an 8-byte aligned address. And
-	// we have more than 8 bytes to copy.
-	//
-	mov ar.lc=cnt
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	;;
-3:
-	//
-	// The pipleline consists of 3 stages:
-	// 1 (p16):	Load a word from src1
-	// 2 (EPI_1):	Shift right pair, saving to tmp
-	// 3 (EPI):	Store tmp to dst1
-	//
-	// To make it simple, use at least 2 (p16) loops to set up val1[n]
-	// because we need 2 back-to-back val1[] to get tmp.
-	// Note that this implies EPI_2 must be p18 or greater.
-	//
-
-#define EPI_1		p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift)	\
-	(pred)	br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift)						\
-.copy_user_bit##rshift:						\
-1:								\
-	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
-	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
-(p16)	mov val1[0]=r0;						\
-	br.ctop.dptk 1b;					\
-	;;							\
-	br.cond.sptk.many .diff_align_do_tail;			\
-2:								\
-(EPI)	st8 [dst1]=tmp,8;					\
-(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
-3:								\
-(p16)	mov val1[1]=r0;						\
-(p16)	mov val1[0]=r0;						\
-	br.ctop.dptk 2b;					\
-	;;							\
-	br.cond.sptk.many .failure_in2
-
-	//
-	// Since the instruction 'shrp' requires a fixed 128-bit value
-	// specifying the bits to shift, we need to provide 7 cases
-	// below.
-	//
-	SWITCH(p6, 8)
-	SWITCH(p7, 16)
-	SWITCH(p8, 24)
-	SWITCH(p9, 32)
-	SWITCH(p10, 40)
-	SWITCH(p11, 48)
-	SWITCH(p12, 56)
-	;;
-	CASE(p6, 8)
-	CASE(p7, 16)
-	CASE(p8, 24)
-	CASE(p9, 32)
-	CASE(p10, 40)
-	CASE(p11, 48)
-	CASE(p12, 56)
-	;;
-	BODY(8)
-	BODY(16)
-	BODY(24)
-	BODY(32)
-	BODY(40)
-	BODY(48)
-	BODY(56)
-	;;
-.diff_align_do_tail:
-	.pred.rel "mutex", p14, p15
-(p14)	sub src1=src1,t1
-(p14)	adds dst1=-8,dst1
-(p15)	sub dst1=dst1,t1
-	;;
-4:
-	// Tail correction.
-	//
-	// The problem with this piplelined loop is that the last word is not
-	// loaded and thus parf of the last word written is not correct.
-	// To fix that, we simply copy the tail byte by byte.
-
-	sub len1=endsrc,src1,1
-	clrrrb
-	;;
-	mov ar.ec=PIPE_DEPTH
-	mov pr.rot=1<<16	// p16=true all others are false
-	mov ar.lc=len1
-	;;
-5:
-	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-	br.ctop.dptk.few 5b
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// Beginning of long mempcy (i.e. > 16 bytes)
-	//
-.long_copy_user:
-	tbit.nz p6,p7=src1,0	// odd alignment
-	and tmp=7,tmp
-	;;
-	cmp.eq p10,p8=r0,tmp
-	mov len1=len		// copy because of rotation
-(p8)	br.cond.dpnt .diff_align_copy_user
-	;;
-	// At this point we know we have more than 16 bytes to copy
-	// and also that both src and dest have the same alignment
-	// which may not be the one we want. So for now we must move
-	// forward slowly until we reach 16byte alignment: no need to
-	// worry about reaching the end of buffer.
-	//
-	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
-(p6)	adds len1=-1,len1;;
-	tbit.nz p7,p0=src1,1
-	;;
-	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
-(p7)	adds len1=-2,len1;;
-	tbit.nz p8,p0=src1,2
-	;;
-	//
-	// Stop bit not required after ld4 because if we fail on ld4
-	// we have never executed the ld1, therefore st1 is not executed.
-	//
-	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
-	;;
-	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
-	tbit.nz p9,p0=src1,3
-	;;
-	//
-	// Stop bit not required after ld8 because if we fail on ld8
-	// we have never executed the ld2, therefore st2 is not executed.
-	//
-	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
-	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8)	adds len1=-4,len1
-	;;
-	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9)	adds len1=-8,len1;;
-	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
-	;;
-	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
-	tbit.nz p6,p0=len1,3
-	cmp.eq p7,p0=r0,cnt
-	adds tmp=-1,cnt			// br.ctop is repeat/until
-(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
-	;;
-	adds src2=8,src1
-	adds dst2=8,dst1
-	mov ar.lc=tmp
-	;;
-	//
-	// 16bytes/iteration
-	//
-2:
-	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16)	ld8 val2[0]=[src2],16
-
-	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk 2b
-	;;			// RAW on src1 when fall through from loop
-	//
-	// Tail correction based on len only
-	//
-	// No matter where we come from (loop or test) the src1 pointer
-	// is 16 byte aligned AND we have less than 16 bytes to copy.
-	//
-.dotail:
-	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
-	tbit.nz p7,p0=len1,2
-	;;
-	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
-	tbit.nz p8,p0=len1,1
-	;;
-	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
-	tbit.nz p9,p0=len1,0
-	;;
-	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
-	;;
-	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
-	mov ar.lc=saved_lc
-	;;
-	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
-	mov ar.pfs=saved_pfs
-	;;
-	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
-	br.ret.sptk.many rp
-
-
-	//
-	// Here we handle the case where the byte by byte copy fails
-	// on the load.
-	// Several factors make the zeroing of the rest of the buffer kind of
-	// tricky:
-	//	- the pipeline: loads/stores are not in sync (pipeline)
-	//
-	//	  In the same loop iteration, the dst1 pointer does not directly
-	//	  reflect where the faulty load was.
-	//
-	//	- pipeline effect
-	//	  When you get a fault on load, you may have valid data from
-	//	  previous loads not yet store in transit. Such data must be
-	//	  store normally before moving onto zeroing the rest.
-	//
-	//	- single/multi dispersal independence.
-	//
-	// solution:
-	//	- we don't disrupt the pipeline, i.e. data in transit in
-	//	  the software pipeline will be eventually move to memory.
-	//	  We simply replace the load with a simple mov and keep the
-	//	  pipeline going. We can't really do this inline because
-	//	  p16 is always reset to 1 when lc > 0.
-	//
-.failure_in_pipe1:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-1:
-(p16)	mov val1[0]=r0
-(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk 1b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// This is the case where the byte by byte copy fails on the load
-	// when we copy the head. We need to finish the pipeline and copy
-	// zeros for the rest of the destination. Since this happens
-	// at the top we still need to fill the body and tail.
-.failure_in_pipe2:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-2:
-(p16)	mov val1[0]=r0
-(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
-	br.ctop.dptk 2b
-	;;
-	sub len=enddst,dst1,1		// precompute len
-	br.cond.dptk.many .failure_in1bis
-	;;
-
-	//
-	// Here we handle the head & tail part when we check for alignment.
-	// The following code handles only the load failures. The
-	// main diffculty comes from the fact that loads/stores are
-	// scheduled. So when you fail on a load, the stores corresponding
-	// to previous successful loads must be executed.
-	//
-	// However some simplifications are possible given the way
-	// things work.
-	//
-	// 1) HEAD
-	// Theory of operation:
-	//
-	//  Page A   | Page B
-	//  ---------|-----
-	//          1|8 x
-	//	  1 2|8 x
-	//	    4|8 x
-	//	  1 4|8 x
-	//        2 4|8 x
-	//      1 2 4|8 x
-	//	     |1
-	//	     |2 x
-	//	     |4 x
-	//
-	// page_size >= 4k (2^12).  (x means 4, 2, 1)
-	// Here we suppose Page A exists and Page B does not.
-	//
-	// As we move towards eight byte alignment we may encounter faults.
-	// The numbers on each page show the size of the load (current alignment).
-	//
-	// Key point:
-	//	- if you fail on 1, 2, 4 then you have never executed any smaller
-	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
-	//	  before.
-	//
-	// This allows us to simplify the cleanup code, because basically you
-	// only have to worry about "pending" stores in the case of a failing
-	// ld8(). Given the way the code is written today, this means only
-	// worry about st2, st4. There we can use the information encapsulated
-	// into the predicates.
-	//
-	// Other key point:
-	//	- if you fail on the ld8 in the head, it means you went straight
-	//	  to it, i.e. 8byte alignment within an unexisting page.
-	// Again this comes from the fact that if you crossed just for the ld8 then
-	// you are 8byte aligned but also 16byte align, therefore you would
-	// either go for the 16byte copy loop OR the ld8 in the tail part.
-	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
-	// because it would mean you had 15bytes to copy in which case you
-	// would have defaulted to the byte by byte copy.
-	//
-	//
-	// 2) TAIL
-	// Here we now we have less than 16 bytes AND we are either 8 or 16 byte
-	// aligned.
-	//
-	// Key point:
-	// This means that we either:
-	//		- are right on a page boundary
-	//	OR
-	//		- are at more than 16 bytes from a page boundary with
-	//		  at most 15 bytes to copy: no chance of crossing.
-	//
-	// This allows us to assume that if we fail on a load we haven't possibly
-	// executed any of the previous (tail) ones, so we don't need to do
-	// any stores. For instance, if we fail on ld2, this means we had
-	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
-	//
-	// This means that we are in a situation similar the a fault in the
-	// head part. That's nice!
-	//
-.failure_in1:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-	sub len=endsrc,src1,1
-	//
-	// we know that ret0 can never be zero at this point
-	// because we failed why trying to do a load, i.e. there is still
-	// some work to do.
-	// The failure_in1bis and length problem is taken care of at the
-	// calling side.
-	//
-	;;
-.failure_in1bis:		// from (.failure_in3)
-	mov ar.lc=len		// Continue with a stupid byte store.
-	;;
-5:
-	st1 [dst1]=r0,1
-	br.cloop.dptk 5b
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// Here we simply restart the loop but instead
-	// of doing loads we fill the pipeline with zeroes
-	// We can't simply store r0 because we may have valid
-	// data in transit in the pipeline.
-	// ar.lc and ar.ec are setup correctly at this point
-	//
-	// we MUST use src1/endsrc here and not dst1/enddst because
-	// of the pipeline effect.
-	//
-.failure_in3:
-	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
-	;;
-2:
-(p16)	mov val1[0]=r0
-(p16)	mov val2[0]=r0
-(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
-	br.ctop.dptk 2b
-	;;
-	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
-	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk .failure_in1bis
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-.failure_in2:
-	sub ret0=endsrc,src1
-	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
-	sub len=enddst,dst1,1		// precompute len
-(p6)	br.cond.dptk .failure_in1bis
-	;;
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	//
-	// handling of failures on stores: that's the easy part
-	//
-.failure_out:
-	sub ret0=enddst,dst1
-	mov pr=saved_pr,0xffffffffffff0000
-	mov ar.lc=saved_lc
-
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
deleted file mode 100644
index 917e3138b277..000000000000
--- a/arch/ia64/lib/csum_partial_copy.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <net/checksum.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned
-short from64to16(unsigned long x)
-{
-	/* add up 32-bit words for 33 bits */
-	x = (x & 0xffffffff) + (x >> 32);
-	/* add up 16-bit and 17-bit words for 17+c bits */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up 16-bit and 2-bit for 16+c bit */
-	x = (x & 0xffff) + (x >> 16);
-	/* add up carry.. */
-	x = (x & 0xffff) + (x >> 16);
-	return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
-	int odd, count;
-	unsigned long result = (unsigned long)psum;
-
-	if (len <= 0)
-		goto out;
-	odd = 1 & (unsigned long) buff;
-	if (odd) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *) buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-			if (count) {
-				unsigned long carry = 0;
-				do {
-					unsigned long w = *(unsigned long *) buff;
-					count--;
-					buff += 8;
-					result += carry;
-					result += w;
-					carry = (w > result);
-				} while (count);
-				result += carry;
-				result = (result & 0xffffffff) + (result >> 32);
-			}
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-
-	result = from64to16(result);
-
-	if (odd)
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
-	return result;
-}
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
deleted file mode 100644
index 6004dad2597c..000000000000
--- a/arch/ia64/lib/do_csum.S
+++ /dev/null
@@ -1,324 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optmized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- *	in0: address of buffer to checksum (char *)
- *	in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 02/04/22	Ken Chen <kenneth.w.chen@intel.com>
- *		Data locality study on the checksum buffer.
- *		More optimization cleanup - remove excessive stop bits.
- * 02/04/08	David Mosberger <davidm@hpl.hp.com>
- *		More cleanup and tuning.
- * 01/04/18	Jun Nakajima <jun.nakajima@intel.com>
- *		Clean up and optimize and the software pipeline, loading two
- *		back-to-back 8-byte words per loop. Clean up the initialization
- *		for the loop. Support the cases where load latency = 1 or 2.
- *		Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-//	The goal is to go as quickly as possible to the point where
-//	we can checksum 16 bytes/loop. Before reaching that point we must
-//	take care of incorrect alignment of first byte.
-//
-//	The code hereafter also takes care of the "tail" part of the buffer
-//	before entering the core loop, if any. The checksum is a sum so it
-//	allows us to commute operations. So we do the "head" and "tail"
-//	first to finish at full speed in the body. Once we get the head and
-//	tail values, we feed them into the pipeline, very handy initialization.
-//
-//	Of course we deal with the special case where the whole buffer fits
-//	into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-//	We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-//	possible load latency and also to accommodate for head and tail.
-//
-//	The end of the function deals with folding the checksum from 64bits
-//	down to 16bits taking care of the carry.
-//
-//	This version avoids synchronization in the core loop by also using a
-//	pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-//	 wordx[] (x=1,2)
-//	|---|
-//      |   | 0			: new value loaded in pipeline
-//	|---|
-//      |   | -			: in transit data
-//	|---|
-//      |   | LOAD_LATENCY	: current value to add to checksum
-//	|---|
-//      |   | LOAD_LATENCY+1	: previous value added to checksum
-//      |---|			(previous iteration)
-//
-//	resultx[] (x=1,2)
-//	|---|
-//      |   | 0			: initial value
-//	|---|
-//      |   | LOAD_LATENCY-1	: new checksum
-//	|---|
-//      |   | LOAD_LATENCY	: previous value of checksum
-//	|---|
-//      |   | LOAD_LATENCY+1	: final checksum when out of the loop
-//      |---|
-//
-//
-//	See RFC1071 "Computing the Internet Checksum" for various techniques for
-//	calculating the Internet checksum.
-//
-// NOT YET DONE:
-//	- Maybe another algorithm which would take care of the folding at the
-//	  end in a different manner
-//	- Work with people more knowledgeable than me on the network stack
-//	  to figure out if we could not split the function depending on the
-//	  type of packet or alignment we get. Like the ip_fast_csum() routine
-//	  where we know we have at least 20bytes worth of data to checksum.
-//	- Do a better job of handling small packets.
-//	- Note on prefetching: it was found that under various load, i.e. ftp read/write,
-//	  nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-//	  on the data that buffer points to (partly because the checksum is often preceded by
-//	  a copy_from_user()).  This finding indiate that lfetch will not be beneficial since
-//	  the data is already in the cache.
-//
-
-#define saved_pfs	r11
-#define hmask		r16
-#define tmask		r17
-#define first1		r18
-#define firstval	r19
-#define firstoff	r20
-#define last		r21
-#define lastval		r22
-#define lastoff		r23
-#define saved_lc	r24
-#define saved_pr	r25
-#define tmp1		r26
-#define tmp2		r27
-#define tmp3		r28
-#define carry1		r29
-#define carry2		r30
-#define first2		r31
-
-#define buf		in0
-#define len		in1
-
-#define LOAD_LATENCY	2	// XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH			(LOAD_LATENCY+2)
-#define ELD	p[LOAD_LATENCY]		// end of load
-#define ELD_1	p[LOAD_LATENCY+1]	// and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,2,16,0,16
-	.rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
-	.rotp p[PIPE_DEPTH], pC1[2], pC2[2]
-	mov ret0=r0		// in case we have zero length
-	cmp.lt p0,p6=r0,len	// check for zero length or negative (32bit len)
-	;;
-	add tmp1=buf,len	// last byte's address
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
-(p6)	br.ret.spnt.many rp	// return if zero or negative length
-
-	mov hmask=-1		// initialize head mask
-	tbit.nz p15,p0=buf,0	// is buf an odd address?
-	and first1=-8,buf	// 8-byte align down address of first1 element
-
-	and firstoff=7,buf	// how many bytes off for first1 element
-	mov tmask=-1		// initialize tail mask
-
-	;;
-	adds tmp2=-1,tmp1	// last-1
-	and lastoff=7,tmp1	// how many bytes off for last element
-	;;
-	sub tmp1=8,lastoff	// complement to lastoff
-	and last=-8,tmp2	// address of word containing last byte
-	;;
-	sub tmp3=last,first1	// tmp3=distance from first1 to last
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc	// save lc
-	cmp.eq p8,p9=last,first1	// everything fits in one word ?
-
-	ld8 firstval=[first1],8	// load, ahead of time, "first1" word
-	and tmp1=7, tmp1	// make sure that if tmp1==8 -> tmp1=0
-	shl tmp2=firstoff,3	// number of bits
-	;;
-(p9)	ld8 lastval=[last]	// load, ahead of time, "last" word, if needed
-	shl tmp1=tmp1,3		// number of bits
-(p9)	adds tmp3=-8,tmp3	// effectively loaded
-	;;
-(p8)	mov lastval=r0		// we don't need lastval if first1==last
-	shl hmask=hmask,tmp2	// build head mask, mask off [0,first1off[
-	shr.u tmask=tmask,tmp1	// build tail mask, mask off ]8,lastoff]
-	;;
-	.body
-#define count tmp3
-
-(p8)	and hmask=hmask,tmask	// apply tail mask to head mask if 1 word only
-(p9)	and word2[0]=lastval,tmask	// mask last it as appropriate
-	shr.u count=count,3	// how many 8-byte?
-	;;
-	// If count is odd, finish this 8-byte word so that we can
-	// load two back-to-back 8-byte words per loop thereafter.
-	and word1[0]=firstval,hmask	// and mask it as appropriate
-	tbit.nz p10,p11=count,0		// if (count is odd)
-	;;
-(p8)	mov result1[0]=word1[0]
-(p9)	add result1[0]=word1[0],word2[0]
-	;;
-	cmp.ltu p6,p0=result1[0],word1[0]	// check the carry
-	cmp.eq.or.andcm p8,p0=0,count		// exit if zero 8-byte
-	;;
-(p6)	adds result1[0]=1,result1[0]
-(p8)	br.cond.dptk .do_csum_exit	// if (within an 8-byte word)
-(p11)	br.cond.dptk .do_csum16		// if (count is even)
-
-	// Here count is odd.
-	ld8 word1[1]=[first1],8		// load an 8-byte word
-	cmp.eq p9,p10=1,count		// if (count == 1)
-	adds count=-1,count		// loaded an 8-byte word
-	;;
-	add result1[0]=result1[0],word1[1]
-	;;
-	cmp.ltu p6,p0=result1[0],word1[1]
-	;;
-(p6)	adds result1[0]=1,result1[0]
-(p9)	br.cond.sptk .do_csum_exit	// if (count == 1) exit
-	// Fall through to calculate the checksum, feeding result1[0] as
-	// the initial value in result1[0].
-	//
-	// Calculate the checksum loading two 8-byte words per loop.
-	//
-.do_csum16:
-	add first2=8,first1
-	shr.u count=count,1	// we do 16 bytes per loop
-	;;
-	adds count=-1,count
-	mov carry1=r0
-	mov carry2=r0
-	brp.loop.imp 1f,2f
-	;;
-	mov ar.ec=PIPE_DEPTH
-	mov ar.lc=count	// set lc
-	mov pr.rot=1<<16
-	// result1[0] must be initialized in advance.
-	mov result2[0]=r0
-	;;
-	.align 32
-1:
-(ELD_1)	cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1)	cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD)	add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD)	add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0])	ld8 word1[0]=[first1],16
-(p[0])	ld8 word2[0]=[first2],16
-	br.ctop.sptk 1b
-	;;
-	// Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1	// since we miss the last one
-(pC2[1])adds carry2=1,carry2
-	;;
-	add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
-	add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
-	;;
-	cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
-	cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
-	;;
-(p6)	adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7)	adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
-	;;
-	add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
-	;;
-	cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
-	;;
-(p6)	adds result1[0]=1,result1[0]
-	;;
-.do_csum_exit:
-	//
-	// now fold 64 into 16 bits taking care of carry
-	// that's not very good because it has lots of sequentiality
-	//
-	mov tmp3=0xffff
-	zxt4 tmp1=result1[0]
-	shr.u tmp2=result1[0],32
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add result1[0]=tmp1,tmp2
-	;;
-	and tmp1=result1[0],tmp3
-	shr.u tmp2=result1[0],16
-	;;
-	add ret0=tmp1,tmp2
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	// if buf was odd then swap bytes
-	mov ar.pfs=saved_pfs		// restore ar.ec
-(p15)	mux1 ret0=ret0,@rev		// reverse word
-	;;
-	mov ar.lc=saved_lc
-(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
-	br.ret.sptk.many rp
-
-//	I (Jun Nakajima) wrote an equivalent code (see below), but it was
-//	not much better than the original. So keep the original there so that
-//	someone else can challenge.
-//
-//	shr.u word1[0]=result1[0],32
-//	zxt4 result1[0]=result1[0]
-//	;;
-//	add result1[0]=result1[0],word1[0]
-//	;;
-//	zxt2 result2[0]=result1[0]
-//	extr.u word1[0]=result1[0],16,16
-//	shr.u carry1=result1[0],32
-//	;;
-//	add result2[0]=result2[0],word1[0]
-//	;;
-//	add result2[0]=result2[0],carry1
-//	;;
-//	extr.u ret0=result2[0],16,16
-//	;;
-//	add ret0=ret0,result2[0]
-//	;;
-//	zxt2 ret0=ret0
-//	mov ar.pfs=saved_pfs		 // restore ar.ec
-//	mov pr=saved_pr,0xffffffffffff0000
-//	;;
-//	// if buf was odd then swap bytes
-//	mov ar.lc=saved_lc
-//(p15)	mux1 ret0=ret0,@rev		// reverse word
-//	;;
-//(p15)	shr.u ret0=ret0,64-16	// + shift back to position = swap bytes
-//	br.ret.sptk.many rp
-
-END(do_csum)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
deleted file mode 100644
index f8e795fe45cb..000000000000
--- a/arch/ia64/lib/flush.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Cache flushing routines.
- *
- * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 05/28/05 Zoltan Menyhart	Dynamic stride size
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-	/*
-	 * flush_icache_range(start,end)
-	 *
-	 *	Make i-cache(s) coherent with d-caches.
-	 *
-	 *	Must deal with range from start to end-1 but nothing else (need to
-	 *	be careful not to touch addresses that may be unmapped).
-	 *
-	 *	Note: "in0" and "in1" are preserved for debugging purposes.
-	 */
-	.section .kprobes.text,"ax"
-GLOBAL_ENTRY(flush_icache_range)
-
-	.prologue
-	alloc	r2=ar.pfs,2,0,0,0
-	movl	r3=ia64_i_cache_stride_shift
- 	mov	r21=1
-	;;
-	ld8	r20=[r3]		// r20: stride shift
-	sub	r22=in1,r0,1		// last byte address
-	;;
-	shr.u	r23=in0,r20		// start / (stride size)
-	shr.u	r22=r22,r20		// (last byte address) / (stride size)
-	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
-	;;
-	sub	r8=r22,r23		// number of strides - 1
-	shl	r24=r23,r20		// r24: addresses for "fc.i" =
-					//	"start" rounded down to stride boundary
-	.save	ar.lc,r3
-	mov	r3=ar.lc		// save ar.lc
-	;;
-
-	.body
-	mov	ar.lc=r8
-	;;
-	/*
-	 * 32 byte aligned loop, even number of (actually 2) bundles
-	 */
-.Loop:	fc.i	r24			// issuable on M0 only
-	add	r24=r21,r24		// we flush "stride size" bytes per iteration
-	nop.i	0
-	br.cloop.sptk.few .Loop
-	;;
-	sync.i
-	;;
-	srlz.i
-	;;
-	mov	ar.lc=r3		// restore ar.lc
-	br.ret.sptk.many rp
-END(flush_icache_range)
-EXPORT_SYMBOL_GPL(flush_icache_range)
-
-	/*
-	 * clflush_cache_range(start,size)
-	 *
-	 *	Flush cache lines from start to start+size-1.
-	 *
-	 *	Must deal with range from start to start+size-1 but nothing else
-	 *	(need to be careful not to touch addresses that may be
-	 *	unmapped).
-	 *
-	 *	Note: "in0" and "in1" are preserved for debugging purposes.
-	 */
-	.section .kprobes.text,"ax"
-GLOBAL_ENTRY(clflush_cache_range)
-
-	.prologue
-	alloc	r2=ar.pfs,2,0,0,0
-	movl	r3=ia64_cache_stride_shift
-	mov	r21=1
-	add     r22=in1,in0
-	;;
-	ld8	r20=[r3]		// r20: stride shift
-	sub	r22=r22,r0,1		// last byte address
-	;;
-	shr.u	r23=in0,r20		// start / (stride size)
-	shr.u	r22=r22,r20		// (last byte address) / (stride size)
-	shl	r21=r21,r20		// r21: stride size of the i-cache(s)
-	;;
-	sub	r8=r22,r23		// number of strides - 1
-	shl	r24=r23,r20		// r24: addresses for "fc" =
-					//	"start" rounded down to stride
-					//	boundary
-	.save	ar.lc,r3
-	mov	r3=ar.lc		// save ar.lc
-	;;
-
-	.body
-	mov	ar.lc=r8
-	;;
-	/*
-	 * 32 byte aligned loop, even number of (actually 2) bundles
-	 */
-.Loop_fc:
-	fc	r24		// issuable on M0 only
-	add	r24=r21,r24	// we flush "stride size" bytes per iteration
-	nop.i	0
-	br.cloop.sptk.few .Loop_fc
-	;;
-	sync.i
-	;;
-	srlz.i
-	;;
-	mov	ar.lc=r3		// restore ar.lc
-	br.ret.sptk.many rp
-END(clflush_cache_range)
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
deleted file mode 100644
index 83586fbc51ff..000000000000
--- a/arch/ia64/lib/idiv32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2000 Hewlett-Packard Co
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 32-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture".  This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP	mod
-#else
-# define OP	div
-#endif
-
-#ifdef UNSIGNED
-# define SGN	u
-# define EXTEND	zxt4
-# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define EXTEND	sxt4
-# define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b)	a##b
-#define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,si3))
-
-GLOBAL_ENTRY(NAME)
-	.regstk 2,0,0,0
-	// Transfer inputs to FP registers.
-	mov r2 = 0xffdd			// r2 = -34 + 65535 (fp reg format bias)
-	EXTEND in0 = in0		// in0 = a
-	EXTEND in1 = in1		// in1 = b
-	;;
-	setf.sig f8 = in0
-	setf.sig f9 = in1
-#ifdef MODULO
-	sub in1 = r0, in1		// in1 = -b
-#endif
-	;;
-	// Convert the inputs to FP, to avoid FP software-assist faults.
-	INT_TO_FP(f8, f8)
-	INT_TO_FP(f9, f9)
-	;;
-	setf.exp f7 = r2		// f7 = 2^-34
-	frcpa.s1 f6, p6 = f8, f9	// y0 = frcpa(b)
-	;;
-(p6)	fmpy.s1 f8 = f8, f6		// q0 = a*y0
-(p6)	fnma.s1 f6 = f9, f6, f1		// e0 = -b*y0 + 1 
-	;;
-#ifdef MODULO
-	setf.sig f9 = in1		// f9 = -b
-#endif
-(p6)	fma.s1 f8 = f6, f8, f8		// q1 = e0*q0 + q0
-(p6)	fma.s1 f6 = f6, f6, f7		// e1 = e0*e0 + 2^-34
-	;;
-#ifdef MODULO
-	setf.sig f7 = in0
-#endif
-(p6)	fma.s1 f6 = f6, f8, f8		// q2 = e1*q1 + q1
-	;;
-	FP_TO_INT(f6, f6)		// q = trunc(q2)
-	;;
-#ifdef MODULO
-	xma.l f6 = f6, f9, f7		// r = q*(-b) + a
-	;;
-#endif
-	getf.sig r8 = f6		// transfer result to result register
-	br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
deleted file mode 100644
index 5c9113691f72..000000000000
--- a/arch/ia64/lib/idiv64.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 64-bit integer division.
- *
- * This code is based on the application note entitled "Divide, Square Root
- * and Remainder Algorithms for the IA-64 Architecture".  This document
- * is available as Intel document number 248725-002 or via the web at
- * http://developer.intel.com/software/opensource/numerics/
- *
- * For more details on the theory behind these algorithms, see "IA-64
- * and Elementary Functions" by Peter Markstein; HP Professional Books
- * (http://www.goodreads.com/book/show/2019887.Ia_64_and_Elementary_Functions)
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-#ifdef MODULO
-# define OP	mod
-#else
-# define OP	div
-#endif
-
-#ifdef UNSIGNED
-# define SGN	u
-# define INT_TO_FP(a,b)	fcvt.xuf.s1 a=b
-# define FP_TO_INT(a,b)	fcvt.fxu.trunc.s1 a=b
-#else
-# define SGN
-# define INT_TO_FP(a,b)	fcvt.xf a=b
-# define FP_TO_INT(a,b)	fcvt.fx.trunc.s1 a=b
-#endif
-
-#define PASTE1(a,b)	a##b
-#define PASTE(a,b)	PASTE1(a,b)
-#define NAME		PASTE(PASTE(__,SGN),PASTE(OP,di3))
-
-GLOBAL_ENTRY(NAME)
-	.regstk 2,0,0,0
-	// Transfer inputs to FP registers.
-	setf.sig f8 = in0
-	setf.sig f9 = in1
-	;;
-	// Convert the inputs to FP, to avoid FP software-assist faults.
-	INT_TO_FP(f8, f8)
-	INT_TO_FP(f9, f9)
-	;;
-	frcpa.s1 f11, p6 = f8, f9	// y0 = frcpa(b)
-	;;
-(p6)	fmpy.s1 f7 = f8, f11		// q0 = a*y0
-(p6)	fnma.s1 f6 = f9, f11, f1	// e0 = -b*y0 + 1
-	;;
-(p6)	fma.s1 f10 = f7, f6, f7		// q1 = q0*e0 + q0
-(p6)	fmpy.s1 f7 = f6, f6		// e1 = e0*e0
-	;;
-#ifdef MODULO
-	sub in1 = r0, in1		// in1 = -b
-#endif
-(p6)	fma.s1 f10 = f10, f7, f10	// q2 = q1*e1 + q1
-(p6)	fma.s1 f6 = f11, f6, f11	// y1 = y0*e0 + y0
-	;;
-(p6)	fma.s1 f6 = f6, f7, f6		// y2 = y1*e1 + y1
-(p6)	fnma.s1 f7 = f9, f10, f8	// r = -b*q2 + a
-	;;
-#ifdef MODULO
-	setf.sig f8 = in0		// f8 = a
-	setf.sig f9 = in1		// f9 = -b
-#endif
-(p6)	fma.s1 f11 = f7, f6, f10	// q3 = r*y2 + q2
-	;;
-	FP_TO_INT(f11, f11)		// q = trunc(q3)
-	;;
-#ifdef MODULO
-	xma.l f11 = f11, f9, f8		// r = q*(-b) + a
-	;;
-#endif
-	getf.sig r8 = f11		// transfer result to result register
-	br.ret.sptk.many rp
-END(NAME)
-EXPORT_SYMBOL(NAME)
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
deleted file mode 100644
index c3e02462ed16..000000000000
--- a/arch/ia64/lib/io.c
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
-	char *dst = to;
-
-	while (count) {
-		count--;
-		*dst++ = readb(from++);
-	}
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
-	const char *src = from;
-
-	while (count) {
-		count--;
-		writeb(*src++, to++);
-	}
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
-	unsigned char ch = (char)(c & 0xff);
-
-	while (count) {
-		count--;
-		writeb(ch, dst);
-		dst++;
-	}
-}
-EXPORT_SYMBOL(memset_io);
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
deleted file mode 100644
index fcc0b812ce2e..000000000000
--- a/arch/ia64/lib/ip_fast_csum.S
+++ /dev/null
@@ -1,148 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Optmized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- *      in0: address of buffer to checksum (char *)
- *      in1: length of the buffer (int)
- *
- * Copyright (C) 2002, 2006 Intel Corp.
- * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on 4-byte boundary and 20 bytes in length, we can execution rather quickly
- * versus calling generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes.  However, due to lack of constrains
- * put on the function input argument, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0	r32
-#define in1	r33
-#define in2	r34
-#define in3	r35
-#define in4	r36
-#define ret0	r8
-
-GLOBAL_ENTRY(ip_fast_csum)
-	.prologue
-	.body
-	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
-	and	r14=3,in0	// is it aligned on 4-byte?
-	add	r15=4,in0	// second source pointer
-	;;
-	cmp.ne.or.andcm p6,p7=r14,r0
-	;;
-(p7)	ld4	r20=[in0],8
-(p7)	ld4	r21=[r15],8
-(p6)	br.spnt	.generic
-	;;
-	ld4	r22=[in0],8
-	ld4	r23=[r15],8
-	;;
-	ld4	r24=[in0]
-	add	r20=r20,r21
-	add	r22=r22,r23
-	;;
-	add	r20=r20,r22
-	;;
-	add	r20=r20,r24
-	;;
-	shr.u	ret0=r20,16	// now need to add the carry
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	;;
-	shr.u	ret0=r20,16	// add carry again
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	;;
-	shr.u	ret0=r20,16
-	zxt2	r20=r20
-	;;
-	add	r20=ret0,r20
-	mov	r9=0xffff
-	;;
-	andcm	ret0=r9,r20
-	.restore sp		// reset frame state
-	br.ret.sptk.many b0
-	;;
-
-.generic:
-	.prologue
-	.save ar.pfs, r35
-	alloc	r35=ar.pfs,2,2,2,0
-	.save rp, r34
-	mov	r34=b0
-	.body
-	dep.z	out1=in1,2,30
-	mov	out0=in0
-	;;
-	br.call.sptk.many b0=do_csum
-	;;
-	andcm	ret0=-1,ret0
-	mov	ar.pfs=r35
-	mov	b0=r34
-	br.ret.sptk.many b0
-END(ip_fast_csum)
-EXPORT_SYMBOL(ip_fast_csum)
-
-GLOBAL_ENTRY(csum_ipv6_magic)
-	ld4	r20=[in0],4
-	ld4	r21=[in1],4
-	zxt4	in2=in2
-	;;
-	ld4	r22=[in0],4
-	ld4	r23=[in1],4
-	dep	r15=in3,in2,32,16
-	;;
-	ld4	r24=[in0],4
-	ld4	r25=[in1],4
-	mux1	r15=r15,@rev
-	add	r16=r20,r21
-	add	r17=r22,r23
-	zxt4	in4=in4
-	;;
-	ld4	r26=[in0],4
-	ld4	r27=[in1],4
-	shr.u	r15=r15,16
-	add	r18=r24,r25
-	add	r8=r16,r17
-	;;
-	add	r19=r26,r27
-	add	r8=r8,r18
-	;;
-	add	r8=r8,r19
-	add	r15=r15,in4
-	;;
-	add	r8=r8,r15
-	;;
-	shr.u	r10=r8,32	// now fold sum into short
-	zxt4	r11=r8
-	;;
-	add	r8=r10,r11
-	;;
-	shr.u	r10=r8,16	// yeah, keep it rolling
-	zxt2	r11=r8
-	;;
-	add	r8=r10,r11
-	;;
-	shr.u	r10=r8,16	// three times lucky
-	zxt2	r11=r8
-	;;
-	add	r8=r10,r11
-	mov	r9=0xffff
-	;;
-	andcm	r8=r9,r8
-	br.ret.sptk.many b0
-END(csum_ipv6_magic)
-EXPORT_SYMBOL(csum_ipv6_magic)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
deleted file mode 100644
index 35c9069a8345..000000000000
--- a/arch/ia64/lib/memcpy.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- * 	in0:	destination address
- *	in1:	source address
- *	in2:	number of bytes to copy
- * Output:
- * 	no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-#	define MEM_LAT	21		/* latency to memory */
-
-#	define dst	r2
-#	define src	r3
-#	define retval	r8
-#	define saved_pfs r9
-#	define saved_lc	r10
-#	define saved_pr	r11
-#	define cnt	r16
-#	define src2	r17
-#	define t0	r18
-#	define t1	r19
-#	define t2	r20
-#	define t3	r21
-#	define t4	r22
-#	define src_end	r23
-
-#	define N	(MEM_LAT + 4)
-#	define Nrot	((N + 7) & ~7)
-
-	/*
-	 * First, check if everything (src, dst, len) is a multiple of eight.  If
-	 * so, we handle everything with no taken branches (other than the loop
-	 * itself) and a small icache footprint.  Otherwise, we jump off to
-	 * the more general copy routine handling arbitrary
-	 * sizes/alignment etc.
-	 */
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
-	.save ar.lc, saved_lc
-	mov saved_lc=ar.lc
-	or t0=in0,in1
-	;;
-
-	or t0=t0,in2
-	.save pr, saved_pr
-	mov saved_pr=pr
-
-	.body
-
-	cmp.eq p6,p0=in2,r0	// zero length?
-	mov retval=in0		// return dst
-(p6)	br.ret.spnt.many rp	// zero length, return immediately
-	;;
-
-	mov dst=in0		// copy because of rotation
-	shr.u cnt=in2,3		// number of 8-byte words to copy
-	mov pr.rot=1<<16
-	;;
-
-	adds cnt=-1,cnt		// br.ctop is repeat/until
-	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
-	mov ar.ec=N
-	;;
-
-	and t0=0x7,t0
-	mov ar.lc=cnt
-	;;
-	cmp.ne p6,p0=t0,r0
-
-	mov src=in1		// copy because of rotation
-(p7)	br.cond.spnt.few .memcpy_short
-(p6)	br.cond.spnt.few .memcpy_long
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-	.rotr val[N]
-	.rotp p[N]
-	.align 32
-1: { .mib
-(p[0])	ld8 val[0]=[src],8
-	nop.i 0
-	brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
-	nop.f 0
-	br.ctop.dptk.few 1b
-}
-	;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,-1
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	/*
-	 * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
-	 * copy loop.  This performs relatively poorly on Itanium, but it doesn't
-	 * get used very often (gcc inlines small copies) and due to atomicity
-	 * issues, we want to avoid read-modify-write of entire words.
-	 */
-	.align 32
-.memcpy_short:
-	adds cnt=-1,in2		// br.ctop is repeat/until
-	mov ar.ec=MEM_LAT
-	brp.loop.imp 1f, 2f
-	;;
-	mov ar.lc=cnt
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	;;
-	/*
-	 * It is faster to put a stop bit in the loop here because it makes
-	 * the pipeline shorter (and latency is what matters on short copies).
-	 */
-	.align 32
-1: { .mib
-(p[0])	ld1 val[0]=[src],1
-	nop.i 0
-	brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
-	nop.f 0
-	br.ctop.dptk.few 1b
-} ;;
-	mov ar.lc=saved_lc
-	mov pr=saved_pr,-1
-	mov ar.pfs=saved_pfs
-	br.ret.sptk.many rp
-
-	/*
-	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
-	 * an overriding concern here, but throughput is.  We first do
-	 * sub-word copying until the destination is aligned, then we check
-	 * if the source is also aligned.  If so, we do a simple load/store-loop
-	 * until there are less than 8 bytes left over and then we do the tail,
-	 * by storing the last few bytes using sub-word copying.  If the source
-	 * is not aligned, we branch off to the non-congruent loop.
-	 *
-	 *   stage:   op:
-	 *         0  ld
-	 *	   :
-	 * MEM_LAT+3  shrp
-	 * MEM_LAT+4  st
-	 *
-	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
-	 * seems to introduce an unavoidable bubble in the pipeline so the overall
-	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
-	 * of 4 byte/cycle.  Still not bad.
-	 */
-#	undef N
-#	undef Nrot
-#	define N	(MEM_LAT + 5)		/* number of stages */
-#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */
-
-#define LOG_LOOP_SIZE	6
-
-.memcpy_long:
-	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
-	and t0=-8,src		// t0 = src & ~7
-	and t2=7,src		// t2 = src & 7
-	;;
-	ld8 t0=[t0]		// t0 = 1st source word
-	adds src2=7,src		// src2 = (src + 7)
-	sub t4=r0,dst		// t4 = -dst
-	;;
-	and src2=-8,src2	// src2 = (src + 7) & ~7
-	shl t2=t2,3		// t2 = 8*(src & 7)
-	shl t4=t4,3		// t4 = 8*(dst & 7)
-	;;
-	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
-	sub t3=64,t2		// t3 = 64-8*(src & 7)
-	shr.u t0=t0,t2
-	;;
-	add src_end=src,in2
-	shl t1=t1,t3
-	mov pr=t4,0x38		// (p5,p4,p3)=(dst & 7)
-	;;
-	or t0=t0,t1
-	mov cnt=r0
-	adds src_end=-1,src_end
-	;;
-(p3)	st1 [dst]=t0,1
-(p3)	shr.u t0=t0,8
-(p3)	adds cnt=1,cnt
-	;;
-(p4)	st2 [dst]=t0,2
-(p4)	shr.u t0=t0,16
-(p4)	adds cnt=2,cnt
-	;;
-(p5)	st4 [dst]=t0,4
-(p5)	adds cnt=4,cnt
-	and src_end=-8,src_end	// src_end = last word of source buffer
-	;;
-
-	// At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy:
-
-1:{	add src=cnt,src			// make src point to remainder of source buffer
-	sub cnt=in2,cnt			// cnt = number of bytes left to copy
-	mov t4=ip
-  }	;;
-	and src2=-8,src			// align source pointer
-	adds t4=.memcpy_loops-1b,t4
-	mov ar.ec=N
-
-	and t0=7,src			// t0 = src & 7
-	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
-	shl cnt=cnt,3			// move bits 0-2 to 3-5
-	;;
-
-	.rotr val[N+1], w[2]
-	.rotp p[N]
-
-	cmp.ne p6,p0=t0,r0		// is src aligned, too?
-	shl t0=t0,LOG_LOOP_SIZE		// t0 = 8*(src & 7)
-	adds t2=-1,t2			// br.ctop is repeat/until
-	;;
-	add t4=t0,t4
-	mov pr=cnt,0x38			// set (p5,p4,p3) to # of bytes last-word bytes to copy
-	mov ar.lc=t2
-	;;
-	nop.m	0
-	;;
-	nop.m	0
-	nop.i	0
-	;;
-	nop.m	0
-	;;
-(p6)	ld8 val[1]=[src2],8		// prime the pump...
-	mov b6=t4
-	br.sptk.few b6
-	;;
-
-.memcpy_tail:
-	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
-	// less than 8) and t0 contains the last few bytes of the src buffer:
-(p5)	st4 [dst]=t0,4
-(p5)	shr.u t0=t0,32
-	mov ar.lc=saved_lc
-	;;
-(p4)	st2 [dst]=t0,2
-(p4)	shr.u t0=t0,16
-	mov ar.pfs=saved_pfs
-	;;
-(p3)	st1 [dst]=t0
-	mov pr=saved_pr,-1
-	br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
-	.align 64
-
-#define COPY(shift,index)									\
- 1: { .mib											\
-	(p[0])		ld8 val[0]=[src2],8;							\
-	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
-			brp.loop.imp 1b, 2f							\
-    };												\
- 2: { .mfb											\
-	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
-			nop.f 0;								\
-			br.ctop.dptk.few 1b;							\
-    };												\
-			;;									\
-			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
-			;;									\
-			shrp t0=val[N-1],val[N-index],shift;					\
-			br .memcpy_tail
-.memcpy_loops:
-	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
-	COPY(8, 0)
-	COPY(16, 0)
-	COPY(24, 0)
-	COPY(32, 0)
-	COPY(40, 0)
-	COPY(48, 0)
-	COPY(56, 0)
-
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
deleted file mode 100644
index c0d4362217ae..000000000000
--- a/arch/ia64/lib/memcpy_mck.S
+++ /dev/null
@@ -1,659 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Itanium 2-optimized version of memcpy and copy_user function
- *
- * Inputs:
- * 	in0:	destination address
- *	in1:	source address
- *	in2:	number of bytes to copy
- * Output:
- *	for memcpy:    return dest
- * 	for copy_user: return 0 if success,
- *		       or number of byte NOT copied if error occurred.
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
- */
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define EK(y...) EX(y)
-
-/* McKinley specific optimization */
-
-#define retval		r8
-#define saved_pfs	r31
-#define saved_lc	r10
-#define saved_pr	r11
-#define saved_in0	r14
-#define saved_in1	r15
-#define saved_in2	r16
-
-#define src0		r2
-#define src1		r3
-#define dst0		r17
-#define dst1		r18
-#define cnt		r9
-
-/* r19-r30 are temp for each code section */
-#define PREFETCH_DIST	8
-#define src_pre_mem	r19
-#define dst_pre_mem	r20
-#define src_pre_l2	r21
-#define dst_pre_l2	r22
-#define t1		r23
-#define t2		r24
-#define t3		r25
-#define t4		r26
-#define t5		t1	// alias!
-#define t6		t2	// alias!
-#define t7		t3	// alias!
-#define n8		r27
-#define t9		t5	// alias!
-#define t10		t4	// alias!
-#define t11		t7	// alias!
-#define t12		t6	// alias!
-#define t14		t10	// alias!
-#define t13		r28
-#define t15		r29
-#define tmp		r30
-
-/* defines for long_copy block */
-#define	A	0
-#define B	(PREFETCH_DIST)
-#define C	(B + PREFETCH_DIST)
-#define D	(C + 1)
-#define N	(D + 1)
-#define Nrot	((N + 7) & ~7)
-
-/* alias */
-#define in0		r32
-#define in1		r33
-#define in2		r34
-
-GLOBAL_ENTRY(memcpy)
-	and	r28=0x7,in0
-	and	r29=0x7,in1
-	mov	f6=f0
-	mov	retval=in0
-	br.cond.sptk .common_code
-	;;
-END(memcpy)
-EXPORT_SYMBOL(memcpy)
-GLOBAL_ENTRY(__copy_user)
-	.prologue
-// check dest alignment
-	and	r28=0x7,in0
-	and	r29=0x7,in1
-	mov	f6=f1
-	mov	saved_in0=in0	// save dest pointer
-	mov	saved_in1=in1	// save src pointer
-	mov	retval=r0	// initialize return value
-	;;
-.common_code:
-	cmp.gt	p15,p0=8,in2	// check for small size
-	cmp.ne	p13,p0=0,r28	// check dest alignment
-	cmp.ne	p14,p0=0,r29	// check src alignment
-	add	src0=0,in1
-	sub	r30=8,r28	// for .align_dest
-	mov	saved_in2=in2	// save len
-	;;
-	add	dst0=0,in0
-	add	dst1=1,in0	// dest odd index
-	cmp.le	p6,p0 = 1,r30	// for .align_dest
-(p15)	br.cond.dpnt .memcpy_short
-(p13)	br.cond.dpnt .align_dest
-(p14)	br.cond.dpnt .unaligned_src
-	;;
-
-// both dest and src are aligned on 8-byte boundary
-.aligned_src:
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-	.save pr, saved_pr
-	mov	saved_pr=pr
-
-	shr.u	cnt=in2,7	// this much cache line
-	;;
-	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
-	cmp.lt	p7,p8=1,cnt
-	.save ar.lc, saved_lc
-	mov	saved_lc=ar.lc
-	.body
-	add	cnt=-1,cnt
-	add	src_pre_mem=0,in1	// prefetch src pointer
-	add	dst_pre_mem=0,in0	// prefetch dest pointer
-	;;
-(p7)	mov	ar.lc=cnt	// prefetch count
-(p8)	mov	ar.lc=r0
-(p6)	br.cond.dpnt .long_copy
-	;;
-
-.prefetch:
-	lfetch.fault	  [src_pre_mem], 128
-	lfetch.fault.excl [dst_pre_mem], 128
-	br.cloop.dptk.few .prefetch
-	;;
-
-.medium_copy:
-	and	tmp=31,in2	// copy length after iteration
-	shr.u	r29=in2,5	// number of 32-byte iteration
-	add	dst1=8,dst0	// 2nd dest pointer
-	;;
-	add	cnt=-1,r29	// ctop iteration adjustment
-	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
-	add	src1=8,src0	// 2nd src pointer
-	cmp.le	p6,p0=8,tmp
-	;;
-	cmp.le	p7,p0=16,tmp
-	mov	ar.lc=cnt	// loop setup
-	cmp.eq	p16,p17 = r0,r0
-	mov	ar.ec=2
-(p10)	br.dpnt.few .aligned_src_tail
-	;;
-	TEXT_ALIGN(32)
-1:
-EX(.ex_handler, (p16)	ld8	r34=[src0],16)
-EK(.ex_handler, (p16)	ld8	r38=[src1],16)
-EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
-EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
-	;;
-EX(.ex_handler, (p16)	ld8	r32=[src0],16)
-EK(.ex_handler, (p16)	ld8	r36=[src1],16)
-EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
-EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
-	br.ctop.dptk.few 1b
-	;;
-
-.aligned_src_tail:
-EX(.ex_handler, (p6)	ld8	t1=[src0])
-	mov	ar.lc=saved_lc
-	mov	ar.pfs=saved_pfs
-EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
-	cmp.le	p8,p0=24,tmp
-	and	r21=-8,tmp
-	;;
-EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
-EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
-	and	in2=7,tmp	// remaining length
-EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
-	add	src0=src0,r21	// setting up src pointer
-	add	dst0=dst0,r21	// setting up dest pointer
-	;;
-EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
-	mov	pr=saved_pr,-1
-	br.dptk.many .memcpy_short
-	;;
-
-/* code taken from copy_page_mck */
-.long_copy:
-	.rotr v[2*PREFETCH_DIST]
-	.rotp p[N]
-
-	mov src_pre_mem = src0
-	mov pr.rot = 0x10000
-	mov ar.ec = 1				// special unrolled loop
-
-	mov dst_pre_mem = dst0
-
-	add src_pre_l2 = 8*8, src0
-	add dst_pre_l2 = 8*8, dst0
-	;;
-	add src0 = 8, src_pre_mem		// first t1 src
-	mov ar.lc = 2*PREFETCH_DIST - 1
-	shr.u cnt=in2,7				// number of lines
-	add src1 = 3*8, src_pre_mem		// first t3 src
-	add dst0 = 8, dst_pre_mem		// first t1 dst
-	add dst1 = 3*8, dst_pre_mem		// first t3 dst
-	;;
-	and tmp=127,in2				// remaining bytes after this block
-	add cnt = -(2*PREFETCH_DIST) - 1, cnt
-	// same as .line_copy loop, but with all predicated-off instructions removed:
-.prefetch_loop:
-EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
-EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
-	br.ctop.sptk .prefetch_loop
-	;;
-	cmp.eq p16, p0 = r0, r0			// reset p16 to 1
-	mov ar.lc = cnt
-	mov ar.ec = N				// # of stages in pipeline
-	;;
-.line_copy:
-EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
-EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
-EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
-EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
-	;;
-EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
-EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t1, 8)			// M2
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t3, 8)			// M3
-	;;
-EX(.ex_handler,	(p[D])	ld8  t5 = [src0], 8)
-EK(.ex_handler,	(p[D])	ld8  t7 = [src1], 3*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t2, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t4, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8  t6 = [src0], 3*8)
-EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t5, 8)
-EK(.ex_handler,	(p[D])	st8 [dst1] =  t7, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8  t9 = [src0], 3*8)
-EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t6, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
-	;;
-EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
-EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] =  t9, 3*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
-	;;
-EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
-EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
-EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
-	;;
-EX(.ex_handler,	(p[C])	ld8  t1 = [src0], 8)
-EK(.ex_handler,	(p[C])	ld8  t3 = [src1], 8)
-EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
-EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
-	br.ctop.sptk .line_copy
-	;;
-
-	add dst0=-8,dst0
-	add src0=-8,src0
-	mov in2=tmp
-	.restore sp
-	br.sptk.many .medium_copy
-	;;
-
-#define BLOCK_SIZE	128*32
-#define blocksize	r23
-#define curlen		r24
-
-// dest is on 8-byte boundary, src is not. We need to do
-// ld8-ld8, shrp, then st8.  Max 8 byte copy per cycle.
-.unaligned_src:
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc	saved_pfs=ar.pfs,3,5,0,8
-	.save ar.lc, saved_lc
-	mov	saved_lc=ar.lc
-	.save pr, saved_pr
-	mov	saved_pr=pr
-	.body
-.4k_block:
-	mov	saved_in0=dst0	// need to save all input arguments
-	mov	saved_in2=in2
-	mov	blocksize=BLOCK_SIZE
-	;;
-	cmp.lt	p6,p7=blocksize,in2
-	mov	saved_in1=src0
-	;;
-(p6)	mov	in2=blocksize
-	;;
-	shr.u	r21=in2,7	// this much cache line
-	shr.u	r22=in2,4	// number of 16-byte iteration
-	and	curlen=15,in2	// copy length after iteration
-	and	r30=7,src0	// source alignment
-	;;
-	cmp.lt	p7,p8=1,r21
-	add	cnt=-1,r21
-	;;
-
-	add	src_pre_mem=0,src0	// prefetch src pointer
-	add	dst_pre_mem=0,dst0	// prefetch dest pointer
-	and	src0=-8,src0		// 1st src pointer
-(p7)	mov	ar.lc = cnt
-(p8)	mov	ar.lc = r0
-	;;
-	TEXT_ALIGN(32)
-1:	lfetch.fault	  [src_pre_mem], 128
-	lfetch.fault.excl [dst_pre_mem], 128
-	br.cloop.dptk.few 1b
-	;;
-
-	shladd	dst1=r22,3,dst0	// 2nd dest pointer
-	shladd	src1=r22,3,src0	// 2nd src pointer
-	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
-	cmp.le	p6,p7=8,curlen;	// have at least 8 byte remaining?
-	add	cnt=-1,r22	// ctop iteration adjustment
-	;;
-EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
-EK(.ex_handler, (p9)	ld8	r37=[src1],8)
-(p8)	br.dpnt.few .noloop
-	;;
-
-// The jump address is calculated based on src alignment. The COPYU
-// macro below need to confine its size to power of two, so an entry
-// can be caulated using shl instead of an expensive multiply. The
-// size is then hard coded by the following #define to match the
-// actual size.  This make it somewhat tedious when COPYU macro gets
-// changed and this need to be adjusted to match.
-#define LOOP_SIZE 6
-1:
-	mov	r29=ip		// jmp_table thread
-	mov	ar.lc=cnt
-	;;
-	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
-	shl	r28=r30, LOOP_SIZE	// jmp_table thread
-	mov	ar.ec=2		// loop setup
-	;;
-	add	r29=r29,r28		// jmp_table thread
-	cmp.eq	p16,p17=r0,r0
-	;;
-	mov	b6=r29			// jmp_table thread
-	;;
-	br.cond.sptk.few b6
-
-// for 8-15 byte case
-// We will skip the loop, but need to replicate the side effect
-// that the loop produces.
-.noloop:
-EX(.ex_handler, (p6)	ld8	r37=[src1],8)
-	add	src0=8,src0
-(p6)	shl	r25=r30,3
-	;;
-EX(.ex_handler, (p6)	ld8	r27=[src1])
-(p6)	shr.u	r28=r37,r25
-(p6)	sub	r26=64,r25
-	;;
-(p6)	shl	r27=r27,r26
-	;;
-(p6)	or	r21=r28,r27
-
-.unaligned_src_tail:
-/* check if we have more than blocksize to copy, if so go back */
-	cmp.gt	p8,p0=saved_in2,blocksize
-	;;
-(p8)	add	dst0=saved_in0,blocksize
-(p8)	add	src0=saved_in1,blocksize
-(p8)	sub	in2=saved_in2,blocksize
-(p8)	br.dpnt	.4k_block
-	;;
-
-/* we have up to 15 byte to copy in the tail.
- * part of work is already done in the jump table code
- * we are at the following state.
- * src side:
- * 
- *   xxxxxx xx                   <----- r21 has xxxxxxxx already
- * -------- -------- --------
- * 0        8        16
- *          ^
- *          |
- *          src1
- * 
- * dst
- * -------- -------- --------
- * ^
- * |
- * dst1
- */
-EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
-(p6)	add	curlen=-8,curlen	// update length
-	mov	ar.pfs=saved_pfs
-	;;
-	mov	ar.lc=saved_lc
-	mov	pr=saved_pr,-1
-	mov	in2=curlen	// remaining length
-	mov	dst0=dst1	// dest pointer
-	add	src0=src1,r30	// forward by src alignment
-	;;
-
-// 7 byte or smaller.
-.memcpy_short:
-	cmp.le	p8,p9   = 1,in2
-	cmp.le	p10,p11 = 2,in2
-	cmp.le	p12,p13 = 3,in2
-	cmp.le	p14,p15 = 4,in2
-	add	src1=1,src0	// second src pointer
-	add	dst1=1,dst0	// second dest pointer
-	;;
-
-EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
-EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
-(p9)	br.ret.dpnt rp		// 0 byte copy
-	;;
-
-EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
-EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
-(p11)	br.ret.dpnt rp		// 1 byte copy
-
-EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
-EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
-(p13)	br.ret.dpnt rp		// 2 byte copy
-	;;
-
-	cmp.le	p6,p7   = 5,in2
-	cmp.le	p8,p9   = 6,in2
-	cmp.le	p10,p11 = 7,in2
-
-EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
-EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
-(p15)	br.ret.dpnt rp		// 3 byte copy
-	;;
-
-EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
-EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
-(p7)	br.ret.dpnt rp		// 4 byte copy
-	;;
-
-EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
-EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
-(p9)	br.ret.dptk rp		// 5 byte copy
-
-EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
-(p11)	br.ret.dptk rp		// 6 byte copy
-	;;
-
-EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
-	br.ret.dptk rp		// done all cases
-
-
-/* Align dest to nearest 8-byte boundary. We know we have at
- * least 7 bytes to copy, enough to crawl to 8-byte boundary.
- * Actual number of byte to crawl depend on the dest alignment.
- * 7 byte or less is taken care at .memcpy_short
-
- * src0 - source even index
- * src1 - source  odd index
- * dst0 - dest even index
- * dst1 - dest  odd index
- * r30  - distance to 8-byte boundary
- */
-
-.align_dest:
-	add	src1=1,in1	// source odd index
-	cmp.le	p7,p0 = 2,r30	// for .align_dest
-	cmp.le	p8,p0 = 3,r30	// for .align_dest
-EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
-	cmp.le	p9,p0 = 4,r30	// for .align_dest
-	cmp.le	p10,p0 = 5,r30
-	;;
-EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
-EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
-	cmp.le	p11,p0 = 6,r30
-EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
-	cmp.le	p12,p0 = 7,r30
-	;;
-EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
-EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
-EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
-EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
-	;;
-EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
-EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
-	cmp.eq	p6,p7=r28,r29
-EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
-EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
-	sub	in2=in2,r30
-	;;
-EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
-EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
-	add	dst0=in0,r30	// setup arguments
-	add	src0=in1,r30
-(p6)	br.cond.dptk .aligned_src
-(p7)	br.cond.dpnt .unaligned_src
-	;;
-
-/* main loop body in jump table format */
-#define COPYU(shift)									\
-1:											\
-EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
-EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
-		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
-EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
-		 nop.m	0;								\
-		 (p16)	shrp	r38=r36,r37,shift;					\
-EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
-EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
-		 br.ctop.dptk.few 1b;;							\
-		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
-		 shrp	r21=r22,r38,shift;	/* speculative work */			\
-		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
-		 ;;
-	TEXT_ALIGN(32)
-.jump_table:
-	COPYU(8)	// unaligned cases
-.jmp1:
-	COPYU(16)
-	COPYU(24)
-	COPYU(32)
-	COPYU(40)
-	COPYU(48)
-	COPYU(56)
-
-#undef A
-#undef B
-#undef C
-#undef D
-
-/*
- * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
- * instruction failed in the bundle.  The exception algorithm is that we
- * first figure out the faulting address, then detect if there is any
- * progress made on the copy, if so, redo the copy from last known copied
- * location up to the faulting address (exclusive). In the copy_from_user
- * case, remaining byte in kernel buffer will be zeroed.
- *
- * Take copy_from_user as an example, in the code there are multiple loads
- * in a bundle and those multiple loads could span over two pages, the
- * faulting address is calculated as page_round_down(max(src0, src1)).
- * This is based on knowledge that if we can access one byte in a page, we
- * can access any byte in that page.
- *
- * predicate used in the exception handler:
- * p6-p7: direction
- * p10-p11: src faulting addr calculation
- * p12-p13: dst faulting addr calculation
- */
-
-#define A	r19
-#define B	r20
-#define C	r21
-#define D	r22
-#define F	r28
-
-#define saved_retval	loc0
-#define saved_rtlink	loc1
-#define saved_pfs_stack	loc2
-
-.ex_hndlr_s:
-	add	src0=8,src0
-	br.sptk .ex_handler
-	;;
-.ex_hndlr_d:
-	add	dst0=8,dst0
-	br.sptk .ex_handler
-	;;
-.ex_hndlr_lcpy_1:
-	mov	src1=src_pre_mem
-	mov	dst1=dst_pre_mem
-	cmp.gtu	p10,p11=src_pre_mem,saved_in1
-	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
-	;;
-(p10)	add	src0=8,saved_in1
-(p11)	mov	src0=saved_in1
-(p12)	add	dst0=8,saved_in0
-(p13)	mov	dst0=saved_in0
-	br.sptk	.ex_handler
-.ex_handler_lcpy:
-	// in line_copy block, the preload addresses should always ahead
-	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
-	// always ahead of src0/dst0.
-	mov	src1=src_pre_mem
-	mov	dst1=dst_pre_mem
-.ex_handler:
-	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
-	mov	ar.lc=saved_lc
-	mov	ar.pfs=saved_pfs
-	;;
-.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
-	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
-	cmp.ltu	p10,p11=src0,src1
-	cmp.ltu	p12,p13=dst0,dst1
-	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
-	mov	tmp = dst0
-	;;
-(p11)	mov	src1 = src0		// pick the larger of the two
-(p13)	mov	dst0 = dst1		// make dst0 the smaller one
-(p13)	mov	dst1 = tmp		// and dst1 the larger one
-	;;
-(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
-(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
-	;;
-(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
-(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
-	mov	retval=saved_in2
-(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
-(p8)	st1	[dst1]=r0		// force an oops for memcpy call
-(p14)	br.ret.sptk.many rp
-
-/*
- * The remaining byte to copy is calculated as:
- *
- * A =	(faulting_addr - orig_src)	-> len to faulting ld address
- *	or 
- * 	(faulting_addr - orig_dst)	-> len to faulting st address
- * B =	(cur_dst - orig_dst)		-> len copied so far
- * C =	A - B				-> len need to be copied
- * D =	orig_len - A			-> len need to be left along
- */
-(p6)	sub	A = F, saved_in0
-(p7)	sub	A = F, saved_in1
-	clrrrb
-	;;
-	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
-	cmp.lt	p8,p0=A,r0
-	sub	B = dst0, saved_in0	// how many byte copied so far
-	;;
-(p8)	mov	A = 0;			// A shouldn't be negative, cap it
-	;;
-	sub	C = A, B
-	sub	D = saved_in2, A
-	;;
-	cmp.gt	p8,p0=C,r0		// more than 1 byte?
-	mov	r8=0
-	mov	saved_retval = D
-	mov	saved_rtlink = b0
-
-	add	out0=saved_in0, B
-	add	out1=saved_in1, B
-	mov	out2=C
-(p8)	br.call.sptk.few b0=__copy_user	// recursive call
-	;;
-
-	add	saved_retval=saved_retval,r8	// above might return non-zero value
-	;;
-
-	mov	retval=saved_retval
-	mov	ar.pfs=saved_pfs_stack
-	mov	b0=saved_rtlink
-	br.ret.sptk.many rp
-
-/* end of McKinley specific optimization */
-END(__copy_user)
-EXPORT_SYMBOL(__copy_user)
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
deleted file mode 100644
index 552c5c7e4d06..000000000000
--- a/arch/ia64/lib/memset.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Optimized version of the standard memset() function.
-
-   Copyright (c) 2002 Hewlett-Packard Co/CERN
-	Sverre Jarp <Sverre.Jarp@cern.ch>
-
-   Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    value
-        in2:    count
-
-   The algorithm is fairly straightforward: set byte by byte until we
-   we get to a 16B-aligned address, then loop on 128 B chunks using an
-   early store as prefetching, then loop on 32B chucks, then clear remaining
-   words, finally clear remaining bytes.
-   Since a stf.spill f0 can store 16B in one go, we use this instruction
-   to get peak speed when value = 0.  */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-#undef ret
-
-#define dest		in0
-#define value		in1
-#define	cnt		in2
-
-#define tmp		r31
-#define save_lc		r30
-#define ptr0		r29
-#define ptr1		r28
-#define ptr2		r27
-#define ptr3		r26
-#define ptr9 		r24
-#define	loopcnt		r23
-#define linecnt		r22
-#define bytecnt		r21
-
-#define fvalue		f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr		p6			// default register for same-cycle branches
-#define p_nz		p7
-#define p_zr		p8
-#define p_unalgn	p9
-#define p_y		p11
-#define p_n		p12
-#define p_yy		p13
-#define p_nn		p14
-
-#define MIN1		15
-#define MIN1P1HALF	8
-#define LINE_SIZE	128
-#define LSIZE_SH        7			// shift amount
-#define PREF_AHEAD	8
-
-GLOBAL_ENTRY(memset)
-{ .mmi
-	.prologue
-	alloc	tmp = ar.pfs, 3, 0, 0, 0
-	lfetch.nt1 [dest]			//
-	.save   ar.lc, save_lc
-	mov.i	save_lc = ar.lc
-	.body
-} { .mmi
-	mov	ret0 = dest			// return value
-	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
-	cmp.eq	p_scr, p0 = cnt, r0
-;; }
-{ .mmi
-	and	ptr2 = -(MIN1+1), dest		// aligned address
-	and	tmp = MIN1, dest		// prepare to check for correct alignment
-	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
-} { .mib
-	mov	ptr1 = dest
-	mux1	value = value, @brcst		// create 8 identical bytes in word
-(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
-;; }
-{ .mib
-	cmp.ne	p_unalgn, p0 = tmp, r0		//
-} { .mib
-	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
-	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
-(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
-(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
-;; }
-{ .mib
-(p_y)	add	cnt = -8, cnt			//
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
-} { .mib
-(p_y)	st8	[ptr2] = value,-4		//
-(p_n)	add	ptr2 = 4, ptr2			//
-;; }
-{ .mib
-(p_yy)	add	cnt = -4, cnt			//
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
-} { .mib
-(p_yy)	st4	[ptr2] = value,-2		//
-(p_nn)	add	ptr2 = 2, ptr2			//
-;; }
-{ .mmi
-	mov	tmp = LINE_SIZE+1		// for compare
-(p_y)	add	cnt = -2, cnt			//
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
-} { .mmi
-	setf.sig fvalue=value			// transfer value to FLP side
-(p_y)	st2	[ptr2] = value,-1		//
-(p_n)	add	ptr2 = 1, ptr2			//
-;; }
-
-{ .mmi
-(p_yy)	st1	[ptr2] = value 			//
-  	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
-} { .mbb
-(p_yy)	add	cnt = -1, cnt			//
-(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
-;; }
-
-{ .mib
-	nop.m 0
-	shr.u	linecnt = cnt, LSIZE_SH
-(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
-;; }
-
-	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
-} { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt		//
-	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	mov.i	ar.lc = loopcnt			//
-;; }
-.pref_l1a:
-{ .mib
-	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
-	nop.i	0
-	br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2			// Two stores in parallel
-	mov.i	ar.lc = tmp			//
-;; }
-.l1ax:
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 8
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 24
-	stf8 [ptr0] = fvalue, 24
- ;; }
- { .mmi
-	stf8 [ptr2] = fvalue, 8
-	stf8 [ptr0] = fvalue, 32
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	stf8 [ptr2] = fvalue, 24
-(p_scr)	stf8 [ptr9] = fvalue, 128
-	br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
-	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
-	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
-;; }
-
-	TEXT_ALIGN(32)
-.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
-} { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt
-	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	mov.i	ar.lc = loopcnt
-;; }
-.pref_l1b:
-{ .mib
-	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
-	nop.i   0
-	br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2			// Two stores in parallel
-	mov.i	ar.lc = tmp
-;; }
-.l1bx:
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 64
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	stf.spill [ptr2] = f0, 32
-(p_scr)	stf.spill [ptr9] = f0, 128
-	br.cloop.dptk.few .l1bx
-;; }
-{ .mib
-	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
-;; }
-
-.fraction_of_line:
-{ .mib
-	add	ptr2 = 16, ptr1
-	shr.u	loopcnt = cnt, 5   		// loopcnt = cnt / 32
-;; }
-{ .mib
-	cmp.eq	p_scr, p0 = loopcnt, r0
-	add	loopcnt = -1, loopcnt
-(p_scr)	br.cond.dpnt.many .store_words
-;; }
-{ .mib
-	and	cnt = 0x1f, cnt			// compute the remaining cnt
-	mov.i   ar.lc = loopcnt
-;; }
-	TEXT_ALIGN(32)
-.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
-{ .mmb
-	stf8	[ptr1] = fvalue, 8
-	stf8	[ptr2] = fvalue, 8
-;; } { .mmb
-	stf8	[ptr1] = fvalue, 24
-	stf8	[ptr2] = fvalue, 24
-	br.cloop.dptk.many .l2
-;; }
-.store_words:
-{ .mib
-	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
-;; }
-
-{ .mmi
-	stf8	[ptr1] = fvalue, 8		// store
-	cmp.le	p_y, p_n = 16, cnt
-	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi
-(p_y)	stf8	[ptr1] = fvalue, 8		// store
-(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
-(p_y)	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi						// store
-(p_yy)	stf8	[ptr1] = fvalue, 8
-(p_yy)	add	cnt = -8, cnt			// subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
-	cmp.eq	p_scr, p0 = cnt, r0
-	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
-(p_scr)	br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y)	st4	[ptr1] = value,4
-	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy)	st2	[ptr1] = value,2
-	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
-;; }
-
-{ .mib
-(p_y)	st1	[ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
-	nop.m	0
-	mov.i	ar.lc = save_lc
-	br.ret.sptk.many rp
-;; }
-
-.move_bytes_unaligned:
-{ .mmi
-       .pred.rel "mutex",p_y, p_n
-       .pred.rel "mutex",p_yy, p_nn
-(p_n)	cmp.le  p_yy, p_nn = 4, cnt
-(p_y)	cmp.le  p_yy, p_nn = 5, cnt
-(p_n)	add	ptr2 = 2, ptr1
-} { .mmi
-(p_y)	add	ptr2 = 3, ptr1
-(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
-(p_y)	add	cnt = -1, cnt
-;; }
-{ .mmi
-(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
-	add	ptr3 = ptr1, cnt		// prepare last store
-	mov.i	ar.lc = save_lc
-} { .mmi
-(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (o less) left]
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmi
-(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
-	add	ptr3 = -1, ptr3			// last store
-	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
-} { .mmi
-(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
-(p_y)	add	cnt = -4, cnt
-;; }
-{ .mmi
-(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
-	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
-} { .mmi
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
-(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
-	br.ret.sptk.many rp
-}
-END(memset)
-EXPORT_SYMBOL(memset)
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
deleted file mode 100644
index 1f4a46c15127..000000000000
--- a/arch/ia64/lib/strlen.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *
- * Optimized version of the standard strlen() function
- *
- *
- * Inputs:
- *	in0	address of string
- *
- * Outputs:
- *	ret0	the number of characters in the string (0 if empty string)
- *	does not count the \0
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *	Stephane Eranian <eranian@hpl.hp.com>
- *
- * 09/24/99 S.Eranian add speculation recovery code
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-//
-//
-// This is an enhanced version of the basic strlen. it includes a combination
-// of compute zero index (czx), parallel comparisons, speculative loads and
-// loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-//	  The goal is to look at the string in chunks of 8 bytes.
-//	  so we need to do a few extra checks at the beginning because the
-//	  string may not be 8-byte aligned. In this case we load the 8byte
-//	  quantity which includes the start of the string and mask the unused
-//	  bytes with 0xff to avoid confusing czx.
-//	  We use speculative loads and software pipelining to hide memory
-//	  latency and do read ahead safely. This way we defer any exception.
-//
-//	  Because we don't want the kernel to be relying on particular
-//	  settings of the DCR register, we provide recovery code in case
-//	  speculation fails. The recovery code is going to "redo" the work using
-//	  only normal loads. If we still get a fault then we generate a
-//	  kernel panic. Otherwise we return the strlen as usual.
-//
-//	  The fact that speculation may fail can be caused, for instance, by
-//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-//	  a NaT bit will be set if the translation is not present. The normal
-//	  load, on the other hand, will cause the translation to be inserted
-//	  if the mapping exists.
-//
-//	  It should be noted that we execute recovery code only when we need
-//	  to use the data that has been speculatively loaded: we don't execute
-//	  recovery code on pure read ahead data.
-//
-// Remarks:
-//	- the cmp r0,r0 is used as a fast way to initialize a predicate
-//	  register to 1. This is required to make sure that we get the parallel
-//	  compare correct.
-//
-//	- we don't use the epilogue counter to exit the loop but we need to set
-//	  it to zero beforehand.
-//
-//	- after the loop we must test for Nat values because neither the
-//	  czx nor cmp instruction raise a NaT consumption fault. We must be
-//	  careful not to look too far for a Nat for which we don't care.
-//	  For instance we don't need to look at a NaT in val2 if the zero byte
-//	  was in val1.
-//
-//	- Clearly performance tuning is required.
-//
-//
-//
-#define saved_pfs	r11
-#define	tmp		r10
-#define base		r16
-#define orig		r17
-#define saved_pr	r18
-#define src		r19
-#define mask		r20
-#define val		r21
-#define val1		r22
-#define val2		r23
-
-GLOBAL_ENTRY(strlen)
-	.prologue
-	.save ar.pfs, saved_pfs
-	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
-
-	.rotr v[2], w[2]	// declares our 4 aliases
-
-	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
-	mov orig=in0		// keep trackof initial byte address
-	dep src=0,in0,0,3	// src=8byte-aligned in0 address
-	.save pr, saved_pr
-	mov saved_pr=pr		// preserve predicates (rotation)
-	;;
-
-	.body
-
-	ld8 v[1]=[src],8	// must not speculate: can fail here
-	shl tmp=tmp,3		// multiply by 8bits/byte
-	mov mask=-1		// our mask
-	;;
-	ld8.s w[1]=[src],8	// speculatively load next
-	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
-	sub tmp=64,tmp		// how many bits to shift our mask on the right
-	;;
-	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
-	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
-	;;
-	add base=-16,src	// keep track of aligned base
-	or v[1]=v[1],mask	// now we have a safe initial byte pattern
-	;;
-1:
-	ld8.s v[0]=[src],8	// speculatively load next
-	czx1.r val1=v[1]	// search 0 byte from right
-	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
-	;;
-	ld8.s w[0]=[src],8	// speculatively load next to next
-	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
-	cmp.eq.and p6,p0=8,val2	// p6 = p6 and mask==8
-(p6)	br.wtop.dptk 1b		// loop until p6 == 0
-	;;
-	//
-	// We must return try the recovery code iff
-	// val1_is_nat || (val1==8 && val2_is_nat)
-	//
-	// XXX Fixme
-	//	- there must be a better way of doing the test
-	//
-	cmp.eq  p8,p9=8,val1	// p6 = val1 had zero (disambiguate)
-	tnat.nz p6,p7=val1	// test NaT on val1
-(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
-	;;
-	//
-	// if we come here p7 is true, i.e., initialized for // cmp
-	//
-	cmp.eq.and  p7,p0=8,val1// val1==8?
-	tnat.nz.and p7,p0=val2	// test NaT if val2
-(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
-	;;
-(p8)	mov val1=val2		// the other test got us out of the loop
-(p8)	adds src=-16,src	// correct position when 3 ahead
-(p9)	adds src=-24,src	// correct position when 4 ahead
-	;;
-	sub ret0=src,orig	// distance from base
-	sub tmp=8,val1		// which byte in word
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	sub ret0=ret0,tmp	// adjust
-	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
-	br.ret.sptk.many rp	// end of normal execution
-
-	//
-	// Outlined recovery code when speculation failed
-	//
-	// This time we don't use speculation and rely on the normal exception
-	// mechanism. that's why the loop is not as good as the previous one
-	// because read ahead is not possible
-	//
-	// IMPORTANT:
-	// Please note that in the case of strlen() as opposed to strlen_user()
-	// we don't use the exception mechanism, as this function is not
-	// supposed to fail. If that happens it means we have a bug and the
-	// code will cause of kernel fault.
-	//
-	// XXX Fixme
-	//	- today we restart from the beginning of the string instead
-	//	  of trying to continue where we left off.
-	//
-.recover:
-	ld8 val=[base],8	// will fail if unrecoverable fault
-	;;
-	or val=val,mask		// remask first bytes
-	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
-	;;
-	//
-	// ar.ec is still zero here
-	//
-2:
-(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
-	;;
-	czx1.r val1=val		// search 0 byte from right
-	;;
-	cmp.eq p6,p0=8,val1	// val1==8 ?
-(p6)	br.wtop.dptk 2b		// loop until p6 == 0
-	;;			// (avoid WAW on p63)
-	sub ret0=base,orig	// distance from base
-	sub tmp=8,val1
-	mov pr=saved_pr,0xffffffffffff0000
-	;;
-	sub ret0=ret0,tmp	// length=now - back -1
-	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
-	br.ret.sptk.many rp	// end of successful recovery code
-END(strlen)
-EXPORT_SYMBOL(strlen)
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
deleted file mode 100644
index a287169bd953..000000000000
--- a/arch/ia64/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- *	in0:	address of destination buffer
- *	in1:	address of string to be copied
- *	in2:	length of buffer in bytes
- * Outputs:
- *	r8:	-EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
- *			 by Andreas Schwab <schwab@suse.de>).
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
-	alloc r2=ar.pfs,3,0,0,0
-	mov r8=0
-	mov r9=in1
-	;;
-	add r10=in1,in2
-	cmp.eq p6,p0=r0,in2
-(p6)	br.ret.spnt.many rp
-
-	// XXX braindead copy loop---this needs to be optimized
-.Loop1:
-	EX(.Lexit, ld1 r8=[in1],1)
-	;;
-	EX(.Lexit, st1 [in0]=r8,1)
-	cmp.ne p6,p7=r8,r0
-	;;
-(p6)	cmp.ne.unc p8,p0=in1,r10
-(p8)	br.cond.dpnt.few .Loop1
-	;;
-(p6)	mov r8=in2		// buffer filled up---return buffer length
-(p7)	sub r8=in1,r9,1		// return string length (excluding NUL character)
-[.Lexit:]
-	br.ret.sptk.many rp
-END(__strncpy_from_user)
-EXPORT_SYMBOL(__strncpy_from_user)
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
deleted file mode 100644
index a7eb56e840a9..000000000000
--- a/arch/ia64/lib/strnlen_user.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen.
- *
- * Inputs:
- *	in0:	address of buffer
- *	in1:	string length limit N
- * Outputs:
- *	r8:	0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
-	.prologue
-	alloc r2=ar.pfs,2,0,0,0
-	.save ar.lc, r16
-	mov r16=ar.lc			// preserve ar.lc
-
-	.body
-
-	add r3=-1,in1
-	;;
-	mov ar.lc=r3
-	mov r9=0
-	;;
-	// XXX braindead strlen loop---this needs to be optimized
-.Loop1:
-	EXCLR(.Lexit, ld1 r8=[in0],1)
-	add r9=1,r9
-	;;
-	cmp.eq p6,p0=r8,r0
-(p6)	br.cond.dpnt .Lexit
-	br.cloop.dptk.few .Loop1
-
-	add r9=1,in1			// NUL not found---return N+1
-	;;
-.Lexit:
-	mov r8=r9
-	mov ar.lc=r16			// restore ar.lc
-	br.ret.sptk.many rp
-END(__strnlen_user)
-EXPORT_SYMBOL(__strnlen_user)
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
deleted file mode 100644
index 6e2a69662c06..000000000000
--- a/arch/ia64/lib/xor.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- */
-
-#include <linux/export.h>
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 3, 0, 13, 16
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	;;
-	.rotr s1[6+1], s2[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
-	nop.f 0
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_2)
-EXPORT_SYMBOL(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 4, 0, 20, 24
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov r18 = in3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-	;;
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6])	xor d[0] = d[0], s3[6]
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_3)
-EXPORT_SYMBOL(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 5, 0, 27, 32
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov r18 = in3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	mov r19 = in4
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[0])	ld8.nta s4[0] = [r19], 8
-(p[6])	xor r20 = s3[6], s4[6]
-	;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6])	xor d[0] = d[0], r20
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_4)
-EXPORT_SYMBOL(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
-	.prologue
-	.fframe 0
-	.save ar.pfs, r31
-	alloc r31 = ar.pfs, 6, 0, 34, 40
-	.save ar.lc, r30
-	mov r30 = ar.lc
-	.save pr, r29
-	mov r29 = pr
-	;;
-	.body
-	mov r8 = in1
-	mov ar.ec = 6 + 2
-	shr in0 = in0, 3
-	;;
-	adds in0 = -1, in0
-	mov r16 = in1
-	mov r17 = in2
-	;;
-	mov r18 = in3
-	mov ar.lc = in0
-	mov pr.rot = 1 << 16
-	mov r19 = in4
-	mov r20 = in5
-	;;
-	.rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
-	.rotp p[6+2]
-0:
-(p[0])	ld8.nta s1[0] = [r16], 8
-(p[0])	ld8.nta s2[0] = [r17], 8
-(p[6])	xor d[0] = s1[6], s2[6]
-(p[0])	ld8.nta s3[0] = [r18], 8
-(p[0])	ld8.nta s4[0] = [r19], 8
-(p[6])	xor r21 = s3[6], s4[6]
-	;;
-(p[0])	ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6])	xor d[0] = d[0], r21
-	;;
-(p[6])	  xor d[0] = d[0], s5[6]
-	nop.f 0
-	br.ctop.dptk.few 0b
-	;;
-	mov ar.lc = r30
-	mov pr = r29, -1
-	br.ret.sptk.few rp
-END(xor_ia64_5)
-EXPORT_SYMBOL(xor_ia64_5)
diff --git a/arch/ia64/mm/Makefile b/arch/ia64/mm/Makefile
deleted file mode 100644
index c03f63c62ac4..000000000000
--- a/arch/ia64/mm/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the ia64-specific parts of the memory manager.
-#
-
-obj-y := init.o fault.o tlb.o extable.o ioremap.o
-
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA)	   += numa.o
-obj-$(CONFIG_SPARSEMEM)	   += discontig.o
-obj-$(CONFIG_FLATMEM)	   += contig.o
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
deleted file mode 100644
index 1e9eaa107eb7..000000000000
--- a/arch/ia64/mm/contig.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com>
- * Copyright (C) 1999 VA Linux Systems
- * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 2003 Silicon Graphics, Inc. All rights reserved.
- *
- * Routines used by ia64 machines with contiguous (or virtually contiguous)
- * memory.
- */
-#include <linux/efi.h>
-#include <linux/memblock.h>
-#include <linux/mm.h>
-#include <linux/nmi.h>
-#include <linux/swap.h>
-#include <linux/sizes.h>
-
-#include <asm/efi.h>
-#include <asm/meminit.h>
-#include <asm/sections.h>
-#include <asm/mca.h>
-
-/* physical address where the bootmem map is located */
-unsigned long bootmap_start;
-
-#ifdef CONFIG_SMP
-static void *cpu_data;
-/**
- * per_cpu_init - setup per-cpu variables
- *
- * Allocate and setup per-cpu data areas.
- */
-void *per_cpu_init(void)
-{
-	static bool first_time = true;
-	void *cpu0_data = __cpu0_per_cpu;
-	unsigned int cpu;
-
-	if (!first_time)
-		goto skip;
-	first_time = false;
-
-	/*
-	 * get_free_pages() cannot be used before cpu_init() done.
-	 * BSP allocates PERCPU_PAGE_SIZE bytes for all possible CPUs
-	 * to avoid that AP calls get_zeroed_page().
-	 */
-	for_each_possible_cpu(cpu) {
-		void *src = cpu == 0 ? cpu0_data : __phys_per_cpu_start;
-
-		memcpy(cpu_data, src, __per_cpu_end - __per_cpu_start);
-		__per_cpu_offset[cpu] = (char *)cpu_data - __per_cpu_start;
-		per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
-
-		/*
-		 * percpu area for cpu0 is moved from the __init area
-		 * which is setup by head.S and used till this point.
-		 * Update ar.k3.  This move is ensures that percpu
-		 * area for cpu0 is on the correct node and its
-		 * virtual address isn't insanely far from other
-		 * percpu areas which is important for congruent
-		 * percpu allocator.
-		 */
-		if (cpu == 0)
-			ia64_set_kr(IA64_KR_PER_CPU_DATA, __pa(cpu_data) -
-				    (unsigned long)__per_cpu_start);
-
-		cpu_data += PERCPU_PAGE_SIZE;
-	}
-skip:
-	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
-}
-
-static inline __init void
-alloc_per_cpu_data(void)
-{
-	size_t size = PERCPU_PAGE_SIZE * num_possible_cpus();
-
-	cpu_data = memblock_alloc_from(size, PERCPU_PAGE_SIZE,
-				       __pa(MAX_DMA_ADDRESS));
-	if (!cpu_data)
-		panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
-		      __func__, size, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
-}
-
-/**
- * setup_per_cpu_areas - setup percpu areas
- *
- * Arch code has already allocated and initialized percpu areas.  All
- * this function has to do is to teach the determined layout to the
- * dynamic percpu allocator, which happens to be more complex than
- * creating whole new ones using helpers.
- */
-void __init
-setup_per_cpu_areas(void)
-{
-	struct pcpu_alloc_info *ai;
-	struct pcpu_group_info *gi;
-	unsigned int cpu;
-	ssize_t static_size, reserved_size, dyn_size;
-
-	ai = pcpu_alloc_alloc_info(1, num_possible_cpus());
-	if (!ai)
-		panic("failed to allocate pcpu_alloc_info");
-	gi = &ai->groups[0];
-
-	/* units are assigned consecutively to possible cpus */
-	for_each_possible_cpu(cpu)
-		gi->cpu_map[gi->nr_units++] = cpu;
-
-	/* set parameters */
-	static_size = __per_cpu_end - __per_cpu_start;
-	reserved_size = PERCPU_MODULE_RESERVE;
-	dyn_size = PERCPU_PAGE_SIZE - static_size - reserved_size;
-	if (dyn_size < 0)
-		panic("percpu area overflow static=%zd reserved=%zd\n",
-		      static_size, reserved_size);
-
-	ai->static_size		= static_size;
-	ai->reserved_size	= reserved_size;
-	ai->dyn_size		= dyn_size;
-	ai->unit_size		= PERCPU_PAGE_SIZE;
-	ai->atom_size		= PAGE_SIZE;
-	ai->alloc_size		= PERCPU_PAGE_SIZE;
-
-	pcpu_setup_first_chunk(ai, __per_cpu_start + __per_cpu_offset[0]);
-	pcpu_free_alloc_info(ai);
-}
-#else
-#define alloc_per_cpu_data() do { } while (0)
-#endif /* CONFIG_SMP */
-
-/**
- * find_memory - setup memory map
- *
- * Walk the EFI memory map and find usable memory for the system, taking
- * into account reserved areas.
- */
-void __init
-find_memory (void)
-{
-	reserve_memory();
-
-	/* first find highest page frame number */
-	min_low_pfn = ~0UL;
-	max_low_pfn = 0;
-	efi_memmap_walk(find_max_min_low_pfn, NULL);
-	max_pfn = max_low_pfn;
-
-	memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE);
-
-	find_initrd();
-
-	alloc_per_cpu_data();
-}
-
-static int __init find_largest_hole(u64 start, u64 end, void *arg)
-{
-	u64 *max_gap = arg;
-
-	static u64 last_end = PAGE_OFFSET;
-
-	/* NOTE: this algorithm assumes efi memmap table is ordered */
-
-	if (*max_gap < (start - last_end))
-		*max_gap = start - last_end;
-	last_end = end;
-	return 0;
-}
-
-static void __init verify_gap_absence(void)
-{
-	unsigned long max_gap;
-
-	/* Forbid FLATMEM if hole is > than 1G */
-	efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
-	if (max_gap >= SZ_1G)
-		panic("Cannot use FLATMEM with %ldMB hole\n"
-		      "Please switch over to SPARSEMEM\n",
-		      (max_gap >> 20));
-}
-
-/*
- * Set up the page tables.
- */
-
-void __init
-paging_init (void)
-{
-	unsigned long max_dma;
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	max_zone_pfns[ZONE_DMA32] = max_dma;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-
-	verify_gap_absence();
-
-	free_area_init(max_zone_pfns);
-	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
-}
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
deleted file mode 100644
index 73d0db36edb6..000000000000
--- a/arch/ia64/mm/discontig.c
+++ /dev/null
@@ -1,635 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
- * Copyright (c) 2001 Intel Corp.
- * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
- * Copyright (c) 2002 NEC Corp.
- * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
- * Copyright (c) 2004 Silicon Graphics, Inc
- *	Russ Anderson <rja@sgi.com>
- *	Jesse Barnes <jbarnes@sgi.com>
- *	Jack Steiner <steiner@sgi.com>
- */
-
-/*
- * Platform initialization for Discontig Memory
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/nmi.h>
-#include <linux/swap.h>
-#include <linux/memblock.h>
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/nodemask.h>
-#include <linux/slab.h>
-#include <asm/efi.h>
-#include <asm/tlb.h>
-#include <asm/meminit.h>
-#include <asm/numa.h>
-#include <asm/sections.h>
-
-/*
- * Track per-node information needed to setup the boot memory allocator, the
- * per-node areas, and the real VM.
- */
-struct early_node_data {
-	struct ia64_node_data *node_data;
-	unsigned long pernode_addr;
-	unsigned long pernode_size;
-	unsigned long min_pfn;
-	unsigned long max_pfn;
-};
-
-static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-static nodemask_t memory_less_mask __initdata;
-
-pg_data_t *pgdat_list[MAX_NUMNODES];
-
-/*
- * To prevent cache aliasing effects, align per-node structures so that they
- * start at addresses that are strided by node number.
- */
-#define MAX_NODE_ALIGN_OFFSET	(32 * 1024 * 1024)
-#define NODEDATA_ALIGN(addr, node)						\
-	((((addr) + 1024*1024-1) & ~(1024*1024-1)) + 				\
-	     (((node)*PERCPU_PAGE_SIZE) & (MAX_NODE_ALIGN_OFFSET - 1)))
-
-/**
- * build_node_maps - callback to setup mem_data structs for each node
- * @start: physical start of range
- * @len: length of range
- * @node: node where this range resides
- *
- * Detect extents of each piece of memory that we wish to
- * treat as a virtually contiguous block (i.e. each node). Each such block
- * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
- * if necessary.  Any non-existent pages will simply be part of the virtual
- * memmap.
- */
-static int __init build_node_maps(unsigned long start, unsigned long len,
-				  int node)
-{
-	unsigned long spfn, epfn, end = start + len;
-
-	epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
-	spfn = GRANULEROUNDDOWN(start) >> PAGE_SHIFT;
-
-	if (!mem_data[node].min_pfn) {
-		mem_data[node].min_pfn = spfn;
-		mem_data[node].max_pfn = epfn;
-	} else {
-		mem_data[node].min_pfn = min(spfn, mem_data[node].min_pfn);
-		mem_data[node].max_pfn = max(epfn, mem_data[node].max_pfn);
-	}
-
-	return 0;
-}
-
-/**
- * early_nr_cpus_node - return number of cpus on a given node
- * @node: node to check
- *
- * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.  Note that node 0 will also count all non-existent cpus.
- */
-static int early_nr_cpus_node(int node)
-{
-	int cpu, n = 0;
-
-	for_each_possible_early_cpu(cpu)
-		if (node == node_cpuid[cpu].nid)
-			n++;
-
-	return n;
-}
-
-/**
- * compute_pernodesize - compute size of pernode data
- * @node: the node id.
- */
-static unsigned long compute_pernodesize(int node)
-{
-	unsigned long pernodesize = 0, cpus;
-
-	cpus = early_nr_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize = PAGE_ALIGN(pernodesize);
-	return pernodesize;
-}
-
-/**
- * per_cpu_node_setup - setup per-cpu areas on each node
- * @cpu_data: per-cpu area on this node
- * @node: node to setup
- *
- * Copy the static per-cpu data into the region we just set aside and then
- * setup __per_cpu_offset for each CPU on this node.  Return a pointer to
- * the end of the area.
- */
-static void *per_cpu_node_setup(void *cpu_data, int node)
-{
-#ifdef CONFIG_SMP
-	int cpu;
-
-	for_each_possible_early_cpu(cpu) {
-		void *src = cpu == 0 ? __cpu0_per_cpu : __phys_per_cpu_start;
-
-		if (node != node_cpuid[cpu].nid)
-			continue;
-
-		memcpy(__va(cpu_data), src, __per_cpu_end - __per_cpu_start);
-		__per_cpu_offset[cpu] = (char *)__va(cpu_data) -
-			__per_cpu_start;
-
-		/*
-		 * percpu area for cpu0 is moved from the __init area
-		 * which is setup by head.S and used till this point.
-		 * Update ar.k3.  This move is ensures that percpu
-		 * area for cpu0 is on the correct node and its
-		 * virtual address isn't insanely far from other
-		 * percpu areas which is important for congruent
-		 * percpu allocator.
-		 */
-		if (cpu == 0)
-			ia64_set_kr(IA64_KR_PER_CPU_DATA,
-				    (unsigned long)cpu_data -
-				    (unsigned long)__per_cpu_start);
-
-		cpu_data += PERCPU_PAGE_SIZE;
-	}
-#endif
-	return cpu_data;
-}
-
-#ifdef CONFIG_SMP
-/**
- * setup_per_cpu_areas - setup percpu areas
- *
- * Arch code has already allocated and initialized percpu areas.  All
- * this function has to do is to teach the determined layout to the
- * dynamic percpu allocator, which happens to be more complex than
- * creating whole new ones using helpers.
- */
-void __init setup_per_cpu_areas(void)
-{
-	struct pcpu_alloc_info *ai;
-	struct pcpu_group_info *gi;
-	unsigned int *cpu_map;
-	void *base;
-	unsigned long base_offset;
-	unsigned int cpu;
-	ssize_t static_size, reserved_size, dyn_size;
-	int node, prev_node, unit, nr_units;
-
-	ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids);
-	if (!ai)
-		panic("failed to allocate pcpu_alloc_info");
-	cpu_map = ai->groups[0].cpu_map;
-
-	/* determine base */
-	base = (void *)ULONG_MAX;
-	for_each_possible_cpu(cpu)
-		base = min(base,
-			   (void *)(__per_cpu_offset[cpu] + __per_cpu_start));
-	base_offset = (void *)__per_cpu_start - base;
-
-	/* build cpu_map, units are grouped by node */
-	unit = 0;
-	for_each_node(node)
-		for_each_possible_cpu(cpu)
-			if (node == node_cpuid[cpu].nid)
-				cpu_map[unit++] = cpu;
-	nr_units = unit;
-
-	/* set basic parameters */
-	static_size = __per_cpu_end - __per_cpu_start;
-	reserved_size = PERCPU_MODULE_RESERVE;
-	dyn_size = PERCPU_PAGE_SIZE - static_size - reserved_size;
-	if (dyn_size < 0)
-		panic("percpu area overflow static=%zd reserved=%zd\n",
-		      static_size, reserved_size);
-
-	ai->static_size		= static_size;
-	ai->reserved_size	= reserved_size;
-	ai->dyn_size		= dyn_size;
-	ai->unit_size		= PERCPU_PAGE_SIZE;
-	ai->atom_size		= PAGE_SIZE;
-	ai->alloc_size		= PERCPU_PAGE_SIZE;
-
-	/*
-	 * CPUs are put into groups according to node.  Walk cpu_map
-	 * and create new groups at node boundaries.
-	 */
-	prev_node = NUMA_NO_NODE;
-	ai->nr_groups = 0;
-	for (unit = 0; unit < nr_units; unit++) {
-		cpu = cpu_map[unit];
-		node = node_cpuid[cpu].nid;
-
-		if (node == prev_node) {
-			gi->nr_units++;
-			continue;
-		}
-		prev_node = node;
-
-		gi = &ai->groups[ai->nr_groups++];
-		gi->nr_units		= 1;
-		gi->base_offset		= __per_cpu_offset[cpu] + base_offset;
-		gi->cpu_map		= &cpu_map[unit];
-	}
-
-	pcpu_setup_first_chunk(ai, base);
-	pcpu_free_alloc_info(ai);
-}
-#endif
-
-/**
- * fill_pernode - initialize pernode data.
- * @node: the node id.
- * @pernode: physical address of pernode data
- * @pernodesize: size of the pernode data
- */
-static void __init fill_pernode(int node, unsigned long pernode,
-	unsigned long pernodesize)
-{
-	void *cpu_data;
-	int cpus = early_nr_cpus_node(node);
-
-	mem_data[node].pernode_addr = pernode;
-	mem_data[node].pernode_size = pernodesize;
-	memset(__va(pernode), 0, pernodesize);
-
-	cpu_data = (void *)pernode;
-	pernode += PERCPU_PAGE_SIZE * cpus;
-	pernode += node * L1_CACHE_BYTES;
-
-	pgdat_list[node] = __va(pernode);
-	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-	mem_data[node].node_data = __va(pernode);
-	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-	cpu_data = per_cpu_node_setup(cpu_data, node);
-
-	return;
-}
-
-/**
- * find_pernode_space - allocate memory for memory map and per-node structures
- * @start: physical start of range
- * @len: length of range
- * @node: node where this range resides
- *
- * This routine reserves space for the per-cpu data struct, the list of
- * pg_data_ts and the per-node data struct.  Each node will have something like
- * the following in the first chunk of addr. space large enough to hold it.
- *
- *    ________________________
- *   |                        |
- *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
- *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
- *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
- *   |------------------------|
- *   |   local pg_data_t *    |
- *   |------------------------|
- *   |  local ia64_node_data  |
- *   |------------------------|
- *   |          ???           |
- *   |________________________|
- *
- * Once this space has been set aside, the bootmem maps are initialized.  We
- * could probably move the allocation of the per-cpu and ia64_node_data space
- * outside of this function and use alloc_bootmem_node(), but doing it here
- * is straightforward and we get the alignments we want so...
- */
-static int __init find_pernode_space(unsigned long start, unsigned long len,
-				     int node)
-{
-	unsigned long spfn, epfn;
-	unsigned long pernodesize = 0, pernode;
-
-	spfn = start >> PAGE_SHIFT;
-	epfn = (start + len) >> PAGE_SHIFT;
-
-	/*
-	 * Make sure this memory falls within this node's usable memory
-	 * since we may have thrown some away in build_maps().
-	 */
-	if (spfn < mem_data[node].min_pfn || epfn > mem_data[node].max_pfn)
-		return 0;
-
-	/* Don't setup this node's local space twice... */
-	if (mem_data[node].pernode_addr)
-		return 0;
-
-	/*
-	 * Calculate total size needed, incl. what's necessary
-	 * for good alignment and alias prevention.
-	 */
-	pernodesize = compute_pernodesize(node);
-	pernode = NODEDATA_ALIGN(start, node);
-
-	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize))
-		fill_pernode(node, pernode, pernodesize);
-
-	return 0;
-}
-
-/**
- * reserve_pernode_space - reserve memory for per-node space
- *
- * Reserve the space used by the bootmem maps & per-node space in the boot
- * allocator so that when we actually create the real mem maps we don't
- * use their memory.
- */
-static void __init reserve_pernode_space(void)
-{
-	unsigned long base, size;
-	int node;
-
-	for_each_online_node(node) {
-		if (node_isset(node, memory_less_mask))
-			continue;
-
-		/* Now the per-node space */
-		size = mem_data[node].pernode_size;
-		base = __pa(mem_data[node].pernode_addr);
-		memblock_reserve(base, size);
-	}
-}
-
-static void scatter_node_data(void)
-{
-	pg_data_t **dst;
-	int node;
-
-	/*
-	 * for_each_online_node() can't be used at here.
-	 * node_online_map is not set for hot-added nodes at this time,
-	 * because we are halfway through initialization of the new node's
-	 * structures.  If for_each_online_node() is used, a new node's
-	 * pg_data_ptrs will be not initialized. Instead of using it,
-	 * pgdat_list[] is checked.
-	 */
-	for_each_node(node) {
-		if (pgdat_list[node]) {
-			dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
-			memcpy(dst, pgdat_list, sizeof(pgdat_list));
-		}
-	}
-}
-
-/**
- * initialize_pernode_data - fixup per-cpu & per-node pointers
- *
- * Each node's per-node area has a copy of the global pg_data_t list, so
- * we copy that to each node here, as well as setting the per-cpu pointer
- * to the local node data structure.
- */
-static void __init initialize_pernode_data(void)
-{
-	int cpu, node;
-
-	scatter_node_data();
-
-#ifdef CONFIG_SMP
-	/* Set the node_data pointer for each per-cpu struct */
-	for_each_possible_early_cpu(cpu) {
-		node = node_cpuid[cpu].nid;
-		per_cpu(ia64_cpu_info, cpu).node_data =
-			mem_data[node].node_data;
-	}
-#else
-	{
-		struct cpuinfo_ia64 *cpu0_cpu_info;
-		cpu = 0;
-		node = node_cpuid[cpu].nid;
-		cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
-			((char *)&ia64_cpu_info - __per_cpu_start));
-		cpu0_cpu_info->node_data = mem_data[node].node_data;
-	}
-#endif /* CONFIG_SMP */
-}
-
-/**
- * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
- * 	node but fall back to any other node when __alloc_bootmem_node fails
- *	for best.
- * @nid: node id
- * @pernodesize: size of this node's pernode data
- */
-static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
-{
-	void *ptr = NULL;
-	u8 best = 0xff;
-	int bestnode = NUMA_NO_NODE, node, anynode = 0;
-
-	for_each_online_node(node) {
-		if (node_isset(node, memory_less_mask))
-			continue;
-		else if (node_distance(nid, node) < best) {
-			best = node_distance(nid, node);
-			bestnode = node;
-		}
-		anynode = node;
-	}
-
-	if (bestnode == NUMA_NO_NODE)
-		bestnode = anynode;
-
-	ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
-				     __pa(MAX_DMA_ADDRESS),
-				     MEMBLOCK_ALLOC_ACCESSIBLE,
-				     bestnode);
-	if (!ptr)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%lx\n",
-		      __func__, pernodesize, PERCPU_PAGE_SIZE, bestnode,
-		      __pa(MAX_DMA_ADDRESS));
-
-	return ptr;
-}
-
-/**
- * memory_less_nodes - allocate and initialize CPU only nodes pernode
- *	information.
- */
-static void __init memory_less_nodes(void)
-{
-	unsigned long pernodesize;
-	void *pernode;
-	int node;
-
-	for_each_node_mask(node, memory_less_mask) {
-		pernodesize = compute_pernodesize(node);
-		pernode = memory_less_node_alloc(node, pernodesize);
-		fill_pernode(node, __pa(pernode), pernodesize);
-	}
-
-	return;
-}
-
-/**
- * find_memory - walk the EFI memory map and setup the bootmem allocator
- *
- * Called early in boot to setup the bootmem allocator, and to
- * allocate the per-cpu and per-node structures.
- */
-void __init find_memory(void)
-{
-	int node;
-
-	reserve_memory();
-	efi_memmap_walk(filter_memory, register_active_ranges);
-
-	if (num_online_nodes() == 0) {
-		printk(KERN_ERR "node info missing!\n");
-		node_set_online(0);
-	}
-
-	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
-	min_low_pfn = -1;
-	max_low_pfn = 0;
-
-	/* These actually end up getting called by call_pernode_memory() */
-	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
-	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
-	efi_memmap_walk(find_max_min_low_pfn, NULL);
-
-	for_each_online_node(node)
-		if (mem_data[node].min_pfn)
-			node_clear(node, memory_less_mask);
-
-	reserve_pernode_space();
-	memory_less_nodes();
-	initialize_pernode_data();
-
-	max_pfn = max_low_pfn;
-
-	find_initrd();
-}
-
-#ifdef CONFIG_SMP
-/**
- * per_cpu_init - setup per-cpu variables
- *
- * find_pernode_space() does most of this already, we just need to set
- * local_per_cpu_offset
- */
-void *per_cpu_init(void)
-{
-	int cpu;
-	static int first_time = 1;
-
-	if (first_time) {
-		first_time = 0;
-		for_each_possible_early_cpu(cpu)
-			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
-	}
-
-	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
-}
-#endif /* CONFIG_SMP */
-
-/**
- * call_pernode_memory - use SRAT to call callback functions with node info
- * @start: physical start of range
- * @len: length of range
- * @arg: function to call for each range
- *
- * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
- * out to which node a block of memory belongs.  Ignore memory that we cannot
- * identify, and split blocks that run across multiple nodes.
- *
- * Take this opportunity to round the start address up and the end address
- * down to page boundaries.
- */
-void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
-{
-	unsigned long rs, re, end = start + len;
-	void (*func)(unsigned long, unsigned long, int);
-	int i;
-
-	start = PAGE_ALIGN(start);
-	end &= PAGE_MASK;
-	if (start >= end)
-		return;
-
-	func = arg;
-
-	if (!num_node_memblks) {
-		/* No SRAT table, so assume one node (node 0) */
-		if (start < end)
-			(*func)(start, end - start, 0);
-		return;
-	}
-
-	for (i = 0; i < num_node_memblks; i++) {
-		rs = max(start, node_memblk[i].start_paddr);
-		re = min(end, node_memblk[i].start_paddr +
-			 node_memblk[i].size);
-
-		if (rs < re)
-			(*func)(rs, re - rs, node_memblk[i].nid);
-
-		if (re == end)
-			break;
-	}
-}
-
-/**
- * paging_init - setup page tables
- *
- * paging_init() sets up the page tables for each node of the system and frees
- * the bootmem allocator memory for general use.
- */
-void __init paging_init(void)
-{
-	unsigned long max_dma;
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-	sparse_init();
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA32] = max_dma;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-	free_area_init(max_zone_pfns);
-
-	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
-}
-
-pg_data_t * __init arch_alloc_nodedata(int nid)
-{
-	unsigned long size = compute_pernodesize(nid);
-
-	return memblock_alloc(size, SMP_CACHE_BYTES);
-}
-
-void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
-{
-	pgdat_list[update_node] = update_pgdat;
-	scatter_node_data();
-}
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
-		struct vmem_altmap *altmap)
-{
-	return vmemmap_populate_basepages(start, end, node, NULL);
-}
-
-void vmemmap_free(unsigned long start, unsigned long end,
-		struct vmem_altmap *altmap)
-{
-}
-#endif
diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c
deleted file mode 100644
index da477c11770b..000000000000
--- a/arch/ia64/mm/extable.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Kernel exception handling table support.  Derived from arch/alpha/mm/extable.c.
- *
- * Copyright (C) 1998, 1999, 2001-2002, 2004 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-
-#include <asm/ptrace.h>
-#include <asm/extable.h>
-#include <asm/errno.h>
-#include <asm/processor.h>
-
-void
-ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e)
-{
-	long fix = (u64) &e->fixup + e->fixup;
-
-	regs->r8 = -EFAULT;
-	if (fix & 4)
-		regs->r9 = 0;
-	regs->cr_iip = fix & ~0xf;
-	ia64_psr(regs)->ri = fix & 0x3;		/* set continuation slot number */
-}
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
deleted file mode 100644
index 5458b52b4009..000000000000
--- a/arch/ia64/mm/fault.c
+++ /dev/null
@@ -1,251 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MMU fault handling support.
- *
- * Copyright (C) 1998-2002 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/sched/signal.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/extable.h>
-#include <linux/interrupt.h>
-#include <linux/kprobes.h>
-#include <linux/kdebug.h>
-#include <linux/prefetch.h>
-#include <linux/uaccess.h>
-#include <linux/perf_event.h>
-
-#include <asm/processor.h>
-#include <asm/exception.h>
-
-extern int die(char *, struct pt_regs *, long);
-
-/*
- * Return TRUE if ADDRESS points at a page in the kernel's mapped segment
- * (inside region 5, on ia64) and that page is present.
- */
-static int
-mapped_kernel_page_is_present (unsigned long address)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
-
-	pgd = pgd_offset_k(address);
-	if (pgd_none(*pgd) || pgd_bad(*pgd))
-		return 0;
-
-	p4d = p4d_offset(pgd, address);
-	if (p4d_none(*p4d) || p4d_bad(*p4d))
-		return 0;
-
-	pud = pud_offset(p4d, address);
-	if (pud_none(*pud) || pud_bad(*pud))
-		return 0;
-
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
-		return 0;
-
-	ptep = pte_offset_kernel(pmd, address);
-	if (!ptep)
-		return 0;
-
-	pte = *ptep;
-	return pte_present(pte);
-}
-
-#	define VM_READ_BIT	0
-#	define VM_WRITE_BIT	1
-#	define VM_EXEC_BIT	2
-
-void __kprobes
-ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
-{
-	int signal = SIGSEGV, code = SEGV_MAPERR;
-	struct vm_area_struct *vma, *prev_vma;
-	struct mm_struct *mm = current->mm;
-	unsigned long mask;
-	vm_fault_t fault;
-	unsigned int flags = FAULT_FLAG_DEFAULT;
-
-	mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
-		| (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
-
-	/* mmap_lock is performance critical.... */
-	prefetchw(&mm->mmap_lock);
-
-	/*
-	 * If we're in an interrupt or have no user context, we must not take the fault..
-	 */
-	if (faulthandler_disabled() || !mm)
-		goto no_context;
-
-	/*
-	 * This is to handle the kprobes on user space access instructions
-	 */
-	if (kprobe_page_fault(regs, TRAP_BRKPT))
-		return;
-
-	if (user_mode(regs))
-		flags |= FAULT_FLAG_USER;
-	if (mask & VM_WRITE)
-		flags |= FAULT_FLAG_WRITE;
-
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-retry:
-	mmap_read_lock(mm);
-
-	vma = find_vma_prev(mm, address, &prev_vma);
-	if (!vma && !prev_vma )
-		goto bad_area;
-
-        /*
-         * find_vma_prev() returns vma such that address < vma->vm_end or NULL
-         *
-         * May find no vma, but could be that the last vm area is the
-         * register backing store that needs to expand upwards, in
-         * this case vma will be null, but prev_vma will ne non-null
-         */
-        if (( !vma && prev_vma ) || (address < vma->vm_start) ) {
-		vma = expand_stack(mm, address);
-		if (!vma)
-			goto bad_area_nosemaphore;
-	}
-
-	code = SEGV_ACCERR;
-
-	/* OK, we've got a good vm_area for this memory area.  Check the access permissions: */
-
-#	if (((1 << VM_READ_BIT) != VM_READ || (1 << VM_WRITE_BIT) != VM_WRITE) \
-	    || (1 << VM_EXEC_BIT) != VM_EXEC)
-#		error File is out of sync with <linux/mm.h>.  Please update.
-#	endif
-
-	if (((isr >> IA64_ISR_R_BIT) & 1UL) && (!(vma->vm_flags & (VM_READ | VM_WRITE))))
-		goto bad_area;
-
-	if ((vma->vm_flags & mask) != mask)
-		goto bad_area;
-
-	/*
-	 * If for any reason at all we couldn't handle the fault, make
-	 * sure we exit gracefully rather than endlessly redo the
-	 * fault.
-	 */
-	fault = handle_mm_fault(vma, address, flags, regs);
-
-	if (fault_signal_pending(fault, regs)) {
-		if (!user_mode(regs))
-			goto no_context;
-		return;
-	}
-
-	/* The fault is fully completed (including releasing mmap lock) */
-	if (fault & VM_FAULT_COMPLETED)
-		return;
-
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		/*
-		 * We ran out of memory, or some other thing happened
-		 * to us that made us unable to handle the page fault
-		 * gracefully.
-		 */
-		if (fault & VM_FAULT_OOM) {
-			goto out_of_memory;
-		} else if (fault & VM_FAULT_SIGSEGV) {
-			goto bad_area;
-		} else if (fault & VM_FAULT_SIGBUS) {
-			signal = SIGBUS;
-			goto bad_area;
-		}
-		BUG();
-	}
-
-	if (fault & VM_FAULT_RETRY) {
-		flags |= FAULT_FLAG_TRIED;
-
-		/* No need to mmap_read_unlock(mm) as we would
-		 * have already released it in __lock_page_or_retry
-		 * in mm/filemap.c.
-		 */
-
-		goto retry;
-	}
-
-	mmap_read_unlock(mm);
-	return;
-
-  bad_area:
-	mmap_read_unlock(mm);
-  bad_area_nosemaphore:
-	if ((isr & IA64_ISR_SP)
-	    || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
-	{
-		/*
-		 * This fault was due to a speculative load or lfetch.fault, set the "ed"
-		 * bit in the psr to ensure forward progress.  (Target register will get a
-		 * NaT for ld.s, lfetch will be canceled.)
-		 */
-		ia64_psr(regs)->ed = 1;
-		return;
-	}
-	if (user_mode(regs)) {
-		force_sig_fault(signal, code, (void __user *) address,
-				0, __ISR_VALID, isr);
-		return;
-	}
-
-  no_context:
-	if ((isr & IA64_ISR_SP)
-	    || ((isr & IA64_ISR_NA) && (isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH))
-	{
-		/*
-		 * This fault was due to a speculative load or lfetch.fault, set the "ed"
-		 * bit in the psr to ensure forward progress.  (Target register will get a
-		 * NaT for ld.s, lfetch will be canceled.)
-		 */
-		ia64_psr(regs)->ed = 1;
-		return;
-	}
-
-	/*
-	 * Since we have no vma's for region 5, we might get here even if the address is
-	 * valid, due to the VHPT walker inserting a non present translation that becomes
-	 * stale. If that happens, the non present fault handler already purged the stale
-	 * translation, which fixed the problem. So, we check to see if the translation is
-	 * valid, and return if it is.
-	 */
-	if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
-		return;
-
-	if (ia64_done_with_exception(regs))
-		return;
-
-	/*
-	 * Oops. The kernel tried to access some bad page. We'll have to terminate things
-	 * with extreme prejudice.
-	 */
-	bust_spinlocks(1);
-
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
-	else
-		printk(KERN_ALERT "Unable to handle kernel paging request at "
-		       "virtual address %016lx\n", address);
-	if (die("Oops", regs, isr))
-		regs = NULL;
-	bust_spinlocks(0);
-	if (regs)
-		make_task_dead(SIGKILL);
-	return;
-
-  out_of_memory:
-	mmap_read_unlock(mm);
-	if (!user_mode(regs))
-		goto no_context;
-	pagefault_out_of_memory();
-}
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
deleted file mode 100644
index adc49f2d22e8..000000000000
--- a/arch/ia64/mm/hugetlbpage.c
+++ /dev/null
@@ -1,186 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * IA-64 Huge TLB Page Support for Kernel.
- *
- * Copyright (C) 2002-2004 Rohit Seth <rohit.seth@intel.com>
- * Copyright (C) 2003-2004 Ken Chen <kenneth.w.chen@intel.com>
- *
- * Sep, 2003: add numa support
- * Feb, 2004: dynamic hugetlb page size via boot parameter
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/module.h>
-#include <linux/sysctl.h>
-#include <linux/log2.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT;
-EXPORT_SYMBOL(hpage_shift);
-
-pte_t *
-huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
-	       unsigned long addr, unsigned long sz)
-{
-	unsigned long taddr = htlbpage_to_page(addr);
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
-
-	pgd = pgd_offset(mm, taddr);
-	p4d = p4d_offset(pgd, taddr);
-	pud = pud_alloc(mm, p4d, taddr);
-	if (pud) {
-		pmd = pmd_alloc(mm, pud, taddr);
-		if (pmd)
-			pte = pte_alloc_huge(mm, pmd, taddr);
-	}
-	return pte;
-}
-
-pte_t *
-huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz)
-{
-	unsigned long taddr = htlbpage_to_page(addr);
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte = NULL;
-
-	pgd = pgd_offset(mm, taddr);
-	if (pgd_present(*pgd)) {
-		p4d = p4d_offset(pgd, taddr);
-		if (p4d_present(*p4d)) {
-			pud = pud_offset(p4d, taddr);
-			if (pud_present(*pud)) {
-				pmd = pmd_offset(pud, taddr);
-				if (pmd_present(*pmd))
-					pte = pte_offset_huge(pmd, taddr);
-			}
-		}
-	}
-
-	return pte;
-}
-
-#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
-
-/*
- * Don't actually need to do any preparation, but need to make sure
- * the address is in the right region.
- */
-int prepare_hugepage_range(struct file *file,
-			unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	if (REGION_NUMBER(addr) != RGN_HPAGE)
-		return -EINVAL;
-
-	return 0;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
-
-void hugetlb_free_pgd_range(struct mmu_gather *tlb,
-			unsigned long addr, unsigned long end,
-			unsigned long floor, unsigned long ceiling)
-{
-	/*
-	 * This is called to free hugetlb page tables.
-	 *
-	 * The offset of these addresses from the base of the hugetlb
-	 * region must be scaled down by HPAGE_SIZE/PAGE_SIZE so that
-	 * the standard free_pgd_range will free the right page tables.
-	 *
-	 * If floor and ceiling are also in the hugetlb region, they
-	 * must likewise be scaled down; but if outside, left unchanged.
-	 */
-
-	addr = htlbpage_to_page(addr);
-	end  = htlbpage_to_page(end);
-	if (REGION_NUMBER(floor) == RGN_HPAGE)
-		floor = htlbpage_to_page(floor);
-	if (REGION_NUMBER(ceiling) == RGN_HPAGE)
-		ceiling = htlbpage_to_page(ceiling);
-
-	free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-
-unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	struct vm_unmapped_area_info info;
-
-	if (len > RGN_MAP_LIMIT)
-		return -ENOMEM;
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-
-	/* Handle MAP_FIXED */
-	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-		return addr;
-	}
-
-	/* This code assumes that RGN_HPAGE != 0. */
-	if ((REGION_NUMBER(addr) != RGN_HPAGE) || (addr & (HPAGE_SIZE - 1)))
-		addr = HPAGE_REGION_BASE;
-
-	info.flags = 0;
-	info.length = len;
-	info.low_limit = addr;
-	info.high_limit = HPAGE_REGION_BASE + RGN_MAP_LIMIT;
-	info.align_mask = PAGE_MASK & (HPAGE_SIZE - 1);
-	info.align_offset = 0;
-	return vm_unmapped_area(&info);
-}
-
-static int __init hugetlb_setup_sz(char *str)
-{
-	u64 tr_pages;
-	unsigned long long size;
-
-	if (ia64_pal_vm_page_size(&tr_pages, NULL) != 0)
-		/*
-		 * shouldn't happen, but just in case.
-		 */
-		tr_pages = 0x15557000UL;
-
-	size = memparse(str, &str);
-	if (*str || !is_power_of_2(size) || !(tr_pages & size) ||
-		size <= PAGE_SIZE ||
-		size > (1UL << PAGE_SHIFT << MAX_ORDER)) {
-		printk(KERN_WARNING "Invalid huge page size specified\n");
-		return 1;
-	}
-
-	hpage_shift = __ffs(size);
-	/*
-	 * boot cpu already executed ia64_mmu_init, and has HPAGE_SHIFT_DEFAULT
-	 * override here with new page shift.
-	 */
-	ia64_set_rr(HPAGE_REGION_BASE, hpage_shift << 2);
-	return 0;
-}
-early_param("hugepagesz", hugetlb_setup_sz);
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
deleted file mode 100644
index 05b0f2f0c073..000000000000
--- a/arch/ia64/mm/init.c
+++ /dev/null
@@ -1,532 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Initialize MMU support.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-
-#include <linux/dma-map-ops.h>
-#include <linux/dmar.h>
-#include <linux/efi.h>
-#include <linux/elf.h>
-#include <linux/memblock.h>
-#include <linux/mm.h>
-#include <linux/sched/signal.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/personality.h>
-#include <linux/reboot.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/proc_fs.h>
-#include <linux/bitops.h>
-#include <linux/kexec.h>
-#include <linux/swiotlb.h>
-
-#include <asm/dma.h>
-#include <asm/efi.h>
-#include <asm/io.h>
-#include <asm/numa.h>
-#include <asm/patch.h>
-#include <asm/pgalloc.h>
-#include <asm/sal.h>
-#include <asm/sections.h>
-#include <asm/tlb.h>
-#include <linux/uaccess.h>
-#include <asm/unistd.h>
-#include <asm/mca.h>
-
-extern void ia64_tlb_init (void);
-
-unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
-
-struct page *zero_page_memmap_ptr;	/* map entry for zero page */
-EXPORT_SYMBOL(zero_page_memmap_ptr);
-
-void
-__ia64_sync_icache_dcache (pte_t pte)
-{
-	unsigned long addr;
-	struct folio *folio;
-
-	folio = page_folio(pte_page(pte));
-	addr = (unsigned long)folio_address(folio);
-
-	if (test_bit(PG_arch_1, &folio->flags))
-		return;				/* i-cache is already coherent with d-cache */
-
-	flush_icache_range(addr, addr + folio_size(folio));
-	set_bit(PG_arch_1, &folio->flags);	/* mark page as clean */
-}
-
-/*
- * Since DMA is i-cache coherent, any (complete) folios that were written via
- * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
- * flush them when they get mapped into an executable vm-area.
- */
-void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
-{
-	unsigned long pfn = PHYS_PFN(paddr);
-	struct folio *folio = page_folio(pfn_to_page(pfn));
-	ssize_t left = size;
-	size_t offset = offset_in_folio(folio, paddr);
-
-	if (offset) {
-		left -= folio_size(folio) - offset;
-		if (left <= 0)
-			return;
-		folio = folio_next(folio);
-	}
-
-	while (left >= (ssize_t)folio_size(folio)) {
-		left -= folio_size(folio);
-		set_bit(PG_arch_1, &pfn_to_page(pfn)->flags);
-		if (!left)
-			break;
-		folio = folio_next(folio);
-	}
-}
-
-inline void
-ia64_set_rbs_bot (void)
-{
-	unsigned long stack_size = rlimit_max(RLIMIT_STACK) & -16;
-
-	if (stack_size > MAX_USER_STACK_SIZE)
-		stack_size = MAX_USER_STACK_SIZE;
-	current->thread.rbs_bot = PAGE_ALIGN(current->mm->start_stack - stack_size);
-}
-
-/*
- * This performs some platform-dependent address space initialization.
- * On IA-64, we want to setup the VM area for the register backing
- * store (which grows upwards) and install the gateway page which is
- * used for signal trampolines, etc.
- */
-void
-ia64_init_addr_space (void)
-{
-	struct vm_area_struct *vma;
-
-	ia64_set_rbs_bot();
-
-	/*
-	 * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
-	 * the problem.  When the process attempts to write to the register backing store
-	 * for the first time, it will get a SEGFAULT in this case.
-	 */
-	vma = vm_area_alloc(current->mm);
-	if (vma) {
-		vma_set_anonymous(vma);
-		vma->vm_start = current->thread.rbs_bot & PAGE_MASK;
-		vma->vm_end = vma->vm_start + PAGE_SIZE;
-		vm_flags_init(vma, VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT);
-		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-		mmap_write_lock(current->mm);
-		if (insert_vm_struct(current->mm, vma)) {
-			mmap_write_unlock(current->mm);
-			vm_area_free(vma);
-			return;
-		}
-		mmap_write_unlock(current->mm);
-	}
-
-	/* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
-	if (!(current->personality & MMAP_PAGE_ZERO)) {
-		vma = vm_area_alloc(current->mm);
-		if (vma) {
-			vma_set_anonymous(vma);
-			vma->vm_end = PAGE_SIZE;
-			vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT);
-			vm_flags_init(vma, VM_READ | VM_MAYREAD | VM_IO |
-				      VM_DONTEXPAND | VM_DONTDUMP);
-			mmap_write_lock(current->mm);
-			if (insert_vm_struct(current->mm, vma)) {
-				mmap_write_unlock(current->mm);
-				vm_area_free(vma);
-				return;
-			}
-			mmap_write_unlock(current->mm);
-		}
-	}
-}
-
-void
-free_initmem (void)
-{
-	free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end),
-			   -1, "unused kernel");
-}
-
-void __init
-free_initrd_mem (unsigned long start, unsigned long end)
-{
-	/*
-	 * EFI uses 4KB pages while the kernel can use 4KB or bigger.
-	 * Thus EFI and the kernel may have different page sizes. It is
-	 * therefore possible to have the initrd share the same page as
-	 * the end of the kernel (given current setup).
-	 *
-	 * To avoid freeing/using the wrong page (kernel sized) we:
-	 *	- align up the beginning of initrd
-	 *	- align down the end of initrd
-	 *
-	 *  |             |
-	 *  |=============| a000
-	 *  |             |
-	 *  |             |
-	 *  |             | 9000
-	 *  |/////////////|
-	 *  |/////////////|
-	 *  |=============| 8000
-	 *  |///INITRD////|
-	 *  |/////////////|
-	 *  |/////////////| 7000
-	 *  |             |
-	 *  |KKKKKKKKKKKKK|
-	 *  |=============| 6000
-	 *  |KKKKKKKKKKKKK|
-	 *  |KKKKKKKKKKKKK|
-	 *  K=kernel using 8KB pages
-	 *
-	 * In this example, we must free page 8000 ONLY. So we must align up
-	 * initrd_start and keep initrd_end as is.
-	 */
-	start = PAGE_ALIGN(start);
-	end = end & PAGE_MASK;
-
-	if (start < end)
-		printk(KERN_INFO "Freeing initrd memory: %ldkB freed\n", (end - start) >> 10);
-
-	for (; start < end; start += PAGE_SIZE) {
-		if (!virt_addr_valid(start))
-			continue;
-		free_reserved_page(virt_to_page(start));
-	}
-}
-
-/*
- * This installs a clean page in the kernel's page table.
- */
-static struct page * __init
-put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = pgd_offset_k(address);		/* note: this is NOT pgd_offset()! */
-
-	{
-		p4d = p4d_alloc(&init_mm, pgd, address);
-		if (!p4d)
-			goto out;
-		pud = pud_alloc(&init_mm, p4d, address);
-		if (!pud)
-			goto out;
-		pmd = pmd_alloc(&init_mm, pud, address);
-		if (!pmd)
-			goto out;
-		pte = pte_alloc_kernel(pmd, address);
-		if (!pte)
-			goto out;
-		if (!pte_none(*pte))
-			goto out;
-		set_pte(pte, mk_pte(page, pgprot));
-	}
-  out:
-	/* no need for flush_tlb */
-	return page;
-}
-
-static void __init
-setup_gate (void)
-{
-	struct page *page;
-
-	/*
-	 * Map the gate page twice: once read-only to export the ELF
-	 * headers etc. and once execute-only page to enable
-	 * privilege-promotion via "epc":
-	 */
-	page = virt_to_page(ia64_imva(__start_gate_section));
-	put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
-#ifdef HAVE_BUGGY_SEGREL
-	page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
-	put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
-#else
-	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
-	/* Fill in the holes (if any) with read-only zero pages: */
-	{
-		unsigned long addr;
-
-		for (addr = GATE_ADDR + PAGE_SIZE;
-		     addr < GATE_ADDR + PERCPU_PAGE_SIZE;
-		     addr += PAGE_SIZE)
-		{
-			put_kernel_page(ZERO_PAGE(0), addr,
-					PAGE_READONLY);
-			put_kernel_page(ZERO_PAGE(0), addr + PERCPU_PAGE_SIZE,
-					PAGE_READONLY);
-		}
-	}
-#endif
-	ia64_patch_gate();
-}
-
-static struct vm_area_struct gate_vma;
-
-static int __init gate_vma_init(void)
-{
-	vma_init(&gate_vma, NULL);
-	gate_vma.vm_start = FIXADDR_USER_START;
-	gate_vma.vm_end = FIXADDR_USER_END;
-	vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC);
-	gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX);
-
-	return 0;
-}
-__initcall(gate_vma_init);
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-	return &gate_vma;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
-	if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
-		return 1;
-	return 0;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
-	return in_gate_area_no_mm(addr);
-}
-
-void ia64_mmu_init(void *my_cpu_data)
-{
-	unsigned long pta, impl_va_bits;
-	extern void tlb_init(void);
-
-#ifdef CONFIG_DISABLE_VHPT
-#	define VHPT_ENABLE_BIT	0
-#else
-#	define VHPT_ENABLE_BIT	1
-#endif
-
-	/*
-	 * Check if the virtually mapped linear page table (VMLPT) overlaps with a mapped
-	 * address space.  The IA-64 architecture guarantees that at least 50 bits of
-	 * virtual address space are implemented but if we pick a large enough page size
-	 * (e.g., 64KB), the mapped address space is big enough that it will overlap with
-	 * VMLPT.  I assume that once we run on machines big enough to warrant 64KB pages,
-	 * IMPL_VA_MSB will be significantly bigger, so this is unlikely to become a
-	 * problem in practice.  Alternatively, we could truncate the top of the mapped
-	 * address space to not permit mappings that would overlap with the VMLPT.
-	 * --davidm 00/12/06
-	 */
-#	define pte_bits			3
-#	define mapped_space_bits	(3*(PAGE_SHIFT - pte_bits) + PAGE_SHIFT)
-	/*
-	 * The virtual page table has to cover the entire implemented address space within
-	 * a region even though not all of this space may be mappable.  The reason for
-	 * this is that the Access bit and Dirty bit fault handlers perform
-	 * non-speculative accesses to the virtual page table, so the address range of the
-	 * virtual page table itself needs to be covered by virtual page table.
-	 */
-#	define vmlpt_bits		(impl_va_bits - PAGE_SHIFT + pte_bits)
-#	define POW2(n)			(1ULL << (n))
-
-	impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61)));
-
-	if (impl_va_bits < 51 || impl_va_bits > 61)
-		panic("CPU has bogus IMPL_VA_MSB value of %lu!\n", impl_va_bits - 1);
-	/*
-	 * mapped_space_bits - PAGE_SHIFT is the total number of ptes we need,
-	 * which must fit into "vmlpt_bits - pte_bits" slots. Second half of
-	 * the test makes sure that our mapped space doesn't overlap the
-	 * unimplemented hole in the middle of the region.
-	 */
-	if ((mapped_space_bits - PAGE_SHIFT > vmlpt_bits - pte_bits) ||
-	    (mapped_space_bits > impl_va_bits - 1))
-		panic("Cannot build a big enough virtual-linear page table"
-		      " to cover mapped address space.\n"
-		      " Try using a smaller page size.\n");
-
-
-	/* place the VMLPT at the end of each page-table mapped region: */
-	pta = POW2(61) - POW2(vmlpt_bits);
-
-	/*
-	 * Set the (virtually mapped linear) page table address.  Bit
-	 * 8 selects between the short and long format, bits 2-7 the
-	 * size of the table, and bit 0 whether the VHPT walker is
-	 * enabled.
-	 */
-	ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | VHPT_ENABLE_BIT);
-
-	ia64_tlb_init();
-
-#ifdef	CONFIG_HUGETLB_PAGE
-	ia64_set_rr(HPAGE_REGION_BASE, HPAGE_SHIFT << 2);
-	ia64_srlz_d();
-#endif
-}
-
-int __init register_active_ranges(u64 start, u64 len, int nid)
-{
-	u64 end = start + len;
-
-#ifdef CONFIG_KEXEC
-	if (start > crashk_res.start && start < crashk_res.end)
-		start = crashk_res.end;
-	if (end > crashk_res.start && end < crashk_res.end)
-		end = crashk_res.start;
-#endif
-
-	if (start < end)
-		memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE);
-	return 0;
-}
-
-int
-find_max_min_low_pfn (u64 start, u64 end, void *arg)
-{
-	unsigned long pfn_start, pfn_end;
-#ifdef CONFIG_FLATMEM
-	pfn_start = (PAGE_ALIGN(__pa(start))) >> PAGE_SHIFT;
-	pfn_end = (PAGE_ALIGN(__pa(end - 1))) >> PAGE_SHIFT;
-#else
-	pfn_start = GRANULEROUNDDOWN(__pa(start)) >> PAGE_SHIFT;
-	pfn_end = GRANULEROUNDUP(__pa(end - 1)) >> PAGE_SHIFT;
-#endif
-	min_low_pfn = min(min_low_pfn, pfn_start);
-	max_low_pfn = max(max_low_pfn, pfn_end);
-	return 0;
-}
-
-/*
- * Boot command-line option "nolwsys" can be used to disable the use of any light-weight
- * system call handler.  When this option is in effect, all fsyscalls will end up bubbling
- * down into the kernel and calling the normal (heavy-weight) syscall handler.  This is
- * useful for performance testing, but conceivably could also come in handy for debugging
- * purposes.
- */
-
-static int nolwsys __initdata;
-
-static int __init
-nolwsys_setup (char *s)
-{
-	nolwsys = 1;
-	return 1;
-}
-
-__setup("nolwsys", nolwsys_setup);
-
-void __init
-mem_init (void)
-{
-	int i;
-
-	BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
-	BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
-	BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
-
-	/*
-	 * This needs to be called _after_ the command line has been parsed but
-	 * _before_ any drivers that may need the PCI DMA interface are
-	 * initialized or bootmem has been freed.
-	 */
-	do {
-#ifdef CONFIG_INTEL_IOMMU
-		detect_intel_iommu();
-		if (iommu_detected)
-			break;
-#endif
-		swiotlb_init(true, SWIOTLB_VERBOSE);
-	} while (0);
-
-#ifdef CONFIG_FLATMEM
-	BUG_ON(!mem_map);
-#endif
-
-	set_max_mapnr(max_low_pfn);
-	high_memory = __va(max_low_pfn * PAGE_SIZE);
-	memblock_free_all();
-
-	/*
-	 * For fsyscall entrypoints with no light-weight handler, use the ordinary
-	 * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry
-	 * code can tell them apart.
-	 */
-	for (i = 0; i < NR_syscalls; ++i) {
-		extern unsigned long fsyscall_table[NR_syscalls];
-		extern unsigned long sys_call_table[NR_syscalls];
-
-		if (!fsyscall_table[i] || nolwsys)
-			fsyscall_table[i] = sys_call_table[i] | 1;
-	}
-	setup_gate();
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
-		    struct mhp_params *params)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-	int ret;
-
-	if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
-		return -EINVAL;
-
-	ret = __add_pages(nid, start_pfn, nr_pages, params);
-	if (ret)
-		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
-		       __func__,  ret);
-
-	return ret;
-}
-
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-
-	__remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
-static const pgprot_t protection_map[16] = {
-	[VM_NONE]					= PAGE_NONE,
-	[VM_READ]					= PAGE_READONLY,
-	[VM_WRITE]					= PAGE_READONLY,
-	[VM_WRITE | VM_READ]				= PAGE_READONLY,
-	[VM_EXEC]					= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_X_RX),
-	[VM_EXEC | VM_READ]				= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RX),
-	[VM_EXEC | VM_WRITE]				= PAGE_COPY_EXEC,
-	[VM_EXEC | VM_WRITE | VM_READ]			= PAGE_COPY_EXEC,
-	[VM_SHARED]					= PAGE_NONE,
-	[VM_SHARED | VM_READ]				= PAGE_READONLY,
-	[VM_SHARED | VM_WRITE]				= PAGE_SHARED,
-	[VM_SHARED | VM_WRITE | VM_READ]		= PAGE_SHARED,
-	[VM_SHARED | VM_EXEC]				= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_X_RX),
-	[VM_SHARED | VM_EXEC | VM_READ]			= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RX),
-	[VM_SHARED | VM_EXEC | VM_WRITE]		= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RWX),
-	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= __pgprot(__ACCESS_BITS | _PAGE_PL_3 |
-								   _PAGE_AR_RWX)
-};
-DECLARE_VM_GET_PAGE_PROT
diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c
deleted file mode 100644
index 711b6abc822e..000000000000
--- a/arch/ia64/mm/ioremap.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (c) Copyright 2006, 2007 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/efi.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <asm/io.h>
-#include <asm/meminit.h>
-
-static inline void __iomem *
-__ioremap_uc(unsigned long phys_addr)
-{
-	return (void __iomem *) (__IA64_UNCACHED_OFFSET | phys_addr);
-}
-
-void __iomem *
-early_ioremap (unsigned long phys_addr, unsigned long size)
-{
-	u64 attr;
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(phys_addr);
-	return __ioremap_uc(phys_addr);
-}
-
-void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
-			   unsigned long flags)
-{
-	u64 attr;
-	unsigned long gran_base, gran_size;
-	unsigned long page_base;
-
-	/*
-	 * For things in kern_memmap, we must use the same attribute
-	 * as the rest of the kernel.  For more details, see
-	 * Documentation/arch/ia64/aliasing.rst.
-	 */
-	attr = kern_mem_attribute(phys_addr, size);
-	if (attr & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(phys_addr);
-	else if (attr & EFI_MEMORY_UC)
-		return __ioremap_uc(phys_addr);
-
-	/*
-	 * Some chipsets don't support UC access to memory.  If
-	 * WB is supported for the whole granule, we prefer that.
-	 */
-	gran_base = GRANULEROUNDDOWN(phys_addr);
-	gran_size = GRANULEROUNDUP(phys_addr + size) - gran_base;
-	if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB)
-		return (void __iomem *) phys_to_virt(phys_addr);
-
-	/*
-	 * WB is not supported for the whole granule, so we can't use
-	 * the region 7 identity mapping.  If we can safely cover the
-	 * area with kernel page table mappings, we can use those
-	 * instead.
-	 */
-	page_base = phys_addr & PAGE_MASK;
-	size = PAGE_ALIGN(phys_addr + size) - page_base;
-	if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB)
-		return generic_ioremap_prot(phys_addr, size, __pgprot(flags));
-
-	return __ioremap_uc(phys_addr);
-}
-EXPORT_SYMBOL(ioremap_prot);
-
-void __iomem *
-ioremap_uc(unsigned long phys_addr, unsigned long size)
-{
-	if (kern_mem_attribute(phys_addr, size) & EFI_MEMORY_WB)
-		return NULL;
-
-	return __ioremap_uc(phys_addr);
-}
-EXPORT_SYMBOL(ioremap_uc);
-
-void
-early_iounmap (volatile void __iomem *addr, unsigned long size)
-{
-}
-
-void iounmap(volatile void __iomem *addr)
-{
-	if (REGION_NUMBER(addr) == RGN_GATE)
-		vunmap((void *) ((unsigned long) addr & PAGE_MASK));
-}
-EXPORT_SYMBOL(iounmap);
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
deleted file mode 100644
index 4c7b1f50e3b7..000000000000
--- a/arch/ia64/mm/numa.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * This file contains NUMA specific variables and functions which are used on
- * NUMA machines with contiguous memory.
- * 
- *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
- */
-
-#include <linux/cpu.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/node.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/module.h>
-#include <asm/mmzone.h>
-#include <asm/numa.h>
-
-
-/*
- * The following structures are usually initialized by ACPI or
- * similar mechanisms and describe the NUMA characteristics of the machine.
- */
-int num_node_memblks;
-struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
-struct node_cpuid_s node_cpuid[NR_CPUS] =
-	{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } };
-
-/*
- * This is a matrix with "distances" between nodes, they should be
- * proportional to the memory access latency ratios.
- */
-u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];
-
-int __node_distance(int from, int to)
-{
-	return slit_distance(from, to);
-}
-EXPORT_SYMBOL(__node_distance);
-
-/* Identify which cnode a physical address resides on */
-int
-paddr_to_nid(unsigned long paddr)
-{
-	int	i;
-
-	for (i = 0; i < num_node_memblks; i++)
-		if (paddr >= node_memblk[i].start_paddr &&
-		    paddr < node_memblk[i].start_paddr + node_memblk[i].size)
-			break;
-
-	return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
-}
-EXPORT_SYMBOL(paddr_to_nid);
-
-#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA)
-void numa_clear_node(int cpu)
-{
-	unmap_cpu_from_node(cpu, NUMA_NO_NODE);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-/*
- *  SRAT information is stored in node_memblk[], then we can use SRAT
- *  information at memory-hot-add if necessary.
- */
-
-int memory_add_physaddr_to_nid(u64 addr)
-{
-	int nid = paddr_to_nid(addr);
-	if (nid < 0)
-		return 0;
-	return nid;
-}
-EXPORT_SYMBOL(memory_add_physaddr_to_nid);
-#endif
-#endif
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
deleted file mode 100644
index ca060e7a2a46..000000000000
--- a/arch/ia64/mm/tlb.c
+++ /dev/null
@@ -1,591 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * TLB support routines.
- *
- * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * 08/02/00 A. Mallick <asit.k.mallick@intel.com>
- *		Modified RID allocation for SMP
- *          Goutham Rao <goutham.rao@intel.com>
- *              IPI based ptc implementation and A-step IPI implementation.
- * Rohit Seth <rohit.seth@intel.com>
- * Ken Chen <kenneth.w.chen@intel.com>
- * Christophe de Dinechin <ddd@hp.com>: Avoid ptc.e on memory allocation
- * Copyright (C) 2007 Intel Corp
- *	Fenghua Yu <fenghua.yu@intel.com>
- *	Add multiple ptc.g/ptc.ga instruction support in global tlb purge.
- */
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-#include <linux/memblock.h>
-#include <linux/slab.h>
-
-#include <asm/delay.h>
-#include <asm/mmu_context.h>
-#include <asm/pal.h>
-#include <asm/tlbflush.h>
-#include <asm/dma.h>
-#include <asm/processor.h>
-#include <asm/sal.h>
-#include <asm/tlb.h>
-
-static struct {
-	u64 mask;		/* mask of supported purge page-sizes */
-	unsigned long max_bits;	/* log2 of largest supported purge page-size */
-} purge;
-
-struct ia64_ctx ia64_ctx = {
-	.lock =	__SPIN_LOCK_UNLOCKED(ia64_ctx.lock),
-	.next =	1,
-	.max_ctx = ~0U
-};
-
-DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
-DEFINE_PER_CPU(u8, ia64_tr_num);  /*Number of TR slots in current processor*/
-DEFINE_PER_CPU(u8, ia64_tr_used); /*Max Slot number used by kernel*/
-
-struct ia64_tr_entry *ia64_idtrs[NR_CPUS];
-
-/*
- * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
- * Called after cpu_init() has setup ia64_ctx.max_ctx based on
- * maximum RID that is supported by boot CPU.
- */
-void __init
-mmu_context_init (void)
-{
-	ia64_ctx.bitmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3,
-					 SMP_CACHE_BYTES);
-	if (!ia64_ctx.bitmap)
-		panic("%s: Failed to allocate %u bytes\n", __func__,
-		      (ia64_ctx.max_ctx + 1) >> 3);
-	ia64_ctx.flushmap = memblock_alloc((ia64_ctx.max_ctx + 1) >> 3,
-					   SMP_CACHE_BYTES);
-	if (!ia64_ctx.flushmap)
-		panic("%s: Failed to allocate %u bytes\n", __func__,
-		      (ia64_ctx.max_ctx + 1) >> 3);
-}
-
-/*
- * Acquire the ia64_ctx.lock before calling this function!
- */
-void
-wrap_mmu_context (struct mm_struct *mm)
-{
-	int i, cpu;
-	unsigned long flush_bit;
-
-	for (i=0; i <= ia64_ctx.max_ctx / BITS_PER_LONG; i++) {
-		flush_bit = xchg(&ia64_ctx.flushmap[i], 0);
-		ia64_ctx.bitmap[i] ^= flush_bit;
-	}
- 
-	/* use offset at 300 to skip daemons */
-	ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
-				ia64_ctx.max_ctx, 300);
-	ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap,
-				ia64_ctx.max_ctx, ia64_ctx.next);
-
-	/*
-	 * can't call flush_tlb_all() here because of race condition
-	 * with O(1) scheduler [EF]
-	 */
-	cpu = get_cpu(); /* prevent preemption/migration */
-	for_each_online_cpu(i)
-		if (i != cpu)
-			per_cpu(ia64_need_tlb_flush, i) = 1;
-	put_cpu();
-	local_flush_tlb_all();
-}
-
-/*
- * Implement "spinaphores" ... like counting semaphores, but they
- * spin instead of sleeping.  If there are ever any other users for
- * this primitive it can be moved up to a spinaphore.h header.
- */
-struct spinaphore {
-	unsigned long	ticket;
-	unsigned long	serve;
-};
-
-static inline void spinaphore_init(struct spinaphore *ss, int val)
-{
-	ss->ticket = 0;
-	ss->serve = val;
-}
-
-static inline void down_spin(struct spinaphore *ss)
-{
-	unsigned long t = ia64_fetchadd(1, &ss->ticket, acq), serve;
-
-	if (time_before(t, ss->serve))
-		return;
-
-	ia64_invala();
-
-	for (;;) {
-		asm volatile ("ld8.c.nc %0=[%1]" : "=r"(serve) : "r"(&ss->serve) : "memory");
-		if (time_before(t, serve))
-			return;
-		cpu_relax();
-	}
-}
-
-static inline void up_spin(struct spinaphore *ss)
-{
-	ia64_fetchadd(1, &ss->serve, rel);
-}
-
-static struct spinaphore ptcg_sem;
-static u16 nptcg = 1;
-static int need_ptcg_sem = 1;
-static int toolatetochangeptcgsem = 0;
-
-/*
- * Kernel parameter "nptcg=" overrides max number of concurrent global TLB
- * purges which is reported from either PAL or SAL PALO.
- *
- * We don't have sanity checking for nptcg value. It's the user's responsibility
- * for valid nptcg value on the platform. Otherwise, kernel may hang in some
- * cases.
- */
-static int __init
-set_nptcg(char *str)
-{
-	int value = 0;
-
-	get_option(&str, &value);
-	setup_ptcg_sem(value, NPTCG_FROM_KERNEL_PARAMETER);
-
-	return 1;
-}
-
-__setup("nptcg=", set_nptcg);
-
-/*
- * Maximum number of simultaneous ptc.g purges in the system can
- * be defined by PAL_VM_SUMMARY (in which case we should take
- * the smallest value for any cpu in the system) or by the PAL
- * override table (in which case we should ignore the value from
- * PAL_VM_SUMMARY).
- *
- * Kernel parameter "nptcg=" overrides maximum number of simultaneous ptc.g
- * purges defined in either PAL_VM_SUMMARY or PAL override table. In this case,
- * we should ignore the value from either PAL_VM_SUMMARY or PAL override table.
- *
- * Complicating the logic here is the fact that num_possible_cpus()
- * isn't fully setup until we start bringing cpus online.
- */
-void
-setup_ptcg_sem(int max_purges, int nptcg_from)
-{
-	static int kp_override;
-	static int palo_override;
-	static int firstcpu = 1;
-
-	if (toolatetochangeptcgsem) {
-		if (nptcg_from == NPTCG_FROM_PAL && max_purges == 0)
-			BUG_ON(1 < nptcg);
-		else
-			BUG_ON(max_purges < nptcg);
-		return;
-	}
-
-	if (nptcg_from == NPTCG_FROM_KERNEL_PARAMETER) {
-		kp_override = 1;
-		nptcg = max_purges;
-		goto resetsema;
-	}
-	if (kp_override) {
-		need_ptcg_sem = num_possible_cpus() > nptcg;
-		return;
-	}
-
-	if (nptcg_from == NPTCG_FROM_PALO) {
-		palo_override = 1;
-
-		/* In PALO max_purges == 0 really means it! */
-		if (max_purges == 0)
-			panic("Whoa! Platform does not support global TLB purges.\n");
-		nptcg = max_purges;
-		if (nptcg == PALO_MAX_TLB_PURGES) {
-			need_ptcg_sem = 0;
-			return;
-		}
-		goto resetsema;
-	}
-	if (palo_override) {
-		if (nptcg != PALO_MAX_TLB_PURGES)
-			need_ptcg_sem = (num_possible_cpus() > nptcg);
-		return;
-	}
-
-	/* In PAL_VM_SUMMARY max_purges == 0 actually means 1 */
-	if (max_purges == 0) max_purges = 1;
-
-	if (firstcpu) {
-		nptcg = max_purges;
-		firstcpu = 0;
-	}
-	if (max_purges < nptcg)
-		nptcg = max_purges;
-	if (nptcg == PAL_MAX_PURGES) {
-		need_ptcg_sem = 0;
-		return;
-	} else
-		need_ptcg_sem = (num_possible_cpus() > nptcg);
-
-resetsema:
-	spinaphore_init(&ptcg_sem, max_purges);
-}
-
-#ifdef CONFIG_SMP
-static void
-ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start,
-		       unsigned long end, unsigned long nbits)
-{
-	struct mm_struct *active_mm = current->active_mm;
-
-	toolatetochangeptcgsem = 1;
-
-	if (mm != active_mm) {
-		/* Restore region IDs for mm */
-		if (mm && active_mm) {
-			activate_context(mm);
-		} else {
-			flush_tlb_all();
-			return;
-		}
-	}
-
-	if (need_ptcg_sem)
-		down_spin(&ptcg_sem);
-
-	do {
-		/*
-		 * Flush ALAT entries also.
-		 */
-		ia64_ptcga(start, (nbits << 2));
-		ia64_srlz_i();
-		start += (1UL << nbits);
-	} while (start < end);
-
-	if (need_ptcg_sem)
-		up_spin(&ptcg_sem);
-
-        if (mm != active_mm) {
-                activate_context(active_mm);
-        }
-}
-#endif /* CONFIG_SMP */
-
-void
-local_flush_tlb_all (void)
-{
-	unsigned long i, j, flags, count0, count1, stride0, stride1, addr;
-
-	addr    = local_cpu_data->ptce_base;
-	count0  = local_cpu_data->ptce_count[0];
-	count1  = local_cpu_data->ptce_count[1];
-	stride0 = local_cpu_data->ptce_stride[0];
-	stride1 = local_cpu_data->ptce_stride[1];
-
-	local_irq_save(flags);
-	for (i = 0; i < count0; ++i) {
-		for (j = 0; j < count1; ++j) {
-			ia64_ptce(addr);
-			addr += stride1;
-		}
-		addr += stride0;
-	}
-	local_irq_restore(flags);
-	ia64_srlz_i();			/* srlz.i implies srlz.d */
-}
-
-static void
-__flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
-		 unsigned long end)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long size = end - start;
-	unsigned long nbits;
-
-#ifndef CONFIG_SMP
-	if (mm != current->active_mm) {
-		mm->context = 0;
-		return;
-	}
-#endif
-
-	nbits = ia64_fls(size + 0xfff);
-	while (unlikely (((1UL << nbits) & purge.mask) == 0) &&
-			(nbits < purge.max_bits))
-		++nbits;
-	if (nbits > purge.max_bits)
-		nbits = purge.max_bits;
-	start &= ~((1UL << nbits) - 1);
-
-	preempt_disable();
-#ifdef CONFIG_SMP
-	if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
-		ia64_global_tlb_purge(mm, start, end, nbits);
-		preempt_enable();
-		return;
-	}
-#endif
-	do {
-		ia64_ptcl(start, (nbits<<2));
-		start += (1UL << nbits);
-	} while (start < end);
-	preempt_enable();
-	ia64_srlz_i();			/* srlz.i implies srlz.d */
-}
-
-void flush_tlb_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
-{
-	if (unlikely(end - start >= 1024*1024*1024*1024UL
-			|| REGION_NUMBER(start) != REGION_NUMBER(end - 1))) {
-		/*
-		 * If we flush more than a tera-byte or across regions, we're
-		 * probably better off just flushing the entire TLB(s).  This
-		 * should be very rare and is not worth optimizing for.
-		 */
-		flush_tlb_all();
-	} else {
-		/* flush the address range from the tlb */
-		__flush_tlb_range(vma, start, end);
-		/* flush the virt. page-table area mapping the addr range */
-		__flush_tlb_range(vma, ia64_thash(start), ia64_thash(end));
-	}
-}
-EXPORT_SYMBOL(flush_tlb_range);
-
-void ia64_tlb_init(void)
-{
-	ia64_ptce_info_t ptce_info;
-	u64 tr_pgbits;
-	long status;
-	pal_vm_info_1_u_t vm_info_1;
-	pal_vm_info_2_u_t vm_info_2;
-	int cpu = smp_processor_id();
-
-	if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) {
-		printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld; "
-		       "defaulting to architected purge page-sizes.\n", status);
-		purge.mask = 0x115557000UL;
-	}
-	purge.max_bits = ia64_fls(purge.mask);
-
-	ia64_get_ptce(&ptce_info);
-	local_cpu_data->ptce_base = ptce_info.base;
-	local_cpu_data->ptce_count[0] = ptce_info.count[0];
-	local_cpu_data->ptce_count[1] = ptce_info.count[1];
-	local_cpu_data->ptce_stride[0] = ptce_info.stride[0];
-	local_cpu_data->ptce_stride[1] = ptce_info.stride[1];
-
-	local_flush_tlb_all();	/* nuke left overs from bootstrapping... */
-	status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2);
-
-	if (status) {
-		printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status);
-		per_cpu(ia64_tr_num, cpu) = 8;
-		return;
-	}
-	per_cpu(ia64_tr_num, cpu) = vm_info_1.pal_vm_info_1_s.max_itr_entry+1;
-	if (per_cpu(ia64_tr_num, cpu) >
-				(vm_info_1.pal_vm_info_1_s.max_dtr_entry+1))
-		per_cpu(ia64_tr_num, cpu) =
-				vm_info_1.pal_vm_info_1_s.max_dtr_entry+1;
-	if (per_cpu(ia64_tr_num, cpu) > IA64_TR_ALLOC_MAX) {
-		static int justonce = 1;
-		per_cpu(ia64_tr_num, cpu) = IA64_TR_ALLOC_MAX;
-		if (justonce) {
-			justonce = 0;
-			printk(KERN_DEBUG "TR register number exceeds "
-			       "IA64_TR_ALLOC_MAX!\n");
-		}
-	}
-}
-
-/*
- * is_tr_overlap
- *
- * Check overlap with inserted TRs.
- */
-static int is_tr_overlap(struct ia64_tr_entry *p, u64 va, u64 log_size)
-{
-	u64 tr_log_size;
-	u64 tr_end;
-	u64 va_rr = ia64_get_rr(va);
-	u64 va_rid = RR_TO_RID(va_rr);
-	u64 va_end = va + (1<<log_size) - 1;
-
-	if (va_rid != RR_TO_RID(p->rr))
-		return 0;
-	tr_log_size = (p->itir & 0xff) >> 2;
-	tr_end = p->ifa + (1<<tr_log_size) - 1;
-
-	if (va > tr_end || p->ifa > va_end)
-		return 0;
-	return 1;
-
-}
-
-/*
- * ia64_insert_tr in virtual mode. Allocate a TR slot
- *
- * target_mask : 0x1 : itr, 0x2 : dtr, 0x3 : idtr
- *
- * va 	: virtual address.
- * pte 	: pte entries inserted.
- * log_size: range to be covered.
- *
- * Return value:  <0 :  error No.
- *
- *		  >=0 : slot number allocated for TR.
- * Must be called with preemption disabled.
- */
-int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size)
-{
-	int i, r;
-	unsigned long psr;
-	struct ia64_tr_entry *p;
-	int cpu = smp_processor_id();
-
-	if (!ia64_idtrs[cpu]) {
-		ia64_idtrs[cpu] = kmalloc_array(2 * IA64_TR_ALLOC_MAX,
-						sizeof(struct ia64_tr_entry),
-						GFP_KERNEL);
-		if (!ia64_idtrs[cpu])
-			return -ENOMEM;
-	}
-	r = -EINVAL;
-	/*Check overlap with existing TR entries*/
-	if (target_mask & 0x1) {
-		p = ia64_idtrs[cpu];
-		for (i = IA64_TR_ALLOC_BASE; i <= per_cpu(ia64_tr_used, cpu);
-								i++, p++) {
-			if (p->pte & 0x1)
-				if (is_tr_overlap(p, va, log_size)) {
-					printk(KERN_DEBUG "Overlapped Entry"
-						"Inserted for TR Register!!\n");
-					goto out;
-			}
-		}
-	}
-	if (target_mask & 0x2) {
-		p = ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX;
-		for (i = IA64_TR_ALLOC_BASE; i <= per_cpu(ia64_tr_used, cpu);
-								i++, p++) {
-			if (p->pte & 0x1)
-				if (is_tr_overlap(p, va, log_size)) {
-					printk(KERN_DEBUG "Overlapped Entry"
-						"Inserted for TR Register!!\n");
-					goto out;
-				}
-		}
-	}
-
-	for (i = IA64_TR_ALLOC_BASE; i < per_cpu(ia64_tr_num, cpu); i++) {
-		switch (target_mask & 0x3) {
-		case 1:
-			if (!((ia64_idtrs[cpu] + i)->pte & 0x1))
-				goto found;
-			continue;
-		case 2:
-			if (!((ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i)->pte & 0x1))
-				goto found;
-			continue;
-		case 3:
-			if (!((ia64_idtrs[cpu] + i)->pte & 0x1) &&
-			    !((ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i)->pte & 0x1))
-				goto found;
-			continue;
-		default:
-			r = -EINVAL;
-			goto out;
-		}
-	}
-found:
-	if (i >= per_cpu(ia64_tr_num, cpu))
-		return -EBUSY;
-
-	/*Record tr info for mca handler use!*/
-	if (i > per_cpu(ia64_tr_used, cpu))
-		per_cpu(ia64_tr_used, cpu) = i;
-
-	psr = ia64_clear_ic();
-	if (target_mask & 0x1) {
-		ia64_itr(0x1, i, va, pte, log_size);
-		ia64_srlz_i();
-		p = ia64_idtrs[cpu] + i;
-		p->ifa = va;
-		p->pte = pte;
-		p->itir = log_size << 2;
-		p->rr = ia64_get_rr(va);
-	}
-	if (target_mask & 0x2) {
-		ia64_itr(0x2, i, va, pte, log_size);
-		ia64_srlz_i();
-		p = ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i;
-		p->ifa = va;
-		p->pte = pte;
-		p->itir = log_size << 2;
-		p->rr = ia64_get_rr(va);
-	}
-	ia64_set_psr(psr);
-	r = i;
-out:
-	return r;
-}
-EXPORT_SYMBOL_GPL(ia64_itr_entry);
-
-/*
- * ia64_purge_tr
- *
- * target_mask: 0x1: purge itr, 0x2 : purge dtr, 0x3 purge idtr.
- * slot: slot number to be freed.
- *
- * Must be called with preemption disabled.
- */
-void ia64_ptr_entry(u64 target_mask, int slot)
-{
-	int cpu = smp_processor_id();
-	int i;
-	struct ia64_tr_entry *p;
-
-	if (slot < IA64_TR_ALLOC_BASE || slot >= per_cpu(ia64_tr_num, cpu))
-		return;
-
-	if (target_mask & 0x1) {
-		p = ia64_idtrs[cpu] + slot;
-		if ((p->pte&0x1) && is_tr_overlap(p, p->ifa, p->itir>>2)) {
-			p->pte = 0;
-			ia64_ptr(0x1, p->ifa, p->itir>>2);
-			ia64_srlz_i();
-		}
-	}
-
-	if (target_mask & 0x2) {
-		p = ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + slot;
-		if ((p->pte & 0x1) && is_tr_overlap(p, p->ifa, p->itir>>2)) {
-			p->pte = 0;
-			ia64_ptr(0x2, p->ifa, p->itir>>2);
-			ia64_srlz_i();
-		}
-	}
-
-	for (i = per_cpu(ia64_tr_used, cpu); i >= IA64_TR_ALLOC_BASE; i--) {
-		if (((ia64_idtrs[cpu] + i)->pte & 0x1) ||
-		    ((ia64_idtrs[cpu] + IA64_TR_ALLOC_MAX + i)->pte & 0x1))
-			break;
-	}
-	per_cpu(ia64_tr_used, cpu) = i;
-}
-EXPORT_SYMBOL_GPL(ia64_ptr_entry);
diff --git a/arch/ia64/pci/Makefile b/arch/ia64/pci/Makefile
deleted file mode 100644
index 81ea50eeb527..000000000000
--- a/arch/ia64/pci/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the ia64-specific parts of the pci bus
-#
-obj-y		:= pci.o fixup.o
diff --git a/arch/ia64/pci/fixup.c b/arch/ia64/pci/fixup.c
deleted file mode 100644
index 2bcdd7d3a1ad..000000000000
--- a/arch/ia64/pci/fixup.c
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Exceptions for specific devices. Usually work-arounds for fatal design flaws.
- * Derived from fixup.c of i386 tree.
- */
-
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/vgaarb.h>
-#include <linux/screen_info.h>
-#include <asm/uv/uv.h>
-
-/*
- * Fixup to mark boot BIOS video selected by BIOS before it changes
- *
- * From information provided by "Jon Smirl" <jonsmirl@gmail.com>
- *
- * The standard boot ROM sequence for an x86 machine uses the BIOS
- * to select an initial video card for boot display. This boot video
- * card will have its BIOS copied to 0xC0000 in system RAM.
- * IORESOURCE_ROM_SHADOW is used to associate the boot video
- * card with this copy. On laptops this copy has to be used since
- * the main ROM may be compressed or combined with another image.
- * See pci_map_rom() for use of this flag. Before marking the device
- * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set
- * by either arch code or vga-arbitration; if so only apply the fixup to this
- * already-determined primary video card.
- */
-
-static void pci_fixup_video(struct pci_dev *pdev)
-{
-	struct pci_dev *bridge;
-	struct pci_bus *bus;
-	u16 config;
-	struct resource *res;
-
-	if (is_uv_system())
-		return;
-	/* Maybe, this machine supports legacy memory map. */
-
-	/* Is VGA routed to us? */
-	bus = pdev->bus;
-	while (bus) {
-		bridge = bus->self;
-
-		/*
-		 * From information provided by
-		 * "David Miller" <davem@davemloft.net>
-		 * The bridge control register is valid for PCI header
-		 * type BRIDGE, or CARDBUS. Host to PCI controllers use
-		 * PCI header type NORMAL.
-		 */
-		if (bridge && (pci_is_bridge(bridge))) {
-			pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
-						&config);
-			if (!(config & PCI_BRIDGE_CTL_VGA))
-				return;
-		}
-		bus = bus->parent;
-	}
-	if (!vga_default_device() || pdev == vga_default_device()) {
-		pci_read_config_word(pdev, PCI_COMMAND, &config);
-		if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
-			res = &pdev->resource[PCI_ROM_RESOURCE];
-
-			pci_disable_rom(pdev);
-			if (res->parent)
-				release_resource(res);
-
-			res->start = 0xC0000;
-			res->end = res->start + 0x20000 - 1;
-			res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW |
-				     IORESOURCE_PCI_FIXED;
-			dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n",
-				 res);
-		}
-	}
-}
-DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID,
-			       PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
deleted file mode 100644
index 0a0328e61bef..000000000000
--- a/arch/ia64/pci/pci.c
+++ /dev/null
@@ -1,576 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * pci.c - Low-Level PCI Access in IA-64
- *
- * Derived from bios32.c of i386 tree.
- *
- * (c) Copyright 2002, 2005 Hewlett-Packard Development Company, L.P.
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- * Copyright (C) 2004 Silicon Graphics, Inc.
- *
- * Note: Above list of copyright holders is incomplete...
- */
-
-#include <linux/acpi.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/pci-acpi.h>
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/memblock.h>
-#include <linux/export.h>
-
-#include <asm/page.h>
-#include <asm/io.h>
-#include <asm/sal.h>
-#include <asm/smp.h>
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-
-/*
- * Low-level SAL-based PCI configuration access functions. Note that SAL
- * calls are already serialized (via sal_lock), so we don't need another
- * synchronization mechanism here.
- */
-
-#define PCI_SAL_ADDRESS(seg, bus, devfn, reg)		\
-	(((u64) seg << 24) | (bus << 16) | (devfn << 8) | (reg))
-
-/* SAL 3.2 adds support for extended config space. */
-
-#define PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg)	\
-	(((u64) seg << 28) | (bus << 20) | (devfn << 12) | (reg))
-
-int raw_pci_read(unsigned int seg, unsigned int bus, unsigned int devfn,
-	      int reg, int len, u32 *value)
-{
-	u64 addr, data = 0;
-	int mode, result;
-
-	if (!value || (seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095))
-		return -EINVAL;
-
-	if ((seg | reg) <= 255) {
-		addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg);
-		mode = 0;
-	} else if (sal_revision >= SAL_VERSION_CODE(3,2)) {
-		addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg);
-		mode = 1;
-	} else {
-		return -EINVAL;
-	}
-
-	result = ia64_sal_pci_config_read(addr, mode, len, &data);
-	if (result != 0)
-		return -EINVAL;
-
-	*value = (u32) data;
-	return 0;
-}
-
-int raw_pci_write(unsigned int seg, unsigned int bus, unsigned int devfn,
-	       int reg, int len, u32 value)
-{
-	u64 addr;
-	int mode, result;
-
-	if ((seg > 65535) || (bus > 255) || (devfn > 255) || (reg > 4095))
-		return -EINVAL;
-
-	if ((seg | reg) <= 255) {
-		addr = PCI_SAL_ADDRESS(seg, bus, devfn, reg);
-		mode = 0;
-	} else if (sal_revision >= SAL_VERSION_CODE(3,2)) {
-		addr = PCI_SAL_EXT_ADDRESS(seg, bus, devfn, reg);
-		mode = 1;
-	} else {
-		return -EINVAL;
-	}
-	result = ia64_sal_pci_config_write(addr, mode, len, value);
-	if (result != 0)
-		return -EINVAL;
-	return 0;
-}
-
-static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
-							int size, u32 *value)
-{
-	return raw_pci_read(pci_domain_nr(bus), bus->number,
-				 devfn, where, size, value);
-}
-
-static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
-							int size, u32 value)
-{
-	return raw_pci_write(pci_domain_nr(bus), bus->number,
-				  devfn, where, size, value);
-}
-
-struct pci_ops pci_root_ops = {
-	.read = pci_read,
-	.write = pci_write,
-};
-
-struct pci_root_info {
-	struct acpi_pci_root_info common;
-	struct pci_controller controller;
-	struct list_head io_resources;
-};
-
-static unsigned int new_space(u64 phys_base, int sparse)
-{
-	u64 mmio_base;
-	int i;
-
-	if (phys_base == 0)
-		return 0;	/* legacy I/O port space */
-
-	mmio_base = (u64) ioremap(phys_base, 0);
-	for (i = 0; i < num_io_spaces; i++)
-		if (io_space[i].mmio_base == mmio_base &&
-		    io_space[i].sparse == sparse)
-			return i;
-
-	if (num_io_spaces == MAX_IO_SPACES) {
-		pr_err("PCI: Too many IO port spaces "
-			"(MAX_IO_SPACES=%lu)\n", MAX_IO_SPACES);
-		return ~0;
-	}
-
-	i = num_io_spaces++;
-	io_space[i].mmio_base = mmio_base;
-	io_space[i].sparse = sparse;
-
-	return i;
-}
-
-static int add_io_space(struct device *dev, struct pci_root_info *info,
-			struct resource_entry *entry)
-{
-	struct resource_entry *iospace;
-	struct resource *resource, *res = entry->res;
-	char *name;
-	unsigned long base, min, max, base_port;
-	unsigned int sparse = 0, space_nr, len;
-
-	len = strlen(info->common.name) + 32;
-	iospace = resource_list_create_entry(NULL, len);
-	if (!iospace) {
-		dev_err(dev, "PCI: No memory for %s I/O port space\n",
-			info->common.name);
-		return -ENOMEM;
-	}
-
-	if (res->flags & IORESOURCE_IO_SPARSE)
-		sparse = 1;
-	space_nr = new_space(entry->offset, sparse);
-	if (space_nr == ~0)
-		goto free_resource;
-
-	name = (char *)(iospace + 1);
-	min = res->start - entry->offset;
-	max = res->end - entry->offset;
-	base = __pa(io_space[space_nr].mmio_base);
-	base_port = IO_SPACE_BASE(space_nr);
-	snprintf(name, len, "%s I/O Ports %08lx-%08lx", info->common.name,
-		 base_port + min, base_port + max);
-
-	/*
-	 * The SDM guarantees the legacy 0-64K space is sparse, but if the
-	 * mapping is done by the processor (not the bridge), ACPI may not
-	 * mark it as sparse.
-	 */
-	if (space_nr == 0)
-		sparse = 1;
-
-	resource = iospace->res;
-	resource->name  = name;
-	resource->flags = IORESOURCE_MEM;
-	resource->start = base + (sparse ? IO_SPACE_SPARSE_ENCODING(min) : min);
-	resource->end   = base + (sparse ? IO_SPACE_SPARSE_ENCODING(max) : max);
-	if (insert_resource(&iomem_resource, resource)) {
-		dev_err(dev,
-			"can't allocate host bridge io space resource  %pR\n",
-			resource);
-		goto free_resource;
-	}
-
-	entry->offset = base_port;
-	res->start = min + base_port;
-	res->end = max + base_port;
-	resource_list_add_tail(iospace, &info->io_resources);
-
-	return 0;
-
-free_resource:
-	resource_list_free_entry(iospace);
-	return -ENOSPC;
-}
-
-/*
- * An IO port or MMIO resource assigned to a PCI host bridge may be
- * consumed by the host bridge itself or available to its child
- * bus/devices. The ACPI specification defines a bit (Producer/Consumer)
- * to tell whether the resource is consumed by the host bridge itself,
- * but firmware hasn't used that bit consistently, so we can't rely on it.
- *
- * On x86 and IA64 platforms, all IO port and MMIO resources are assumed
- * to be available to child bus/devices except one special case:
- *     IO port [0xCF8-0xCFF] is consumed by the host bridge itself
- *     to access PCI configuration space.
- *
- * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF].
- */
-static bool resource_is_pcicfg_ioport(struct resource *res)
-{
-	return (res->flags & IORESOURCE_IO) &&
-		res->start == 0xCF8 && res->end == 0xCFF;
-}
-
-static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
-{
-	struct device *dev = &ci->bridge->dev;
-	struct pci_root_info *info;
-	struct resource *res;
-	struct resource_entry *entry, *tmp;
-	int status;
-
-	status = acpi_pci_probe_root_resources(ci);
-	if (status > 0) {
-		info = container_of(ci, struct pci_root_info, common);
-		resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
-			res = entry->res;
-			if (res->flags & IORESOURCE_MEM) {
-				/*
-				 * HP's firmware has a hack to work around a
-				 * Windows bug. Ignore these tiny memory ranges.
-				 */
-				if (resource_size(res) <= 16) {
-					resource_list_del(entry);
-					insert_resource(&iomem_resource,
-							entry->res);
-					resource_list_add_tail(entry,
-							&info->io_resources);
-				}
-			} else if (res->flags & IORESOURCE_IO) {
-				if (resource_is_pcicfg_ioport(entry->res))
-					resource_list_destroy_entry(entry);
-				else if (add_io_space(dev, info, entry))
-					resource_list_destroy_entry(entry);
-			}
-		}
-	}
-
-	return status;
-}
-
-static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci)
-{
-	struct pci_root_info *info;
-	struct resource_entry *entry, *tmp;
-
-	info = container_of(ci, struct pci_root_info, common);
-	resource_list_for_each_entry_safe(entry, tmp, &info->io_resources) {
-		release_resource(entry->res);
-		resource_list_destroy_entry(entry);
-	}
-	kfree(info);
-}
-
-static struct acpi_pci_root_ops pci_acpi_root_ops = {
-	.pci_ops = &pci_root_ops,
-	.release_info = pci_acpi_root_release_info,
-	.prepare_resources = pci_acpi_root_prepare_resources,
-};
-
-struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
-{
-	struct acpi_device *device = root->device;
-	struct pci_root_info *info;
-
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info) {
-		dev_err(&device->dev,
-			"pci_bus %04x:%02x: ignored (out of memory)\n",
-			root->segment, (int)root->secondary.start);
-		return NULL;
-	}
-
-	info->controller.segment = root->segment;
-	info->controller.companion = device;
-	info->controller.node = acpi_get_node(device->handle);
-	INIT_LIST_HEAD(&info->io_resources);
-	return acpi_pci_root_create(root, &pci_acpi_root_ops,
-				    &info->common, &info->controller);
-}
-
-int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
-{
-	/*
-	 * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL
-	 * here, pci_create_root_bus() has been called by someone else and
-	 * sysdata is likely to be different from what we expect.  Let it go in
-	 * that case.
-	 */
-	if (!bridge->dev.parent) {
-		struct pci_controller *controller = bridge->bus->sysdata;
-		ACPI_COMPANION_SET(&bridge->dev, controller->companion);
-	}
-	return 0;
-}
-
-void pcibios_fixup_device_resources(struct pci_dev *dev)
-{
-	int idx;
-
-	if (!dev->bus)
-		return;
-
-	for (idx = 0; idx < PCI_BRIDGE_RESOURCES; idx++) {
-		struct resource *r = &dev->resource[idx];
-
-		if (!r->flags || r->parent || !r->start)
-			continue;
-
-		pci_claim_resource(dev, idx);
-	}
-}
-EXPORT_SYMBOL_GPL(pcibios_fixup_device_resources);
-
-static void pcibios_fixup_bridge_resources(struct pci_dev *dev)
-{
-	int idx;
-
-	if (!dev->bus)
-		return;
-
-	for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
-		struct resource *r = &dev->resource[idx];
-
-		if (!r->flags || r->parent || !r->start)
-			continue;
-
-		pci_claim_bridge_resource(dev, idx);
-	}
-}
-
-/*
- *  Called after each bus is probed, but before its children are examined.
- */
-void pcibios_fixup_bus(struct pci_bus *b)
-{
-	struct pci_dev *dev;
-
-	if (b->self) {
-		pci_read_bridge_bases(b);
-		pcibios_fixup_bridge_resources(b->self);
-	}
-	list_for_each_entry(dev, &b->devices, bus_list)
-		pcibios_fixup_device_resources(dev);
-}
-
-void pcibios_add_bus(struct pci_bus *bus)
-{
-	acpi_pci_add_bus(bus);
-}
-
-void pcibios_remove_bus(struct pci_bus *bus)
-{
-	acpi_pci_remove_bus(bus);
-}
-
-void pcibios_set_master (struct pci_dev *dev)
-{
-	/* No special bus mastering setup handling */
-}
-
-int
-pcibios_enable_device (struct pci_dev *dev, int mask)
-{
-	int ret;
-
-	ret = pci_enable_resources(dev, mask);
-	if (ret < 0)
-		return ret;
-
-	if (!pci_dev_msi_enabled(dev))
-		return acpi_pci_irq_enable(dev);
-	return 0;
-}
-
-void
-pcibios_disable_device (struct pci_dev *dev)
-{
-	BUG_ON(atomic_read(&dev->enable_cnt));
-	if (!pci_dev_msi_enabled(dev))
-		acpi_pci_irq_disable(dev);
-}
-
-/**
- * pci_get_legacy_mem - generic legacy mem routine
- * @bus: bus to get legacy memory base address for
- *
- * Find the base of legacy memory for @bus.  This is typically the first
- * megabyte of bus address space for @bus or is simply 0 on platforms whose
- * chipsets support legacy I/O and memory routing.  Returns the base address
- * or an error pointer if an error occurred.
- *
- * This is the ia64 generic version of this routine.  Other platforms
- * are free to override it with a machine vector.
- */
-char *pci_get_legacy_mem(struct pci_bus *bus)
-{
-	return (char *)__IA64_UNCACHED_OFFSET;
-}
-
-/**
- * pci_mmap_legacy_page_range - map legacy memory space to userland
- * @bus: bus whose legacy space we're mapping
- * @vma: vma passed in by mmap
- *
- * Map legacy memory space for this device back to userspace using a machine
- * vector to get the base address.
- */
-int
-pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma,
-			   enum pci_mmap_state mmap_state)
-{
-	unsigned long size = vma->vm_end - vma->vm_start;
-	pgprot_t prot;
-	char *addr;
-
-	/* We only support mmap'ing of legacy memory space */
-	if (mmap_state != pci_mmap_mem)
-		return -ENOSYS;
-
-	/*
-	 * Avoid attribute aliasing.  See Documentation/arch/ia64/aliasing.rst
-	 * for more details.
-	 */
-	if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size))
-		return -EINVAL;
-	prot = phys_mem_access_prot(NULL, vma->vm_pgoff, size,
-				    vma->vm_page_prot);
-
-	addr = pci_get_legacy_mem(bus);
-	if (IS_ERR(addr))
-		return PTR_ERR(addr);
-
-	vma->vm_pgoff += (unsigned long)addr >> PAGE_SHIFT;
-	vma->vm_page_prot = prot;
-
-	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			    size, vma->vm_page_prot))
-		return -EAGAIN;
-
-	return 0;
-}
-
-/**
- * pci_legacy_read - read from legacy I/O space
- * @bus: bus to read
- * @port: legacy port value
- * @val: caller allocated storage for returned value
- * @size: number of bytes to read
- *
- * Simply reads @size bytes from @port and puts the result in @val.
- *
- * Again, this (and the write routine) are generic versions that can be
- * overridden by the platform.  This is necessary on platforms that don't
- * support legacy I/O routing or that hard fail on legacy I/O timeouts.
- */
-int pci_legacy_read(struct pci_bus *bus, u16 port, u32 *val, u8 size)
-{
-	int ret = size;
-
-	switch (size) {
-	case 1:
-		*val = inb(port);
-		break;
-	case 2:
-		*val = inw(port);
-		break;
-	case 4:
-		*val = inl(port);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-/**
- * pci_legacy_write - perform a legacy I/O write
- * @bus: bus pointer
- * @port: port to write
- * @val: value to write
- * @size: number of bytes to write from @val
- *
- * Simply writes @size bytes of @val to @port.
- */
-int pci_legacy_write(struct pci_bus *bus, u16 port, u32 val, u8 size)
-{
-	int ret = size;
-
-	switch (size) {
-	case 1:
-		outb(val, port);
-		break;
-	case 2:
-		outw(val, port);
-		break;
-	case 4:
-		outl(val, port);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
-
-/**
- * set_pci_cacheline_size - determine cacheline size for PCI devices
- *
- * We want to use the line-size of the outer-most cache.  We assume
- * that this line-size is the same for all CPUs.
- *
- * Code mostly taken from arch/ia64/kernel/palinfo.c:cache_info().
- */
-static void __init set_pci_dfl_cacheline_size(void)
-{
-	unsigned long levels, unique_caches;
-	long status;
-	pal_cache_config_info_t cci;
-
-	status = ia64_pal_cache_summary(&levels, &unique_caches);
-	if (status != 0) {
-		pr_err("%s: ia64_pal_cache_summary() failed "
-			"(status=%ld)\n", __func__, status);
-		return;
-	}
-
-	status = ia64_pal_cache_config_info(levels - 1,
-				/* cache_type (data_or_unified)= */ 2, &cci);
-	if (status != 0) {
-		pr_err("%s: ia64_pal_cache_config_info() failed "
-			"(status=%ld)\n", __func__, status);
-		return;
-	}
-	pci_dfl_cache_line_size = (1 << cci.pcci_line_size) / 4;
-}
-
-static int __init pcibios_init(void)
-{
-	set_pci_dfl_cacheline_size();
-	return 0;
-}
-
-subsys_initcall(pcibios_init);
diff --git a/arch/ia64/scripts/check-gas b/arch/ia64/scripts/check-gas
deleted file mode 100755
index 787cf9b6b04a..000000000000
--- a/arch/ia64/scripts/check-gas
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-dir=$(dirname $0)
-CC=$1
-OBJDUMP=$2
-tmp=${TMPDIR:-/tmp}
-out=$tmp/out$$.o
-$CC -c $dir/check-gas-asm.S -o $out
-res=$($OBJDUMP -r --section .data $out | fgrep 00004 | tr -s ' ' |cut -f3 -d' ')
-rm -f $out
-if [ $res != ".text" ]; then
-	echo buggy
-else
-	echo good
-fi
-exit 0
diff --git a/arch/ia64/scripts/check-gas-asm.S b/arch/ia64/scripts/check-gas-asm.S
deleted file mode 100644
index 010e1d227e5d..000000000000
--- a/arch/ia64/scripts/check-gas-asm.S
+++ /dev/null
@@ -1,2 +0,0 @@
-[1:]	nop 0
-	.xdata4 ".data", 0, 1b-.
diff --git a/arch/ia64/scripts/check-model.c b/arch/ia64/scripts/check-model.c
deleted file mode 100644
index e1d4e86e3d63..000000000000
--- a/arch/ia64/scripts/check-model.c
+++ /dev/null
@@ -1 +0,0 @@
-int __attribute__ ((__model__ (__small__))) x;
diff --git a/arch/ia64/scripts/check-segrel.S b/arch/ia64/scripts/check-segrel.S
deleted file mode 100644
index 65d6378adaaa..000000000000
--- a/arch/ia64/scripts/check-segrel.S
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.rodata
-	data4 @segrel(start)
-	.data
-start:
diff --git a/arch/ia64/scripts/check-segrel.lds b/arch/ia64/scripts/check-segrel.lds
deleted file mode 100644
index c385d246e458..000000000000
--- a/arch/ia64/scripts/check-segrel.lds
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-SECTIONS {
-	. = SIZEOF_HEADERS;
-	.rodata : { *(.rodata) } :ro
-	.note : { *(.note*) }
-	. = 0xa0000;
-	.data : { *(.data) } :dat
-	/DISCARD/ : { *(*) }
-}
-PHDRS {
-  ro PT_LOAD FILEHDR PHDRS;
-  dat PT_LOAD;
-}
diff --git a/arch/ia64/scripts/check-serialize.S b/arch/ia64/scripts/check-serialize.S
deleted file mode 100644
index 0400c106806c..000000000000
--- a/arch/ia64/scripts/check-serialize.S
+++ /dev/null
@@ -1,2 +0,0 @@
-	.serialize.data
-	.serialize.instruction
diff --git a/arch/ia64/scripts/check-text-align.S b/arch/ia64/scripts/check-text-align.S
deleted file mode 100644
index 107fa1c88c2e..000000000000
--- a/arch/ia64/scripts/check-text-align.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-	.proc foo
-	.prologue
-foo:	.save rp, r2
-	nop 0
-	.align 64
-	.endp foo
diff --git a/arch/ia64/scripts/toolchain-flags b/arch/ia64/scripts/toolchain-flags
deleted file mode 100755
index 12dff5c981cf..000000000000
--- a/arch/ia64/scripts/toolchain-flags
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# Check whether linker can handle cross-segment @segrel():
-#
-CPPFLAGS=""
-CC=$1
-OBJDUMP=$2
-READELF=$3
-dir=$(dirname $0)
-tmp=${TMPDIR:-/tmp}
-out=$tmp/out$$
-
-# Check whether cross-segment segment-relative relocs work fine.  We need
-# that for building the gate DSO:
-
-$CC -nostdlib -static -Wl,-T$dir/check-segrel.lds $dir/check-segrel.S -o $out
-res=$($OBJDUMP --full --section .rodata $out | fgrep 000 | cut -f3 -d' ')
-rm -f $out
-if [ $res != 00000a00 ]; then
-    CPPFLAGS="$CPPFLAGS -DHAVE_BUGGY_SEGREL"
-    cat >&2 <<EOF
-warning: your linker cannot handle cross-segment segment-relative relocations.
-         please upgrade to a newer version (it is safe to use this linker, but
-         the kernel will be bigger than strictly necessary).
-EOF
-fi
-
-# Check whether .align inside a function works as expected.
-
-$CC -c $dir/check-text-align.S -o $out
-$READELF -u $out | fgrep -q 'prologue(rlen=12)'
-res=$?
-rm -f $out
-if [ $res -eq 0 ]; then
-    CPPFLAGS="$CPPFLAGS -DHAVE_WORKING_TEXT_ALIGN"
-fi
-
-if ! $CC -c $dir/check-model.c -o $out 2>&1 | grep  __model__ | grep -q attrib
-then
-    CPPFLAGS="$CPPFLAGS -DHAVE_MODEL_SMALL_ATTRIBUTE"
-fi
-rm -f $out
-
-# Check whether assembler supports .serialize.{data,instruction} directive.
-
-$CC -c $dir/check-serialize.S -o $out 2>/dev/null
-res=$?
-rm -f $out
-if [ $res -eq 0 ]; then
-    CPPFLAGS="$CPPFLAGS -DHAVE_SERIALIZE_DIRECTIVE"
-fi
-
-echo $CPPFLAGS
diff --git a/arch/ia64/scripts/unwcheck.py b/arch/ia64/scripts/unwcheck.py
deleted file mode 100644
index 9581742f0db2..000000000000
--- a/arch/ia64/scripts/unwcheck.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-#
-# Usage: unwcheck.py FILE
-#
-# This script checks the unwind info of each function in file FILE
-# and verifies that the sum of the region-lengths matches the total
-# length of the function.
-#
-# Based on a shell/awk script originally written by Harish Patil,
-# which was converted to Perl by Matthew Chapman, which was converted
-# to Python by David Mosberger.
-#
-import os
-import re
-import sys
-
-if len(sys.argv) != 2:
-    print("Usage: %s FILE" % sys.argv[0])
-    sys.exit(2)
-
-readelf = os.getenv("READELF", "readelf")
-
-start_pattern = re.compile("<([^>]*)>: \[0x([0-9a-f]+)-0x([0-9a-f]+)\]")
-rlen_pattern  = re.compile(".*rlen=([0-9]+)")
-
-def check_func (func, slots, rlen_sum):
-    if slots != rlen_sum:
-        global num_errors
-        num_errors += 1
-        if not func: func = "[%#x-%#x]" % (start, end)
-        print("ERROR: %s: %lu slots, total region length = %lu" % (func, slots, rlen_sum))
-    return
-
-num_funcs = 0
-num_errors = 0
-func = False
-slots = 0
-rlen_sum = 0
-for line in os.popen("%s -u %s" % (readelf, sys.argv[1])):
-    m = start_pattern.match(line)
-    if m:
-        check_func(func, slots, rlen_sum)
-
-        func  = m.group(1)
-        start = int(m.group(2), 16)
-        end   = int(m.group(3), 16)
-        slots = 3 * (end - start) / 16
-        rlen_sum = 0
-        num_funcs += 1
-    else:
-        m = rlen_pattern.match(line)
-        if m:
-            rlen_sum += int(m.group(1))
-check_func(func, slots, rlen_sum)
-
-if num_errors == 0:
-    print("No errors detected in %u functions." % num_funcs)
-else:
-    if num_errors > 1:
-        err="errors"
-    else:
-        err="error"
-    print("%u %s detected in %u functions." % (num_errors, err, num_funcs))
-    sys.exit(1)
diff --git a/arch/ia64/uv/Makefile b/arch/ia64/uv/Makefile
deleted file mode 100644
index aa9f91947c49..000000000000
--- a/arch/ia64/uv/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# arch/ia64/uv/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2008 Silicon Graphics, Inc.  All Rights Reserved.
-#
-# Makefile for the sn uv subplatform
-#
-
-obj-y += kernel/
diff --git a/arch/ia64/uv/kernel/Makefile b/arch/ia64/uv/kernel/Makefile
deleted file mode 100644
index 297196578d19..000000000000
--- a/arch/ia64/uv/kernel/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# arch/ia64/uv/kernel/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2008 Silicon Graphics, Inc.  All Rights Reserved.
-#
-
-ccflags-y := -Iarch/ia64/sn/include
-
-obj-y				+= setup.o
diff --git a/arch/ia64/uv/kernel/setup.c b/arch/ia64/uv/kernel/setup.c
deleted file mode 100644
index bb025486d791..000000000000
--- a/arch/ia64/uv/kernel/setup.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * SGI UV Core Functions
- *
- * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
- */
-
-#include <linux/acpi.h>
-#include <linux/efi.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <asm/uv/uv.h>
-#include <asm/uv/uv_mmrs.h>
-#include <asm/uv/uv_hub.h>
-
-bool ia64_is_uv;
-EXPORT_SYMBOL_GPL(ia64_is_uv);
-
-DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
-
-struct redir_addr {
-	unsigned long redirect;
-	unsigned long alias;
-};
-
-#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
-
-static __initdata struct redir_addr redir_addrs[] = {
-	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
-	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
-	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
-};
-
-static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
-{
-	union uvh_si_alias0_overlay_config_u alias;
-	union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
-		alias.v = uv_read_local_mmr(redir_addrs[i].alias);
-		if (alias.s.base == 0) {
-			*size = (1UL << alias.s.m_alias);
-			redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
-			*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
-			return;
-		}
-	}
-	BUG();
-}
-
-void __init uv_probe_system_type(void)
-{
-	struct acpi_table_rsdp *rsdp;
-	struct acpi_table_xsdt *xsdt;
-
-	if (efi.acpi20 == EFI_INVALID_TABLE_ADDR) {
-		pr_err("ACPI 2.0 RSDP not found.\n");
-		return;
-	}
-
-	rsdp = (struct acpi_table_rsdp *)__va(efi.acpi20);
-	if (strncmp(rsdp->signature, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1)) {
-		pr_err("ACPI 2.0 RSDP signature incorrect.\n");
-		return;
-	}
-
-	xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_physical_address);
-	if (strncmp(xsdt->header.signature, ACPI_SIG_XSDT,
-			sizeof(ACPI_SIG_XSDT) - 1)) {
-		pr_err("ACPI 2.0 XSDT signature incorrect.\n");
-		return;
-	}
-
-	if (!strcmp(xsdt->header.oem_id, "SGI") &&
-	    !strcmp(xsdt->header.oem_table_id + 4, "UV"))
-		ia64_is_uv = true;
-}
-
-void __init uv_setup(char **cmdline_p)
-{
-	union uvh_si_addr_map_config_u m_n_config;
-	union uvh_node_id_u node_id;
-	unsigned long gnode_upper;
-	int nid, cpu, m_val, n_val;
-	unsigned long mmr_base, lowmem_redir_base, lowmem_redir_size;
-
-	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
-	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
-	mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
-			~UV_MMR_ENABLE;
-
-	m_val = m_n_config.s.m_skt;
-	n_val = m_n_config.s.n_skt;
-	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
-
-	gnode_upper = (((unsigned long)node_id.s.node_id) &
-		       ~((1 << n_val) - 1)) << m_val;
-
-	for_each_present_cpu(cpu) {
-		nid = cpu_to_node(cpu);
-		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
-		uv_cpu_hub_info(cpu)->lowmem_remap_top =
-			lowmem_redir_base + lowmem_redir_size;
-		uv_cpu_hub_info(cpu)->m_val = m_val;
-		uv_cpu_hub_info(cpu)->n_val = n_val;
-		uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) -1;
-		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
-		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
-		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
-		uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
-		printk(KERN_DEBUG "UV cpu %d, nid %d\n", cpu, nid);
-	}
-}
-
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index cee82b473dc5..554e487cbfab 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -257,7 +257,7 @@ config ACPI_CPU_FREQ_PSS
 config ACPI_PROCESSOR_CSTATE
 	def_bool y
 	depends on ACPI_PROCESSOR
-	depends on IA64 || X86
+	depends on X86
 
 config ACPI_PROCESSOR_IDLE
 	bool
@@ -281,9 +281,9 @@ config ACPI_CPPC_LIB
 
 config ACPI_PROCESSOR
 	tristate "Processor"
-	depends on X86 || IA64 || ARM64 || LOONGARCH
+	depends on X86 || ARM64 || LOONGARCH
 	select ACPI_PROCESSOR_IDLE
-	select ACPI_CPU_FREQ_PSS if X86 || IA64 || LOONGARCH
+	select ACPI_CPU_FREQ_PSS if X86 || LOONGARCH
 	select THERMAL
 	default y
 	help
diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig
index 39b1f34c21df..849c2bd820b9 100644
--- a/drivers/acpi/numa/Kconfig
+++ b/drivers/acpi/numa/Kconfig
@@ -2,8 +2,8 @@
 config ACPI_NUMA
 	bool "NUMA support"
 	depends on NUMA
-	depends on (X86 || IA64 || ARM64 || LOONGARCH)
-	default y if IA64 || ARM64
+	depends on (X86 || ARM64 || LOONGARCH)
+	default y if ARM64
 
 config ACPI_HMAT
 	bool "ACPI Heterogeneous Memory Attribute Table Support"
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index f725813d0cce..06289a93d3d2 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -276,7 +276,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
 	return NULL;
 }
 
-#if defined(CONFIG_IA64) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
+#if defined(CONFIG_ARM64) || defined(CONFIG_RISCV)
 /* ioremap will take care of cache attributes */
 #define should_use_kmap(pfn)   0
 #else
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 625af75833fc..7c8dd0abcfdf 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -348,7 +348,7 @@ config DEVPORT
 	  device is similar to /dev/mem, but for I/O ports.
 
 config HPET
-	bool "HPET - High Precision Event Timer" if (X86 || IA64)
+	bool "HPET - High Precision Event Timer" if X86
 	default n
 	depends on ACPI
 	help
@@ -377,7 +377,7 @@ config HPET_MMAP_DEFAULT
 
 config HANGCHECK_TIMER
 	tristate "Hangcheck timer"
-	depends on X86 || IA64 || PPC64 || S390
+	depends on X86 || PPC64 || S390
 	help
 	  The hangcheck-timer module detects when the system has gone
 	  out to lunch past a certain margin.  It can reboot the system
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index c5f532e412f1..e9b360cdc99a 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_TTY_PRINTK)	+= ttyprintk.o
 obj-y				+= misc.o
 obj-$(CONFIG_ATARI_DSP56K)	+= dsp56k.o
 obj-$(CONFIG_VIRTIO_CONSOLE)	+= virtio_console.o
-obj-$(CONFIG_MSPEC)		+= mspec.o
 obj-$(CONFIG_UV_MMTIMER)	+= uv_mmtimer.o
 obj-$(CONFIG_IBM_BSR)		+= bsr.o
 
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 4f501e4842ab..c47eb7bf06d4 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 menuconfig AGP
 	tristate "/dev/agpgart (AGP Support)"
-	depends on ALPHA || IA64 || PARISC || PPC || X86
+	depends on ALPHA || PARISC || PPC || X86
 	depends on PCI
 	help
 	  AGP (Accelerated Graphics Port) is a bus system mainly used to
@@ -109,20 +109,6 @@ config AGP_VIA
 	  This option gives you AGP support for the GLX component of
 	  X on VIA MVP3/Apollo Pro chipsets.
 
-config AGP_I460
-	tristate "Intel 460GX chipset support"
-	depends on AGP && IA64
-	help
-	  This option gives you AGP GART support for the Intel 460GX chipset
-	  for IA64 processors.
-
-config AGP_HP_ZX1
-	tristate "HP ZX1 chipset AGP support"
-	depends on AGP && IA64
-	help
-	  This option gives you AGP GART support for the HP ZX1 chipset
-	  for IA64 processors.
-
 config AGP_PARISC
 	tristate "HP Quicksilver AGP support"
 	depends on AGP && PARISC && 64BIT && IOMMU_SBA
diff --git a/drivers/char/agp/Makefile b/drivers/char/agp/Makefile
index 90ed8c789e48..25834557e486 100644
--- a/drivers/char/agp/Makefile
+++ b/drivers/char/agp/Makefile
@@ -14,9 +14,7 @@ obj-$(CONFIG_AGP_AMD)		+= amd-k7-agp.o
 obj-$(CONFIG_AGP_AMD64)		+= amd64-agp.o
 obj-$(CONFIG_AGP_ALPHA_CORE)	+= alpha-agp.o
 obj-$(CONFIG_AGP_EFFICEON)	+= efficeon-agp.o
-obj-$(CONFIG_AGP_HP_ZX1)	+= hp-agp.o
 obj-$(CONFIG_AGP_PARISC)	+= parisc-agp.o
-obj-$(CONFIG_AGP_I460)		+= i460-agp.o
 obj-$(CONFIG_AGP_INTEL)		+= intel-agp.o
 obj-$(CONFIG_INTEL_GTT)		+= intel-gtt.o
 obj-$(CONFIG_AGP_NVIDIA)	+= nvidia-agp.o
diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c
deleted file mode 100644
index 84d9adbb62f6..000000000000
--- a/drivers/char/agp/hp-agp.c
+++ /dev/null
@@ -1,550 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * HP zx1 AGPGART routines.
- *
- * (c) Copyright 2002, 2003 Hewlett-Packard Development Company, L.P.
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/agp_backend.h>
-#include <linux/log2.h>
-#include <linux/slab.h>
-
-#include <asm/acpi-ext.h>
-
-#include "agp.h"
-
-#define HP_ZX1_IOC_OFFSET	0x1000  /* ACPI reports SBA, we want IOC */
-
-/* HP ZX1 IOC registers */
-#define HP_ZX1_IBASE		0x300
-#define HP_ZX1_IMASK		0x308
-#define HP_ZX1_PCOM		0x310
-#define HP_ZX1_TCNFG		0x318
-#define HP_ZX1_PDIR_BASE	0x320
-
-#define HP_ZX1_IOVA_BASE	GB(1UL)
-#define HP_ZX1_IOVA_SIZE	GB(1UL)
-#define HP_ZX1_GART_SIZE	(HP_ZX1_IOVA_SIZE / 2)
-#define HP_ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL
-
-#define HP_ZX1_PDIR_VALID_BIT	0x8000000000000000UL
-#define HP_ZX1_IOVA_TO_PDIR(va)	((va - hp_private.iova_base) >> hp_private.io_tlb_shift)
-
-#define AGP8X_MODE_BIT		3
-#define AGP8X_MODE		(1 << AGP8X_MODE_BIT)
-
-/* AGP bridge need not be PCI device, but DRM thinks it is. */
-static struct pci_dev fake_bridge_dev;
-
-static int hp_zx1_gart_found;
-
-static struct aper_size_info_fixed hp_zx1_sizes[] =
-{
-	{0, 0, 0},		/* filled in by hp_zx1_fetch_size() */
-};
-
-static struct gatt_mask hp_zx1_masks[] =
-{
-	{.mask = HP_ZX1_PDIR_VALID_BIT, .type = 0}
-};
-
-static struct _hp_private {
-	volatile u8 __iomem *ioc_regs;
-	volatile u8 __iomem *lba_regs;
-	int lba_cap_offset;
-	u64 *io_pdir;		// PDIR for entire IOVA
-	u64 *gatt;		// PDIR just for GART (subset of above)
-	u64 gatt_entries;
-	u64 iova_base;
-	u64 gart_base;
-	u64 gart_size;
-	u64 io_pdir_size;
-	int io_pdir_owner;	// do we own it, or share it with sba_iommu?
-	int io_page_size;
-	int io_tlb_shift;
-	int io_tlb_ps;		// IOC ps config
-	int io_pages_per_kpage;
-} hp_private;
-
-static int __init hp_zx1_ioc_shared(void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	printk(KERN_INFO PFX "HP ZX1 IOC: IOPDIR shared with sba_iommu\n");
-
-	/*
-	 * IOC already configured by sba_iommu module; just use
-	 * its setup.  We assume:
-	 *	- IOVA space is 1Gb in size
-	 *	- first 512Mb is IOMMU, second 512Mb is GART
-	 */
-	hp->io_tlb_ps = readq(hp->ioc_regs+HP_ZX1_TCNFG);
-	switch (hp->io_tlb_ps) {
-		case 0: hp->io_tlb_shift = 12; break;
-		case 1: hp->io_tlb_shift = 13; break;
-		case 2: hp->io_tlb_shift = 14; break;
-		case 3: hp->io_tlb_shift = 16; break;
-		default:
-			printk(KERN_ERR PFX "Invalid IOTLB page size "
-			       "configuration 0x%x\n", hp->io_tlb_ps);
-			hp->gatt = NULL;
-			hp->gatt_entries = 0;
-			return -ENODEV;
-	}
-	hp->io_page_size = 1 << hp->io_tlb_shift;
-	hp->io_pages_per_kpage = PAGE_SIZE / hp->io_page_size;
-
-	hp->iova_base = readq(hp->ioc_regs+HP_ZX1_IBASE) & ~0x1;
-	hp->gart_base = hp->iova_base + HP_ZX1_IOVA_SIZE - HP_ZX1_GART_SIZE;
-
-	hp->gart_size = HP_ZX1_GART_SIZE;
-	hp->gatt_entries = hp->gart_size / hp->io_page_size;
-
-	hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE));
-	hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)];
-
-	if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) {
-		/* Normal case when no AGP device in system */
-		hp->gatt = NULL;
-		hp->gatt_entries = 0;
-		printk(KERN_ERR PFX "No reserved IO PDIR entry found; "
-		       "GART disabled\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-static int __init
-hp_zx1_ioc_owner (void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	printk(KERN_INFO PFX "HP ZX1 IOC: IOPDIR dedicated to GART\n");
-
-	/*
-	 * Select an IOV page size no larger than system page size.
-	 */
-	if (PAGE_SIZE >= KB(64)) {
-		hp->io_tlb_shift = 16;
-		hp->io_tlb_ps = 3;
-	} else if (PAGE_SIZE >= KB(16)) {
-		hp->io_tlb_shift = 14;
-		hp->io_tlb_ps = 2;
-	} else if (PAGE_SIZE >= KB(8)) {
-		hp->io_tlb_shift = 13;
-		hp->io_tlb_ps = 1;
-	} else {
-		hp->io_tlb_shift = 12;
-		hp->io_tlb_ps = 0;
-	}
-	hp->io_page_size = 1 << hp->io_tlb_shift;
-	hp->io_pages_per_kpage = PAGE_SIZE / hp->io_page_size;
-
-	hp->iova_base = HP_ZX1_IOVA_BASE;
-	hp->gart_size = HP_ZX1_GART_SIZE;
-	hp->gart_base = hp->iova_base + HP_ZX1_IOVA_SIZE - hp->gart_size;
-
-	hp->gatt_entries = hp->gart_size / hp->io_page_size;
-	hp->io_pdir_size = (HP_ZX1_IOVA_SIZE / hp->io_page_size) * sizeof(u64);
-
-	return 0;
-}
-
-static int __init
-hp_zx1_ioc_init (u64 hpa)
-{
-	struct _hp_private *hp = &hp_private;
-
-	hp->ioc_regs = ioremap(hpa, 1024);
-	if (!hp->ioc_regs)
-		return -ENOMEM;
-
-	/*
-	 * If the IOTLB is currently disabled, we can take it over.
-	 * Otherwise, we have to share with sba_iommu.
-	 */
-	hp->io_pdir_owner = (readq(hp->ioc_regs+HP_ZX1_IBASE) & 0x1) == 0;
-
-	if (hp->io_pdir_owner)
-		return hp_zx1_ioc_owner();
-
-	return hp_zx1_ioc_shared();
-}
-
-static int
-hp_zx1_lba_find_capability (volatile u8 __iomem *hpa, int cap)
-{
-	u16 status;
-	u8 pos, id;
-	int ttl = 48;
-
-	status = readw(hpa+PCI_STATUS);
-	if (!(status & PCI_STATUS_CAP_LIST))
-		return 0;
-	pos = readb(hpa+PCI_CAPABILITY_LIST);
-	while (ttl-- && pos >= 0x40) {
-		pos &= ~3;
-		id = readb(hpa+pos+PCI_CAP_LIST_ID);
-		if (id == 0xff)
-			break;
-		if (id == cap)
-			return pos;
-		pos = readb(hpa+pos+PCI_CAP_LIST_NEXT);
-	}
-	return 0;
-}
-
-static int __init
-hp_zx1_lba_init (u64 hpa)
-{
-	struct _hp_private *hp = &hp_private;
-	int cap;
-
-	hp->lba_regs = ioremap(hpa, 256);
-	if (!hp->lba_regs)
-		return -ENOMEM;
-
-	hp->lba_cap_offset = hp_zx1_lba_find_capability(hp->lba_regs, PCI_CAP_ID_AGP);
-
-	cap = readl(hp->lba_regs+hp->lba_cap_offset) & 0xff;
-	if (cap != PCI_CAP_ID_AGP) {
-		printk(KERN_ERR PFX "Invalid capability ID 0x%02x at 0x%x\n",
-		       cap, hp->lba_cap_offset);
-		iounmap(hp->lba_regs);
-		return -ENODEV;
-	}
-
-	return 0;
-}
-
-static int
-hp_zx1_fetch_size(void)
-{
-	int size;
-
-	size = hp_private.gart_size / MB(1);
-	hp_zx1_sizes[0].size = size;
-	agp_bridge->current_size = (void *) &hp_zx1_sizes[0];
-	return size;
-}
-
-static int
-hp_zx1_configure (void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	agp_bridge->gart_bus_addr = hp->gart_base;
-	agp_bridge->capndx = hp->lba_cap_offset;
-	agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS);
-
-	if (hp->io_pdir_owner) {
-		writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE);
-		readl(hp->ioc_regs+HP_ZX1_PDIR_BASE);
-		writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG);
-		readl(hp->ioc_regs+HP_ZX1_TCNFG);
-		writel((unsigned int)(~(HP_ZX1_IOVA_SIZE-1)), hp->ioc_regs+HP_ZX1_IMASK);
-		readl(hp->ioc_regs+HP_ZX1_IMASK);
-		writel(hp->iova_base|1, hp->ioc_regs+HP_ZX1_IBASE);
-		readl(hp->ioc_regs+HP_ZX1_IBASE);
-		writel(hp->iova_base|ilog2(HP_ZX1_IOVA_SIZE), hp->ioc_regs+HP_ZX1_PCOM);
-		readl(hp->ioc_regs+HP_ZX1_PCOM);
-	}
-
-	return 0;
-}
-
-static void
-hp_zx1_cleanup (void)
-{
-	struct _hp_private *hp = &hp_private;
-
-	if (hp->ioc_regs) {
-		if (hp->io_pdir_owner) {
-			writeq(0, hp->ioc_regs+HP_ZX1_IBASE);
-			readq(hp->ioc_regs+HP_ZX1_IBASE);
-		}
-		iounmap(hp->ioc_regs);
-	}
-	if (hp->lba_regs)
-		iounmap(hp->lba_regs);
-}
-
-static void
-hp_zx1_tlbflush (struct agp_memory *mem)
-{
-	struct _hp_private *hp = &hp_private;
-
-	writeq(hp->gart_base | ilog2(hp->gart_size), hp->ioc_regs+HP_ZX1_PCOM);
-	readq(hp->ioc_regs+HP_ZX1_PCOM);
-}
-
-static int
-hp_zx1_create_gatt_table (struct agp_bridge_data *bridge)
-{
-	struct _hp_private *hp = &hp_private;
-	int i;
-
-	if (hp->io_pdir_owner) {
-		hp->io_pdir = (u64 *) __get_free_pages(GFP_KERNEL,
-						get_order(hp->io_pdir_size));
-		if (!hp->io_pdir) {
-			printk(KERN_ERR PFX "Couldn't allocate contiguous "
-				"memory for I/O PDIR\n");
-			hp->gatt = NULL;
-			hp->gatt_entries = 0;
-			return -ENOMEM;
-		}
-		memset(hp->io_pdir, 0, hp->io_pdir_size);
-
-		hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)];
-	}
-
-	for (i = 0; i < hp->gatt_entries; i++) {
-		hp->gatt[i] = (unsigned long) agp_bridge->scratch_page;
-	}
-
-	return 0;
-}
-
-static int
-hp_zx1_free_gatt_table (struct agp_bridge_data *bridge)
-{
-	struct _hp_private *hp = &hp_private;
-
-	if (hp->io_pdir_owner)
-		free_pages((unsigned long) hp->io_pdir,
-			    get_order(hp->io_pdir_size));
-	else
-		hp->gatt[0] = HP_ZX1_SBA_IOMMU_COOKIE;
-	return 0;
-}
-
-static int
-hp_zx1_insert_memory (struct agp_memory *mem, off_t pg_start, int type)
-{
-	struct _hp_private *hp = &hp_private;
-	int i, k;
-	off_t j, io_pg_start;
-	int io_pg_count;
-
-	if (type != mem->type ||
-		agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type)) {
-		return -EINVAL;
-	}
-
-	io_pg_start = hp->io_pages_per_kpage * pg_start;
-	io_pg_count = hp->io_pages_per_kpage * mem->page_count;
-	if ((io_pg_start + io_pg_count) > hp->gatt_entries) {
-		return -EINVAL;
-	}
-
-	j = io_pg_start;
-	while (j < (io_pg_start + io_pg_count)) {
-		if (hp->gatt[j]) {
-			return -EBUSY;
-		}
-		j++;
-	}
-
-	if (!mem->is_flushed) {
-		global_cache_flush();
-		mem->is_flushed = true;
-	}
-
-	for (i = 0, j = io_pg_start; i < mem->page_count; i++) {
-		unsigned long paddr;
-
-		paddr = page_to_phys(mem->pages[i]);
-		for (k = 0;
-		     k < hp->io_pages_per_kpage;
-		     k++, j++, paddr += hp->io_page_size) {
-			hp->gatt[j] = HP_ZX1_PDIR_VALID_BIT | paddr;
-		}
-	}
-
-	agp_bridge->driver->tlb_flush(mem);
-	return 0;
-}
-
-static int
-hp_zx1_remove_memory (struct agp_memory *mem, off_t pg_start, int type)
-{
-	struct _hp_private *hp = &hp_private;
-	int i, io_pg_start, io_pg_count;
-
-	if (type != mem->type ||
-		agp_bridge->driver->agp_type_to_mask_type(agp_bridge, type)) {
-		return -EINVAL;
-	}
-
-	io_pg_start = hp->io_pages_per_kpage * pg_start;
-	io_pg_count = hp->io_pages_per_kpage * mem->page_count;
-	for (i = io_pg_start; i < io_pg_count + io_pg_start; i++) {
-		hp->gatt[i] = agp_bridge->scratch_page;
-	}
-
-	agp_bridge->driver->tlb_flush(mem);
-	return 0;
-}
-
-static unsigned long
-hp_zx1_mask_memory (struct agp_bridge_data *bridge, dma_addr_t addr, int type)
-{
-	return HP_ZX1_PDIR_VALID_BIT | addr;
-}
-
-static void
-hp_zx1_enable (struct agp_bridge_data *bridge, u32 mode)
-{
-	struct _hp_private *hp = &hp_private;
-	u32 command;
-
-	command = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS);
-	command = agp_collect_device_status(bridge, mode, command);
-	command |= 0x00000100;
-
-	writel(command, hp->lba_regs+hp->lba_cap_offset+PCI_AGP_COMMAND);
-
-	agp_device_command(command, (mode & AGP8X_MODE) != 0);
-}
-
-const struct agp_bridge_driver hp_zx1_driver = {
-	.owner			= THIS_MODULE,
-	.size_type		= FIXED_APER_SIZE,
-	.configure		= hp_zx1_configure,
-	.fetch_size		= hp_zx1_fetch_size,
-	.cleanup		= hp_zx1_cleanup,
-	.tlb_flush		= hp_zx1_tlbflush,
-	.mask_memory		= hp_zx1_mask_memory,
-	.masks			= hp_zx1_masks,
-	.agp_enable		= hp_zx1_enable,
-	.cache_flush		= global_cache_flush,
-	.create_gatt_table	= hp_zx1_create_gatt_table,
-	.free_gatt_table	= hp_zx1_free_gatt_table,
-	.insert_memory		= hp_zx1_insert_memory,
-	.remove_memory		= hp_zx1_remove_memory,
-	.alloc_by_type		= agp_generic_alloc_by_type,
-	.free_by_type		= agp_generic_free_by_type,
-	.agp_alloc_page		= agp_generic_alloc_page,
-	.agp_alloc_pages	= agp_generic_alloc_pages,
-	.agp_destroy_page	= agp_generic_destroy_page,
-	.agp_destroy_pages	= agp_generic_destroy_pages,
-	.agp_type_to_mask_type  = agp_generic_type_to_mask_type,
-	.cant_use_aperture	= true,
-};
-
-static int __init
-hp_zx1_setup (u64 ioc_hpa, u64 lba_hpa)
-{
-	struct agp_bridge_data *bridge;
-	int error = 0;
-
-	error = hp_zx1_ioc_init(ioc_hpa);
-	if (error)
-		goto fail;
-
-	error = hp_zx1_lba_init(lba_hpa);
-	if (error)
-		goto fail;
-
-	bridge = agp_alloc_bridge();
-	if (!bridge) {
-		error = -ENOMEM;
-		goto fail;
-	}
-	bridge->driver = &hp_zx1_driver;
-
-	fake_bridge_dev.vendor = PCI_VENDOR_ID_HP;
-	fake_bridge_dev.device = PCI_DEVICE_ID_HP_PCIX_LBA;
-	bridge->dev = &fake_bridge_dev;
-
-	error = agp_add_bridge(bridge);
-  fail:
-	if (error)
-		hp_zx1_cleanup();
-	return error;
-}
-
-static acpi_status __init
-zx1_gart_probe (acpi_handle obj, u32 depth, void *context, void **ret)
-{
-	acpi_handle handle, parent;
-	acpi_status status;
-	struct acpi_device_info *info;
-	u64 lba_hpa, sba_hpa, length;
-	int match;
-
-	status = hp_acpi_csr_space(obj, &lba_hpa, &length);
-	if (ACPI_FAILURE(status))
-		return AE_OK; /* keep looking for another bridge */
-
-	/* Look for an enclosing IOC scope and find its CSR space */
-	handle = obj;
-	do {
-		status = acpi_get_object_info(handle, &info);
-		if (ACPI_SUCCESS(status) && (info->valid & ACPI_VALID_HID)) {
-			/* TBD check _CID also */
-			match = (strcmp(info->hardware_id.string, "HWP0001") == 0);
-			kfree(info);
-			if (match) {
-				status = hp_acpi_csr_space(handle, &sba_hpa, &length);
-				if (ACPI_SUCCESS(status))
-					break;
-				else {
-					printk(KERN_ERR PFX "Detected HP ZX1 "
-					       "AGP LBA but no IOC.\n");
-					return AE_OK;
-				}
-			}
-		}
-
-		status = acpi_get_parent(handle, &parent);
-		handle = parent;
-	} while (ACPI_SUCCESS(status));
-
-	if (ACPI_FAILURE(status))
-		return AE_OK;	/* found no enclosing IOC */
-
-	if (hp_zx1_setup(sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa))
-		return AE_OK;
-
-	printk(KERN_INFO PFX "Detected HP ZX1 %s AGP chipset "
-		"(ioc=%llx, lba=%llx)\n", (char *)context,
-		sba_hpa + HP_ZX1_IOC_OFFSET, lba_hpa);
-
-	hp_zx1_gart_found = 1;
-	return AE_CTRL_TERMINATE; /* we only support one bridge; quit looking */
-}
-
-static int __init
-agp_hp_init (void)
-{
-	if (agp_off)
-		return -EINVAL;
-
-	acpi_get_devices("HWP0003", zx1_gart_probe, "HWP0003", NULL);
-	if (hp_zx1_gart_found)
-		return 0;
-
-	acpi_get_devices("HWP0007", zx1_gart_probe, "HWP0007", NULL);
-	if (hp_zx1_gart_found)
-		return 0;
-
-	return -ENODEV;
-}
-
-static void __exit
-agp_hp_cleanup (void)
-{
-}
-
-module_init(agp_hp_init);
-module_exit(agp_hp_cleanup);
-
-MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
deleted file mode 100644
index 15b240ea4848..000000000000
--- a/drivers/char/agp/i460-agp.c
+++ /dev/null
@@ -1,659 +0,0 @@
-/*
- * For documentation on the i460 AGP interface, see Chapter 7 (AGP Subsystem) of
- * the "Intel 460GTX Chipset Software Developer's Manual":
- * http://www.intel.com/design/archives/itanium/downloads/248704.htm 
- */
-/*
- * 460GX support by Chris Ahna <christopher.j.ahna@intel.com>
- * Clean up & simplification by David Mosberger-Tang <davidm@hpl.hp.com>
- */
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/agp_backend.h>
-#include <linux/log2.h>
-
-#include "agp.h"
-
-#define INTEL_I460_BAPBASE		0x98
-#define INTEL_I460_GXBCTL		0xa0
-#define INTEL_I460_AGPSIZ		0xa2
-#define INTEL_I460_ATTBASE		0xfe200000
-#define INTEL_I460_GATT_VALID		(1UL << 24)
-#define INTEL_I460_GATT_COHERENT	(1UL << 25)
-
-/*
- * The i460 can operate with large (4MB) pages, but there is no sane way to support this
- * within the current kernel/DRM environment, so we disable the relevant code for now.
- * See also comments in ia64_alloc_page()...
- */
-#define I460_LARGE_IO_PAGES		0
-
-#if I460_LARGE_IO_PAGES
-# define I460_IO_PAGE_SHIFT		i460.io_page_shift
-#else
-# define I460_IO_PAGE_SHIFT		12
-#endif
-
-#define I460_IOPAGES_PER_KPAGE		(PAGE_SIZE >> I460_IO_PAGE_SHIFT)
-#define I460_KPAGES_PER_IOPAGE		(1 << (I460_IO_PAGE_SHIFT - PAGE_SHIFT))
-#define I460_SRAM_IO_DISABLE		(1 << 4)
-#define I460_BAPBASE_ENABLE		(1 << 3)
-#define I460_AGPSIZ_MASK		0x7
-#define I460_4M_PS			(1 << 1)
-
-/* Control bits for Out-Of-GART coherency and Burst Write Combining */
-#define I460_GXBCTL_OOG		(1UL << 0)
-#define I460_GXBCTL_BWC		(1UL << 2)
-
-/*
- * gatt_table entries are 32-bits wide on the i460; the generic code ought to declare the
- * gatt_table and gatt_table_real pointers a "void *"...
- */
-#define RD_GATT(index)		readl((u32 *) i460.gatt + (index))
-#define WR_GATT(index, val)	writel((val), (u32 *) i460.gatt + (index))
-/*
- * The 460 spec says we have to read the last location written to make sure that all
- * writes have taken effect
- */
-#define WR_FLUSH_GATT(index)	RD_GATT(index)
-
-static unsigned long i460_mask_memory (struct agp_bridge_data *bridge,
-				       dma_addr_t addr, int type);
-
-static struct {
-	void *gatt;				/* ioremap'd GATT area */
-
-	/* i460 supports multiple GART page sizes, so GART pageshift is dynamic: */
-	u8 io_page_shift;
-
-	/* BIOS configures chipset to one of 2 possible apbase values: */
-	u8 dynamic_apbase;
-
-	/* structure for tracking partial use of 4MB GART pages: */
-	struct lp_desc {
-		unsigned long *alloced_map;	/* bitmap of kernel-pages in use */
-		int refcount;			/* number of kernel pages using the large page */
-		u64 paddr;			/* physical address of large page */
-		struct page *page; 		/* page pointer */
-	} *lp_desc;
-} i460;
-
-static const struct aper_size_info_8 i460_sizes[3] =
-{
-	/*
-	 * The 32GB aperture is only available with a 4M GART page size.  Due to the
-	 * dynamic GART page size, we can't figure out page_order or num_entries until
-	 * runtime.
-	 */
-	{32768, 0, 0, 4},
-	{1024, 0, 0, 2},
-	{256, 0, 0, 1}
-};
-
-static struct gatt_mask i460_masks[] =
-{
-	{
-	  .mask = INTEL_I460_GATT_VALID | INTEL_I460_GATT_COHERENT,
-	  .type = 0
-	}
-};
-
-static int i460_fetch_size (void)
-{
-	int i;
-	u8 temp;
-	struct aper_size_info_8 *values;
-
-	/* Determine the GART page size */
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_GXBCTL, &temp);
-	i460.io_page_shift = (temp & I460_4M_PS) ? 22 : 12;
-	pr_debug("i460_fetch_size: io_page_shift=%d\n", i460.io_page_shift);
-
-	if (i460.io_page_shift != I460_IO_PAGE_SHIFT) {
-		printk(KERN_ERR PFX
-			"I/O (GART) page-size %luKB doesn't match expected "
-				"size %luKB\n",
-			1UL << (i460.io_page_shift - 10),
-			1UL << (I460_IO_PAGE_SHIFT));
-		return 0;
-	}
-
-	values = A_SIZE_8(agp_bridge->driver->aperture_sizes);
-
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_AGPSIZ, &temp);
-
-	/* Exit now if the IO drivers for the GART SRAMS are turned off */
-	if (temp & I460_SRAM_IO_DISABLE) {
-		printk(KERN_ERR PFX "GART SRAMS disabled on 460GX chipset\n");
-		printk(KERN_ERR PFX "AGPGART operation not possible\n");
-		return 0;
-	}
-
-	/* Make sure we don't try to create an 2 ^ 23 entry GATT */
-	if ((i460.io_page_shift == 0) && ((temp & I460_AGPSIZ_MASK) == 4)) {
-		printk(KERN_ERR PFX "We can't have a 32GB aperture with 4KB GART pages\n");
-		return 0;
-	}
-
-	/* Determine the proper APBASE register */
-	if (temp & I460_BAPBASE_ENABLE)
-		i460.dynamic_apbase = INTEL_I460_BAPBASE;
-	else
-		i460.dynamic_apbase = AGP_APBASE;
-
-	for (i = 0; i < agp_bridge->driver->num_aperture_sizes; i++) {
-		/*
-		 * Dynamically calculate the proper num_entries and page_order values for
-		 * the define aperture sizes. Take care not to shift off the end of
-		 * values[i].size.
-		 */
-		values[i].num_entries = (values[i].size << 8) >> (I460_IO_PAGE_SHIFT - 12);
-		values[i].page_order = ilog2((sizeof(u32)*values[i].num_entries) >> PAGE_SHIFT);
-	}
-
-	for (i = 0; i < agp_bridge->driver->num_aperture_sizes; i++) {
-		/* Neglect control bits when matching up size_value */
-		if ((temp & I460_AGPSIZ_MASK) == values[i].size_value) {
-			agp_bridge->previous_size = agp_bridge->current_size = (void *) (values + i);
-			agp_bridge->aperture_size_idx = i;
-			return values[i].size;
-		}
-	}
-
-	return 0;
-}
-
-/* There isn't anything to do here since 460 has no GART TLB. */
-static void i460_tlb_flush (struct agp_memory *mem)
-{
-	return;
-}
-
-/*
- * This utility function is needed to prevent corruption of the control bits
- * which are stored along with the aperture size in 460's AGPSIZ register
- */
-static void i460_write_agpsiz (u8 size_value)
-{
-	u8 temp;
-
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_AGPSIZ, &temp);
-	pci_write_config_byte(agp_bridge->dev, INTEL_I460_AGPSIZ,
-			      ((temp & ~I460_AGPSIZ_MASK) | size_value));
-}
-
-static void i460_cleanup (void)
-{
-	struct aper_size_info_8 *previous_size;
-
-	previous_size = A_SIZE_8(agp_bridge->previous_size);
-	i460_write_agpsiz(previous_size->size_value);
-
-	if (I460_IO_PAGE_SHIFT > PAGE_SHIFT)
-		kfree(i460.lp_desc);
-}
-
-static int i460_configure (void)
-{
-	union {
-		u32 small[2];
-		u64 large;
-	} temp;
-	size_t size;
-	u8 scratch;
-	struct aper_size_info_8 *current_size;
-
-	temp.large = 0;
-
-	current_size = A_SIZE_8(agp_bridge->current_size);
-	i460_write_agpsiz(current_size->size_value);
-
-	/*
-	 * Do the necessary rigmarole to read all eight bytes of APBASE.
-	 * This has to be done since the AGP aperture can be above 4GB on
-	 * 460 based systems.
-	 */
-	pci_read_config_dword(agp_bridge->dev, i460.dynamic_apbase, &(temp.small[0]));
-	pci_read_config_dword(agp_bridge->dev, i460.dynamic_apbase + 4, &(temp.small[1]));
-
-	/* Clear BAR control bits */
-	agp_bridge->gart_bus_addr = temp.large & ~((1UL << 3) - 1);
-
-	pci_read_config_byte(agp_bridge->dev, INTEL_I460_GXBCTL, &scratch);
-	pci_write_config_byte(agp_bridge->dev, INTEL_I460_GXBCTL,
-			      (scratch & 0x02) | I460_GXBCTL_OOG | I460_GXBCTL_BWC);
-
-	/*
-	 * Initialize partial allocation trackers if a GART page is bigger than a kernel
-	 * page.
-	 */
-	if (I460_IO_PAGE_SHIFT > PAGE_SHIFT) {
-		size = current_size->num_entries * sizeof(i460.lp_desc[0]);
-		i460.lp_desc = kzalloc(size, GFP_KERNEL);
-		if (!i460.lp_desc)
-			return -ENOMEM;
-	}
-	return 0;
-}
-
-static int i460_create_gatt_table (struct agp_bridge_data *bridge)
-{
-	int page_order, num_entries, i;
-	void *temp;
-
-	/*
-	 * Load up the fixed address of the GART SRAMS which hold our GATT table.
-	 */
-	temp = agp_bridge->current_size;
-	page_order = A_SIZE_8(temp)->page_order;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	i460.gatt = ioremap(INTEL_I460_ATTBASE, PAGE_SIZE << page_order);
-	if (!i460.gatt) {
-		printk(KERN_ERR PFX "ioremap failed\n");
-		return -ENOMEM;
-	}
-
-	/* These are no good, the should be removed from the agp_bridge strucure... */
-	agp_bridge->gatt_table_real = NULL;
-	agp_bridge->gatt_table = NULL;
-	agp_bridge->gatt_bus_addr = 0;
-
-	for (i = 0; i < num_entries; ++i)
-		WR_GATT(i, 0);
-	WR_FLUSH_GATT(i - 1);
-	return 0;
-}
-
-static int i460_free_gatt_table (struct agp_bridge_data *bridge)
-{
-	int num_entries, i;
-	void *temp;
-
-	temp = agp_bridge->current_size;
-
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	for (i = 0; i < num_entries; ++i)
-		WR_GATT(i, 0);
-	WR_FLUSH_GATT(num_entries - 1);
-
-	iounmap(i460.gatt);
-	return 0;
-}
-
-/*
- * The following functions are called when the I/O (GART) page size is smaller than
- * PAGE_SIZE.
- */
-
-static int i460_insert_memory_small_io_page (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	unsigned long paddr, io_pg_start, io_page_size;
-	int i, j, k, num_entries;
-	void *temp;
-
-	pr_debug("i460_insert_memory_small_io_page(mem=%p, pg_start=%ld, type=%d, paddr0=0x%lx)\n",
-		 mem, pg_start, type, page_to_phys(mem->pages[0]));
-
-	if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
-		return -EINVAL;
-
-	io_pg_start = I460_IOPAGES_PER_KPAGE * pg_start;
-
-	temp = agp_bridge->current_size;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	if ((io_pg_start + I460_IOPAGES_PER_KPAGE * mem->page_count) > num_entries) {
-		printk(KERN_ERR PFX "Looks like we're out of AGP memory\n");
-		return -EINVAL;
-	}
-
-	j = io_pg_start;
-	while (j < (io_pg_start + I460_IOPAGES_PER_KPAGE * mem->page_count)) {
-		if (!PGE_EMPTY(agp_bridge, RD_GATT(j))) {
-			pr_debug("i460_insert_memory_small_io_page: GATT[%d]=0x%x is busy\n",
-				 j, RD_GATT(j));
-			return -EBUSY;
-		}
-		j++;
-	}
-
-	io_page_size = 1UL << I460_IO_PAGE_SHIFT;
-	for (i = 0, j = io_pg_start; i < mem->page_count; i++) {
-		paddr = page_to_phys(mem->pages[i]);
-		for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size)
-			WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type));
-	}
-	WR_FLUSH_GATT(j - 1);
-	return 0;
-}
-
-static int i460_remove_memory_small_io_page(struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	int i;
-
-	pr_debug("i460_remove_memory_small_io_page(mem=%p, pg_start=%ld, type=%d)\n",
-		 mem, pg_start, type);
-
-	pg_start = I460_IOPAGES_PER_KPAGE * pg_start;
-
-	for (i = pg_start; i < (pg_start + I460_IOPAGES_PER_KPAGE * mem->page_count); i++)
-		WR_GATT(i, 0);
-	WR_FLUSH_GATT(i - 1);
-	return 0;
-}
-
-#if I460_LARGE_IO_PAGES
-
-/*
- * These functions are called when the I/O (GART) page size exceeds PAGE_SIZE.
- *
- * This situation is interesting since AGP memory allocations that are smaller than a
- * single GART page are possible.  The i460.lp_desc array tracks partial allocation of the
- * large GART pages to work around this issue.
- *
- * i460.lp_desc[pg_num].refcount tracks the number of kernel pages in use within GART page
- * pg_num.  i460.lp_desc[pg_num].paddr is the physical address of the large page and
- * i460.lp_desc[pg_num].alloced_map is a bitmap of kernel pages that are in use (allocated).
- */
-
-static int i460_alloc_large_page (struct lp_desc *lp)
-{
-	unsigned long order = I460_IO_PAGE_SHIFT - PAGE_SHIFT;
-	size_t map_size;
-
-	lp->page = alloc_pages(GFP_KERNEL, order);
-	if (!lp->page) {
-		printk(KERN_ERR PFX "Couldn't alloc 4M GART page...\n");
-		return -ENOMEM;
-	}
-
-	map_size = ((I460_KPAGES_PER_IOPAGE + BITS_PER_LONG - 1) & -BITS_PER_LONG)/8;
-	lp->alloced_map = kzalloc(map_size, GFP_KERNEL);
-	if (!lp->alloced_map) {
-		__free_pages(lp->page, order);
-		printk(KERN_ERR PFX "Out of memory, we're in trouble...\n");
-		return -ENOMEM;
-	}
-
-	lp->paddr = page_to_phys(lp->page);
-	lp->refcount = 0;
-	atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp);
-	return 0;
-}
-
-static void i460_free_large_page (struct lp_desc *lp)
-{
-	kfree(lp->alloced_map);
-	lp->alloced_map = NULL;
-
-	__free_pages(lp->page, I460_IO_PAGE_SHIFT - PAGE_SHIFT);
-	atomic_sub(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp);
-}
-
-static int i460_insert_memory_large_io_page (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	int i, start_offset, end_offset, idx, pg, num_entries;
-	struct lp_desc *start, *end, *lp;
-	void *temp;
-
-	if (type >= AGP_USER_TYPES || mem->type >= AGP_USER_TYPES)
-		return -EINVAL;
-
-	temp = agp_bridge->current_size;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	/* Figure out what pg_start means in terms of our large GART pages */
-	start = &i460.lp_desc[pg_start / I460_KPAGES_PER_IOPAGE];
-	end = &i460.lp_desc[(pg_start + mem->page_count - 1) / I460_KPAGES_PER_IOPAGE];
-	start_offset = pg_start % I460_KPAGES_PER_IOPAGE;
-	end_offset = (pg_start + mem->page_count - 1) % I460_KPAGES_PER_IOPAGE;
-
-	if (end > i460.lp_desc + num_entries) {
-		printk(KERN_ERR PFX "Looks like we're out of AGP memory\n");
-		return -EINVAL;
-	}
-
-	/* Check if the requested region of the aperture is free */
-	for (lp = start; lp <= end; ++lp) {
-		if (!lp->alloced_map)
-			continue;	/* OK, the entire large page is available... */
-
-		for (idx = ((lp == start) ? start_offset : 0);
-		     idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE);
-		     idx++)
-		{
-			if (test_bit(idx, lp->alloced_map))
-				return -EBUSY;
-		}
-	}
-
-	for (lp = start, i = 0; lp <= end; ++lp) {
-		if (!lp->alloced_map) {
-			/* Allocate new GART pages... */
-			if (i460_alloc_large_page(lp) < 0)
-				return -ENOMEM;
-			pg = lp - i460.lp_desc;
-			WR_GATT(pg, i460_mask_memory(agp_bridge,
-						     lp->paddr, 0));
-			WR_FLUSH_GATT(pg);
-		}
-
-		for (idx = ((lp == start) ? start_offset : 0);
-		     idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE);
-		     idx++, i++)
-		{
-			mem->pages[i] = lp->page;
-			__set_bit(idx, lp->alloced_map);
-			++lp->refcount;
-		}
-	}
-	return 0;
-}
-
-static int i460_remove_memory_large_io_page (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	int i, pg, start_offset, end_offset, idx, num_entries;
-	struct lp_desc *start, *end, *lp;
-	void *temp;
-
-	temp = agp_bridge->current_size;
-	num_entries = A_SIZE_8(temp)->num_entries;
-
-	/* Figure out what pg_start means in terms of our large GART pages */
-	start = &i460.lp_desc[pg_start / I460_KPAGES_PER_IOPAGE];
-	end = &i460.lp_desc[(pg_start + mem->page_count - 1) / I460_KPAGES_PER_IOPAGE];
-	start_offset = pg_start % I460_KPAGES_PER_IOPAGE;
-	end_offset = (pg_start + mem->page_count - 1) % I460_KPAGES_PER_IOPAGE;
-
-	for (i = 0, lp = start; lp <= end; ++lp) {
-		for (idx = ((lp == start) ? start_offset : 0);
-		     idx < ((lp == end) ? (end_offset + 1) : I460_KPAGES_PER_IOPAGE);
-		     idx++, i++)
-		{
-			mem->pages[i] = NULL;
-			__clear_bit(idx, lp->alloced_map);
-			--lp->refcount;
-		}
-
-		/* Free GART pages if they are unused */
-		if (lp->refcount == 0) {
-			pg = lp - i460.lp_desc;
-			WR_GATT(pg, 0);
-			WR_FLUSH_GATT(pg);
-			i460_free_large_page(lp);
-		}
-	}
-	return 0;
-}
-
-/* Wrapper routines to call the approriate {small_io_page,large_io_page} function */
-
-static int i460_insert_memory (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT)
-		return i460_insert_memory_small_io_page(mem, pg_start, type);
-	else
-		return i460_insert_memory_large_io_page(mem, pg_start, type);
-}
-
-static int i460_remove_memory (struct agp_memory *mem,
-				off_t pg_start, int type)
-{
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT)
-		return i460_remove_memory_small_io_page(mem, pg_start, type);
-	else
-		return i460_remove_memory_large_io_page(mem, pg_start, type);
-}
-
-/*
- * If the I/O (GART) page size is bigger than the kernel page size, we don't want to
- * allocate memory until we know where it is to be bound in the aperture (a
- * multi-kernel-page alloc might fit inside of an already allocated GART page).
- *
- * Let's just hope nobody counts on the allocated AGP memory being there before bind time
- * (I don't think current drivers do)...
- */
-static struct page *i460_alloc_page (struct agp_bridge_data *bridge)
-{
-	void *page;
-
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
-		page = agp_generic_alloc_page(agp_bridge);
-	} else
-		/* Returning NULL would cause problems */
-		/* AK: really dubious code. */
-		page = (void *)~0UL;
-	return page;
-}
-
-static void i460_destroy_page (struct page *page, int flags)
-{
-	if (I460_IO_PAGE_SHIFT <= PAGE_SHIFT) {
-		agp_generic_destroy_page(page, flags);
-	}
-}
-
-#endif /* I460_LARGE_IO_PAGES */
-
-static unsigned long i460_mask_memory (struct agp_bridge_data *bridge,
-				       dma_addr_t addr, int type)
-{
-	/* Make sure the returned address is a valid GATT entry */
-	return bridge->driver->masks[0].mask
-		| (((addr & ~((1 << I460_IO_PAGE_SHIFT) - 1)) & 0xfffff000) >> 12);
-}
-
-const struct agp_bridge_driver intel_i460_driver = {
-	.owner			= THIS_MODULE,
-	.aperture_sizes		= i460_sizes,
-	.size_type		= U8_APER_SIZE,
-	.num_aperture_sizes	= 3,
-	.configure		= i460_configure,
-	.fetch_size		= i460_fetch_size,
-	.cleanup		= i460_cleanup,
-	.tlb_flush		= i460_tlb_flush,
-	.mask_memory		= i460_mask_memory,
-	.masks			= i460_masks,
-	.agp_enable		= agp_generic_enable,
-	.cache_flush		= global_cache_flush,
-	.create_gatt_table	= i460_create_gatt_table,
-	.free_gatt_table	= i460_free_gatt_table,
-#if I460_LARGE_IO_PAGES
-	.insert_memory		= i460_insert_memory,
-	.remove_memory		= i460_remove_memory,
-	.agp_alloc_page		= i460_alloc_page,
-	.agp_destroy_page	= i460_destroy_page,
-#else
-	.insert_memory		= i460_insert_memory_small_io_page,
-	.remove_memory		= i460_remove_memory_small_io_page,
-	.agp_alloc_page		= agp_generic_alloc_page,
-	.agp_alloc_pages	= agp_generic_alloc_pages,
-	.agp_destroy_page	= agp_generic_destroy_page,
-	.agp_destroy_pages	= agp_generic_destroy_pages,
-#endif
-	.alloc_by_type		= agp_generic_alloc_by_type,
-	.free_by_type		= agp_generic_free_by_type,
-	.agp_type_to_mask_type  = agp_generic_type_to_mask_type,
-	.cant_use_aperture	= true,
-};
-
-static int agp_intel_i460_probe(struct pci_dev *pdev,
-				const struct pci_device_id *ent)
-{
-	struct agp_bridge_data *bridge;
-	u8 cap_ptr;
-
-	cap_ptr = pci_find_capability(pdev, PCI_CAP_ID_AGP);
-	if (!cap_ptr)
-		return -ENODEV;
-
-	bridge = agp_alloc_bridge();
-	if (!bridge)
-		return -ENOMEM;
-
-	bridge->driver = &intel_i460_driver;
-	bridge->dev = pdev;
-	bridge->capndx = cap_ptr;
-
-	printk(KERN_INFO PFX "Detected Intel 460GX chipset\n");
-
-	pci_set_drvdata(pdev, bridge);
-	return agp_add_bridge(bridge);
-}
-
-static void agp_intel_i460_remove(struct pci_dev *pdev)
-{
-	struct agp_bridge_data *bridge = pci_get_drvdata(pdev);
-
-	agp_remove_bridge(bridge);
-	agp_put_bridge(bridge);
-}
-
-static struct pci_device_id agp_intel_i460_pci_table[] = {
-	{
-	.class		= (PCI_CLASS_BRIDGE_HOST << 8),
-	.class_mask	= ~0,
-	.vendor		= PCI_VENDOR_ID_INTEL,
-	.device		= PCI_DEVICE_ID_INTEL_84460GX,
-	.subvendor	= PCI_ANY_ID,
-	.subdevice	= PCI_ANY_ID,
-	},
-	{ }
-};
-
-MODULE_DEVICE_TABLE(pci, agp_intel_i460_pci_table);
-
-static struct pci_driver agp_intel_i460_pci_driver = {
-	.name		= "agpgart-intel-i460",
-	.id_table	= agp_intel_i460_pci_table,
-	.probe		= agp_intel_i460_probe,
-	.remove		= agp_intel_i460_remove,
-};
-
-static int __init agp_intel_i460_init(void)
-{
-	if (agp_off)
-		return -EINVAL;
-	return pci_register_driver(&agp_intel_i460_pci_driver);
-}
-
-static void __exit agp_intel_i460_cleanup(void)
-{
-	pci_unregister_driver(&agp_intel_i460_pci_driver);
-}
-
-module_init(agp_intel_i460_init);
-module_exit(agp_intel_i460_cleanup);
-
-MODULE_AUTHOR("Chris Ahna <Christopher.J.Ahna@intel.com>");
-MODULE_LICENSE("GPL and additional rights");
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index ee71376f174b..3b2159416e62 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -64,25 +64,6 @@
 static DEFINE_MUTEX(hpet_mutex); /* replaces BKL */
 static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ;
 
-/* This clocksource driver currently only works on ia64 */
-#ifdef CONFIG_IA64
-static void __iomem *hpet_mctr;
-
-static u64 read_hpet(struct clocksource *cs)
-{
-	return (u64)read_counter((void __iomem *)hpet_mctr);
-}
-
-static struct clocksource clocksource_hpet = {
-	.name		= "hpet",
-	.rating		= 250,
-	.read		= read_hpet,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-static struct clocksource *hpet_clocksource;
-#endif
-
 /* A lock for concurrent access by app and isr hpet activity. */
 static DEFINE_SPINLOCK(hpet_lock);
 
@@ -907,17 +888,6 @@ int hpet_alloc(struct hpet_data *hdp)
 
 	hpetp->hp_delta = hpet_calibrate(hpetp);
 
-/* This clocksource driver currently only works on ia64 */
-#ifdef CONFIG_IA64
-	if (!hpet_clocksource) {
-		hpet_mctr = (void __iomem *)&hpetp->hp_hpet->hpet_mc;
-		clocksource_hpet.archdata.fsys_mmio = hpet_mctr;
-		clocksource_register_hz(&clocksource_hpet, hpetp->hp_tick_freq);
-		hpetp->hp_clocksource = &clocksource_hpet;
-		hpet_clocksource = &clocksource_hpet;
-	}
-#endif
-
 	return 0;
 }
 
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index 8de74dcfa18c..442c40efb200 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -37,7 +37,7 @@ config HW_RANDOM_TIMERIOMEM
 
 config HW_RANDOM_INTEL
 	tristate "Intel HW Random Number Generator support"
-	depends on (X86 || IA64 || COMPILE_TEST) && PCI
+	depends on (X86 || COMPILE_TEST) && PCI
 	default HW_RANDOM
 	help
 	  This driver provides kernel-side support for the Random Number
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 1052b0f2d4cf..8d27aa6b5b50 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -31,10 +31,6 @@
 #include <linux/uaccess.h>
 #include <linux/security.h>
 
-#ifdef CONFIG_IA64
-# include <linux/efi.h>
-#endif
-
 #define DEVMEM_MINOR	1
 #define DEVPORT_MINOR	4
 
@@ -277,13 +273,6 @@ int __weak phys_mem_access_prot_allowed(struct file *file,
 #ifdef pgprot_noncached
 static int uncached_access(struct file *file, phys_addr_t addr)
 {
-#if defined(CONFIG_IA64)
-	/*
-	 * On ia64, we ignore O_DSYNC because we cannot tolerate memory
-	 * attribute aliases.
-	 */
-	return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
-#else
 	/*
 	 * Accessing memory above the top the kernel knows about or through a
 	 * file pointer
@@ -292,7 +281,6 @@ static int uncached_access(struct file *file, phys_addr_t addr)
 	if (file->f_flags & O_DSYNC)
 		return 1;
 	return addr >= __pa(high_memory);
-#endif
 }
 #endif
 
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
deleted file mode 100644
index b35f651837c8..000000000000
--- a/drivers/char/mspec.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2001-2006 Silicon Graphics, Inc.  All rights
- * reserved.
- */
-
-/*
- * SN Platform Special Memory (mspec) Support
- *
- * This driver exports the SN special memory (mspec) facility to user
- * processes.
- * There are two types of memory made available thru this driver:
- * uncached and cached.
- *
- * Uncached are used for memory write combining feature of the ia64
- * cpu.
- *
- * Cached are used for areas of memory that are used as cached addresses
- * on our partition and used as uncached addresses from other partitions.
- * Due to a design constraint of the SN2 Shub, you can not have processors
- * on the same FSB perform both a cached and uncached reference to the
- * same cache line.  These special memory cached regions prevent the
- * kernel from ever dropping in a TLB entry and therefore prevent the
- * processor from ever speculating a cache line from this page.
- */
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/vmalloc.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/numa.h>
-#include <linux/refcount.h>
-#include <asm/page.h>
-#include <linux/atomic.h>
-#include <asm/tlbflush.h>
-#include <asm/uncached.h>
-
-
-#define CACHED_ID	"Cached,"
-#define UNCACHED_ID	"Uncached"
-#define REVISION	"4.0"
-#define MSPEC_BASENAME	"mspec"
-
-/*
- * Page types allocated by the device.
- */
-enum mspec_page_type {
-	MSPEC_CACHED = 2,
-	MSPEC_UNCACHED
-};
-
-/*
- * One of these structures is allocated when an mspec region is mmaped. The
- * structure is pointed to by the vma->vm_private_data field in the vma struct.
- * This structure is used to record the addresses of the mspec pages.
- * This structure is shared by all vma's that are split off from the
- * original vma when split_vma()'s are done.
- *
- * The refcnt is incremented atomically because mm->mmap_lock does not
- * protect in fork case where multiple tasks share the vma_data.
- */
-struct vma_data {
-	refcount_t refcnt;	/* Number of vmas sharing the data. */
-	spinlock_t lock;	/* Serialize access to this structure. */
-	int count;		/* Number of pages allocated. */
-	enum mspec_page_type type; /* Type of pages allocated. */
-	unsigned long vm_start;	/* Original (unsplit) base. */
-	unsigned long vm_end;	/* Original (unsplit) end. */
-	unsigned long maddr[];	/* Array of MSPEC addresses. */
-};
-
-/*
- * mspec_open
- *
- * Called when a device mapping is created by a means other than mmap
- * (via fork, munmap, etc.).  Increments the reference count on the
- * underlying mspec data so it is not freed prematurely.
- */
-static void
-mspec_open(struct vm_area_struct *vma)
-{
-	struct vma_data *vdata;
-
-	vdata = vma->vm_private_data;
-	refcount_inc(&vdata->refcnt);
-}
-
-/*
- * mspec_close
- *
- * Called when unmapping a device mapping. Frees all mspec pages
- * belonging to all the vma's sharing this vma_data structure.
- */
-static void
-mspec_close(struct vm_area_struct *vma)
-{
-	struct vma_data *vdata;
-	int index, last_index;
-	unsigned long my_page;
-
-	vdata = vma->vm_private_data;
-
-	if (!refcount_dec_and_test(&vdata->refcnt))
-		return;
-
-	last_index = (vdata->vm_end - vdata->vm_start) >> PAGE_SHIFT;
-	for (index = 0; index < last_index; index++) {
-		if (vdata->maddr[index] == 0)
-			continue;
-		/*
-		 * Clear the page before sticking it back
-		 * into the pool.
-		 */
-		my_page = vdata->maddr[index];
-		vdata->maddr[index] = 0;
-		memset((char *)my_page, 0, PAGE_SIZE);
-		uncached_free_page(my_page, 1);
-	}
-
-	kvfree(vdata);
-}
-
-/*
- * mspec_fault
- *
- * Creates a mspec page and maps it to user space.
- */
-static vm_fault_t
-mspec_fault(struct vm_fault *vmf)
-{
-	unsigned long paddr, maddr;
-	unsigned long pfn;
-	pgoff_t index = vmf->pgoff;
-	struct vma_data *vdata = vmf->vma->vm_private_data;
-
-	maddr = (volatile unsigned long) vdata->maddr[index];
-	if (maddr == 0) {
-		maddr = uncached_alloc_page(numa_node_id(), 1);
-		if (maddr == 0)
-			return VM_FAULT_OOM;
-
-		spin_lock(&vdata->lock);
-		if (vdata->maddr[index] == 0) {
-			vdata->count++;
-			vdata->maddr[index] = maddr;
-		} else {
-			uncached_free_page(maddr, 1);
-			maddr = vdata->maddr[index];
-		}
-		spin_unlock(&vdata->lock);
-	}
-
-	paddr = maddr & ~__IA64_UNCACHED_OFFSET;
-	pfn = paddr >> PAGE_SHIFT;
-
-	return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
-}
-
-static const struct vm_operations_struct mspec_vm_ops = {
-	.open = mspec_open,
-	.close = mspec_close,
-	.fault = mspec_fault,
-};
-
-/*
- * mspec_mmap
- *
- * Called when mmapping the device.  Initializes the vma with a fault handler
- * and private data structure necessary to allocate, track, and free the
- * underlying pages.
- */
-static int
-mspec_mmap(struct file *file, struct vm_area_struct *vma,
-					enum mspec_page_type type)
-{
-	struct vma_data *vdata;
-	int pages, vdata_size;
-
-	if (vma->vm_pgoff != 0)
-		return -EINVAL;
-
-	if ((vma->vm_flags & VM_SHARED) == 0)
-		return -EINVAL;
-
-	if ((vma->vm_flags & VM_WRITE) == 0)
-		return -EPERM;
-
-	pages = vma_pages(vma);
-	vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
-	vdata = kvzalloc(vdata_size, GFP_KERNEL);
-	if (!vdata)
-		return -ENOMEM;
-
-	vdata->vm_start = vma->vm_start;
-	vdata->vm_end = vma->vm_end;
-	vdata->type = type;
-	spin_lock_init(&vdata->lock);
-	refcount_set(&vdata->refcnt, 1);
-	vma->vm_private_data = vdata;
-
-	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
-	if (vdata->type == MSPEC_UNCACHED)
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	vma->vm_ops = &mspec_vm_ops;
-
-	return 0;
-}
-
-static int
-cached_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	return mspec_mmap(file, vma, MSPEC_CACHED);
-}
-
-static int
-uncached_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	return mspec_mmap(file, vma, MSPEC_UNCACHED);
-}
-
-static const struct file_operations cached_fops = {
-	.owner = THIS_MODULE,
-	.mmap = cached_mmap,
-	.llseek = noop_llseek,
-};
-
-static struct miscdevice cached_miscdev = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = "mspec_cached",
-	.fops = &cached_fops
-};
-
-static const struct file_operations uncached_fops = {
-	.owner = THIS_MODULE,
-	.mmap = uncached_mmap,
-	.llseek = noop_llseek,
-};
-
-static struct miscdevice uncached_miscdev = {
-	.minor = MISC_DYNAMIC_MINOR,
-	.name = "mspec_uncached",
-	.fops = &uncached_fops
-};
-
-/*
- * mspec_init
- *
- * Called at boot time to initialize the mspec facility.
- */
-static int __init
-mspec_init(void)
-{
-	int ret;
-
-	ret = misc_register(&cached_miscdev);
-	if (ret) {
-		printk(KERN_ERR "%s: failed to register device %i\n",
-		       CACHED_ID, ret);
-		return ret;
-	}
-	ret = misc_register(&uncached_miscdev);
-	if (ret) {
-		printk(KERN_ERR "%s: failed to register device %i\n",
-		       UNCACHED_ID, ret);
-		misc_deregister(&cached_miscdev);
-		return ret;
-	}
-
-	printk(KERN_INFO "%s %s initialized devices: %s %s\n",
-	       MSPEC_BASENAME, REVISION, CACHED_ID, UNCACHED_ID);
-
-	return 0;
-}
-
-static void __exit
-mspec_exit(void)
-{
-	misc_deregister(&uncached_miscdev);
-	misc_deregister(&cached_miscdev);
-}
-
-module_init(mspec_init);
-module_exit(mspec_exit);
-
-MODULE_AUTHOR("Silicon Graphics, Inc. <linux-altix@sgi.com>");
-MODULE_DESCRIPTION("Driver for SGI SN special memory operations");
-MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index f429b9b37b76..35efb53d5492 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -239,17 +239,6 @@ if PPC32 || PPC64
 source "drivers/cpufreq/Kconfig.powerpc"
 endif
 
-if IA64
-config IA64_ACPI_CPUFREQ
-	tristate "ACPI Processor P-States driver"
-	depends on ACPI_PROCESSOR
-	help
-	This driver adds a CPUFreq driver which utilizes the ACPI
-	Processor Performance States.
-
-	If in doubt, say N.
-endif
-
 if MIPS
 config BMIPS_CPUFREQ
 	tristate "BMIPS CPUfreq Driver"
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index ef8510774913..8d141c71b016 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -102,7 +102,6 @@ obj-$(CONFIG_POWERNV_CPUFREQ)		+= powernv-cpufreq.o
 ##################################################################################
 # Other platform drivers
 obj-$(CONFIG_BMIPS_CPUFREQ)		+= bmips-cpufreq.o
-obj-$(CONFIG_IA64_ACPI_CPUFREQ)		+= ia64-acpi-cpufreq.o
 obj-$(CONFIG_LOONGSON2_CPUFREQ)		+= loongson2_cpufreq.o
 obj-$(CONFIG_SH_CPU_FREQ)		+= sh-cpufreq.o
 obj-$(CONFIG_SPARC_US2E_CPUFREQ)	+= sparc-us2e-cpufreq.o
diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c
deleted file mode 100644
index c6bdc455517f..000000000000
--- a/drivers/cpufreq/ia64-acpi-cpufreq.c
+++ /dev/null
@@ -1,353 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * This file provides the ACPI based P-state support. This
- * module works with generic cpufreq infrastructure. Most of
- * the code is based on i386 version
- * (arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c)
- *
- * Copyright (C) 2005 Intel Corp
- *      Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/proc_fs.h>
-#include <asm/io.h>
-#include <linux/uaccess.h>
-#include <asm/pal.h>
-
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-
-MODULE_AUTHOR("Venkatesh Pallipadi");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-struct cpufreq_acpi_io {
-	struct acpi_processor_performance	acpi_data;
-	unsigned int				resume;
-};
-
-struct cpufreq_acpi_req {
-	unsigned int		cpu;
-	unsigned int		state;
-};
-
-static struct cpufreq_acpi_io	*acpi_io_data[NR_CPUS];
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-
-static int
-processor_set_pstate (
-	u32	value)
-{
-	s64 retval;
-
-	pr_debug("processor_set_pstate\n");
-
-	retval = ia64_pal_set_pstate((u64)value);
-
-	if (retval) {
-		pr_debug("Failed to set freq to 0x%x, with error 0x%llx\n",
-		        value, retval);
-		return -ENODEV;
-	}
-	return (int)retval;
-}
-
-
-static int
-processor_get_pstate (
-	u32	*value)
-{
-	u64	pstate_index = 0;
-	s64 	retval;
-
-	pr_debug("processor_get_pstate\n");
-
-	retval = ia64_pal_get_pstate(&pstate_index,
-	                             PAL_GET_PSTATE_TYPE_INSTANT);
-	*value = (u32) pstate_index;
-
-	if (retval)
-		pr_debug("Failed to get current freq with "
-			"error 0x%llx, idx 0x%x\n", retval, *value);
-
-	return (int)retval;
-}
-
-
-/* To be used only after data->acpi_data is initialized */
-static unsigned
-extract_clock (
-	struct cpufreq_acpi_io *data,
-	unsigned value)
-{
-	unsigned long i;
-
-	pr_debug("extract_clock\n");
-
-	for (i = 0; i < data->acpi_data.state_count; i++) {
-		if (value == data->acpi_data.states[i].status)
-			return data->acpi_data.states[i].core_frequency;
-	}
-	return data->acpi_data.states[i-1].core_frequency;
-}
-
-
-static long
-processor_get_freq (
-	void *arg)
-{
-	struct cpufreq_acpi_req *req = arg;
-	unsigned int		cpu = req->cpu;
-	struct cpufreq_acpi_io	*data = acpi_io_data[cpu];
-	u32			value;
-	int			ret;
-
-	pr_debug("processor_get_freq\n");
-	if (smp_processor_id() != cpu)
-		return -EAGAIN;
-
-	/* processor_get_pstate gets the instantaneous frequency */
-	ret = processor_get_pstate(&value);
-	if (ret) {
-		pr_warn("get performance failed with error %d\n", ret);
-		return ret;
-	}
-	return 1000 * extract_clock(data, value);
-}
-
-
-static long
-processor_set_freq (
-	void *arg)
-{
-	struct cpufreq_acpi_req *req = arg;
-	unsigned int		cpu = req->cpu;
-	struct cpufreq_acpi_io	*data = acpi_io_data[cpu];
-	int			ret, state = req->state;
-	u32			value;
-
-	pr_debug("processor_set_freq\n");
-	if (smp_processor_id() != cpu)
-		return -EAGAIN;
-
-	if (state == data->acpi_data.state) {
-		if (unlikely(data->resume)) {
-			pr_debug("Called after resume, resetting to P%d\n", state);
-			data->resume = 0;
-		} else {
-			pr_debug("Already at target state (P%d)\n", state);
-			return 0;
-		}
-	}
-
-	pr_debug("Transitioning from P%d to P%d\n",
-		data->acpi_data.state, state);
-
-	/*
-	 * First we write the target state's 'control' value to the
-	 * control_register.
-	 */
-	value = (u32) data->acpi_data.states[state].control;
-
-	pr_debug("Transitioning to state: 0x%08x\n", value);
-
-	ret = processor_set_pstate(value);
-	if (ret) {
-		pr_warn("Transition failed with error %d\n", ret);
-		return -ENODEV;
-	}
-
-	data->acpi_data.state = state;
-	return 0;
-}
-
-
-static unsigned int
-acpi_cpufreq_get (
-	unsigned int		cpu)
-{
-	struct cpufreq_acpi_req req;
-	long ret;
-
-	req.cpu = cpu;
-	ret = work_on_cpu(cpu, processor_get_freq, &req);
-
-	return ret > 0 ? (unsigned int) ret : 0;
-}
-
-
-static int
-acpi_cpufreq_target (
-	struct cpufreq_policy   *policy,
-	unsigned int index)
-{
-	struct cpufreq_acpi_req req;
-
-	req.cpu = policy->cpu;
-	req.state = index;
-
-	return work_on_cpu(req.cpu, processor_set_freq, &req);
-}
-
-static int
-acpi_cpufreq_cpu_init (
-	struct cpufreq_policy   *policy)
-{
-	unsigned int		i;
-	unsigned int		cpu = policy->cpu;
-	struct cpufreq_acpi_io	*data;
-	unsigned int		result = 0;
-	struct cpufreq_frequency_table *freq_table;
-
-	pr_debug("acpi_cpufreq_cpu_init\n");
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return (-ENOMEM);
-
-	acpi_io_data[cpu] = data;
-
-	result = acpi_processor_register_performance(&data->acpi_data, cpu);
-
-	if (result)
-		goto err_free;
-
-	/* capability check */
-	if (data->acpi_data.state_count <= 1) {
-		pr_debug("No P-States\n");
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	if ((data->acpi_data.control_register.space_id !=
-					ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-	    (data->acpi_data.status_register.space_id !=
-					ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-		pr_debug("Unsupported address space [%d, %d]\n",
-			(u32) (data->acpi_data.control_register.space_id),
-			(u32) (data->acpi_data.status_register.space_id));
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	/* alloc freq_table */
-	freq_table = kcalloc(data->acpi_data.state_count + 1,
-	                           sizeof(*freq_table),
-	                           GFP_KERNEL);
-	if (!freq_table) {
-		result = -ENOMEM;
-		goto err_unreg;
-	}
-
-	/* detect transition latency */
-	policy->cpuinfo.transition_latency = 0;
-	for (i=0; i<data->acpi_data.state_count; i++) {
-		if ((data->acpi_data.states[i].transition_latency * 1000) >
-		    policy->cpuinfo.transition_latency) {
-			policy->cpuinfo.transition_latency =
-			    data->acpi_data.states[i].transition_latency * 1000;
-		}
-	}
-
-	/* table init */
-	for (i = 0; i <= data->acpi_data.state_count; i++)
-	{
-		if (i < data->acpi_data.state_count) {
-			freq_table[i].frequency =
-			      data->acpi_data.states[i].core_frequency * 1000;
-		} else {
-			freq_table[i].frequency = CPUFREQ_TABLE_END;
-		}
-	}
-
-	policy->freq_table = freq_table;
-
-	/* notify BIOS that we exist */
-	acpi_processor_notify_smm(THIS_MODULE);
-
-	pr_info("CPU%u - ACPI performance management activated\n", cpu);
-
-	for (i = 0; i < data->acpi_data.state_count; i++)
-		pr_debug("     %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n",
-			(i == data->acpi_data.state?'*':' '), i,
-			(u32) data->acpi_data.states[i].core_frequency,
-			(u32) data->acpi_data.states[i].power,
-			(u32) data->acpi_data.states[i].transition_latency,
-			(u32) data->acpi_data.states[i].bus_master_latency,
-			(u32) data->acpi_data.states[i].status,
-			(u32) data->acpi_data.states[i].control);
-
-	/* the first call to ->target() should result in us actually
-	 * writing something to the appropriate registers. */
-	data->resume = 1;
-
-	return (result);
-
- err_unreg:
-	acpi_processor_unregister_performance(cpu);
- err_free:
-	kfree(data);
-	acpi_io_data[cpu] = NULL;
-
-	return (result);
-}
-
-
-static int
-acpi_cpufreq_cpu_exit (
-	struct cpufreq_policy   *policy)
-{
-	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
-
-	pr_debug("acpi_cpufreq_cpu_exit\n");
-
-	if (data) {
-		acpi_io_data[policy->cpu] = NULL;
-		acpi_processor_unregister_performance(policy->cpu);
-		kfree(policy->freq_table);
-		kfree(data);
-	}
-
-	return (0);
-}
-
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
-	.verify 	= cpufreq_generic_frequency_table_verify,
-	.target_index	= acpi_cpufreq_target,
-	.get 		= acpi_cpufreq_get,
-	.init		= acpi_cpufreq_cpu_init,
-	.exit		= acpi_cpufreq_cpu_exit,
-	.name		= "acpi-cpufreq",
-	.attr		= cpufreq_generic_attr,
-};
-
-
-static int __init
-acpi_cpufreq_init (void)
-{
-	pr_debug("acpi_cpufreq_init\n");
-
- 	return cpufreq_register_driver(&acpi_cpufreq_driver);
-}
-
-
-static void __exit
-acpi_cpufreq_exit (void)
-{
-	pr_debug("acpi_cpufreq_exit\n");
-
-	cpufreq_unregister_driver(&acpi_cpufreq_driver);
-}
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index b59e3041fd62..a79579fea6f0 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -77,30 +77,6 @@ config FIRMWARE_MEMMAP
 
       See also Documentation/ABI/testing/sysfs-firmware-memmap.
 
-config EFI_PCDP
-	bool "Console device selection via EFI PCDP or HCDP table"
-	depends on ACPI && EFI && IA64
-	default y if IA64
-	help
-	  If your firmware supplies the PCDP table, and you want to
-	  automatically use the primary console device it describes
-	  as the Linux console, say Y here.
-
-	  If your firmware supplies the HCDP table, and you want to
-	  use the first serial port it describes as the Linux console,
-	  say Y here.  If your EFI ConOut path contains only a UART
-	  device, it will become the console automatically.  Otherwise,
-	  you must specify the "console=hcdp" kernel boot argument.
-
-	  Neither the PCDP nor the HCDP affects naming of serial devices,
-	  so a serial console may be /dev/ttyS0, /dev/ttyS1, etc, depending
-	  on how the driver discovers devices.
-
-	  You must also enable the appropriate drivers (serial, VGA, etc.)
-
-	  See DIG64_HCDPv20_042804.pdf available from
-	  <http://www.dig64.org/specifications/> 
-
 config DMIID
     bool "Export DMI identification via sysfs to userspace"
     depends on DMI
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 28fcddcd688f..1d1eb671d805 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_ARM_SDE_INTERFACE)	+= arm_sdei.o
 obj-$(CONFIG_DMI)		+= dmi_scan.o
 obj-$(CONFIG_DMI_SYSFS)		+= dmi-sysfs.o
 obj-$(CONFIG_EDD)		+= edd.o
-obj-$(CONFIG_EFI_PCDP)		+= pcdp.o
 obj-$(CONFIG_DMIID)		+= dmi-id.o
 obj-$(CONFIG_INTEL_STRATIX10_SERVICE) += stratix10-svc.o
 obj-$(CONFIG_INTEL_STRATIX10_RSU)     += stratix10-rsu.o
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 231f1c70d1db..cb374b2da9b7 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -4,7 +4,7 @@ menu "EFI (Extensible Firmware Interface) Support"
 
 config EFI_ESRT
 	bool
-	depends on EFI && !IA64
+	depends on EFI
 	default y
 
 config EFI_VARS_PSTORE
@@ -123,7 +123,7 @@ config EFI_BOOTLOADER_CONTROL
 
 config EFI_CAPSULE_LOADER
 	tristate "EFI capsule loader"
-	depends on EFI && !IA64
+	depends on EFI
 	help
 	  This option exposes a loader interface "/dev/efi_capsule_loader" for
 	  users to load EFI capsules. This driver requires working runtime
@@ -224,7 +224,7 @@ config EFI_DISABLE_PCI_DMA
 
 config EFI_EARLYCON
 	def_bool y
-	depends on SERIAL_EARLYCON && !ARM && !IA64
+	depends on SERIAL_EARLYCON && !ARM
 	select FONT_SUPPORT
 	select ARCH_USE_MEMREMAP_PROT
 
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 1599f1176842..47ca652f02f6 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -147,7 +147,7 @@ static ssize_t systab_show(struct kobject *kobj,
 	if (efi.smbios != EFI_INVALID_TABLE_ADDR)
 		str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
 
-	if (IS_ENABLED(CONFIG_IA64) || IS_ENABLED(CONFIG_X86))
+	if (IS_ENABLED(CONFIG_X86))
 		str = efi_systab_show_arch(str);
 
 	return str - buf;
@@ -777,7 +777,6 @@ int __init efi_systab_check_header(const efi_table_hdr_t *systab_hdr)
 	return 0;
 }
 
-#ifndef CONFIG_IA64
 static const efi_char16_t *__init map_fw_vendor(unsigned long fw_vendor,
 						size_t size)
 {
@@ -793,10 +792,6 @@ static void __init unmap_fw_vendor(const void *fw_vendor, size_t size)
 {
 	early_memunmap((void *)fw_vendor, size);
 }
-#else
-#define map_fw_vendor(p, s)	__va(p)
-#define unmap_fw_vendor(v, s)
-#endif
 
 void __init efi_systab_report_header(const efi_table_hdr_t *systab_hdr,
 				     unsigned long fw_vendor)
@@ -899,11 +894,6 @@ char * __init efi_md_typeattr_format(char *buf, size_t size,
 	return buf;
 }
 
-/*
- * IA64 has a funky EFI memory map that doesn't work the same way as
- * other architectures.
- */
-#ifndef CONFIG_IA64
 /*
  * efi_mem_attributes - lookup memmap attributes for physical address
  * @phys_addr: the physical address to lookup
@@ -951,7 +941,6 @@ int efi_mem_type(unsigned long phys_addr)
 	}
 	return -EINVAL;
 }
-#endif
 
 int efi_status_to_err(efi_status_t status)
 {
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
deleted file mode 100644
index 715a45442d1c..000000000000
--- a/drivers/firmware/pcdp.c
+++ /dev/null
@@ -1,135 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Parse the EFI PCDP table to locate the console device.
- *
- * (c) Copyright 2002, 2003, 2004 Hewlett-Packard Development Company, L.P.
- *	Khalid Aziz <khalid.aziz@hp.com>
- *	Alex Williamson <alex.williamson@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#include <linux/acpi.h>
-#include <linux/console.h>
-#include <linux/efi.h>
-#include <linux/serial.h>
-#include <linux/serial_core.h>
-#include <asm/vga.h>
-#include "pcdp.h"
-
-static int __init
-setup_serial_console(struct pcdp_uart *uart)
-{
-#ifdef CONFIG_SERIAL_8250_CONSOLE
-	int mmio;
-	static char options[64], *p = options;
-	char parity;
-
-	mmio = (uart->addr.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY);
-	p += sprintf(p, "uart8250,%s,0x%llx",
-		mmio ? "mmio" : "io", uart->addr.address);
-	if (uart->baud) {
-		p += sprintf(p, ",%llu", uart->baud);
-		if (uart->bits) {
-			switch (uart->parity) {
-			    case 0x2: parity = 'e'; break;
-			    case 0x3: parity = 'o'; break;
-			    default:  parity = 'n';
-			}
-			p += sprintf(p, "%c%d", parity, uart->bits);
-		}
-	}
-
-	add_preferred_console("uart", 8250, &options[9]);
-	return setup_earlycon(options);
-#else
-	return -ENODEV;
-#endif
-}
-
-static int __init
-setup_vga_console(struct pcdp_device *dev)
-{
-#if defined(CONFIG_VT) && defined(CONFIG_VGA_CONSOLE)
-	u8 *if_ptr;
-
-	if_ptr = ((u8 *)dev + sizeof(struct pcdp_device));
-	if (if_ptr[0] == PCDP_IF_PCI) {
-		struct pcdp_if_pci if_pci;
-
-		/* struct copy since ifptr might not be correctly aligned */
-
-		memcpy(&if_pci, if_ptr, sizeof(if_pci));
-
-		if (if_pci.trans & PCDP_PCI_TRANS_IOPORT)
-			vga_console_iobase = if_pci.ioport_tra;
-
-		if (if_pci.trans & PCDP_PCI_TRANS_MMIO)
-			vga_console_membase = if_pci.mmio_tra;
-	}
-
-	if (efi_mem_type(vga_console_membase + 0xA0000) == EFI_CONVENTIONAL_MEMORY) {
-		printk(KERN_ERR "PCDP: VGA selected, but frame buffer is not MMIO!\n");
-		return -ENODEV;
-	}
-
-	conswitchp = &vga_con;
-	printk(KERN_INFO "PCDP: VGA console\n");
-	return 0;
-#else
-	return -ENODEV;
-#endif
-}
-
-extern unsigned long hcdp_phys;
-
-int __init
-efi_setup_pcdp_console(char *cmdline)
-{
-	struct pcdp *pcdp;
-	struct pcdp_uart *uart;
-	struct pcdp_device *dev, *end;
-	int i, serial = 0;
-	int rc = -ENODEV;
-
-	if (hcdp_phys == EFI_INVALID_TABLE_ADDR)
-		return -ENODEV;
-
-	pcdp = early_memremap(hcdp_phys, 4096);
-	printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, hcdp_phys);
-
-	if (strstr(cmdline, "console=hcdp")) {
-		if (pcdp->rev < 3)
-			serial = 1;
-	} else if (strstr(cmdline, "console=")) {
-		printk(KERN_INFO "Explicit \"console=\"; ignoring PCDP\n");
-		goto out;
-	}
-
-	if (pcdp->rev < 3 && efi_uart_console_only())
-		serial = 1;
-
-	for (i = 0, uart = pcdp->uart; i < pcdp->num_uarts; i++, uart++) {
-		if (uart->flags & PCDP_UART_PRIMARY_CONSOLE || serial) {
-			if (uart->type == PCDP_CONSOLE_UART) {
-				rc = setup_serial_console(uart);
-				goto out;
-			}
-		}
-	}
-
-	end = (struct pcdp_device *) ((u8 *) pcdp + pcdp->length);
-	for (dev = (struct pcdp_device *) (pcdp->uart + pcdp->num_uarts);
-	     dev < end;
-	     dev = (struct pcdp_device *) ((u8 *) dev + dev->length)) {
-		if (dev->flags & PCDP_PRIMARY_CONSOLE) {
-			if (dev->type == PCDP_CONSOLE_VGA) {
-				rc = setup_vga_console(dev);
-				goto out;
-			}
-		}
-	}
-
-out:
-	early_memunmap(pcdp, 4096);
-	return rc;
-}
diff --git a/drivers/firmware/pcdp.h b/drivers/firmware/pcdp.h
deleted file mode 100644
index e02540571c52..000000000000
--- a/drivers/firmware/pcdp.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Definitions for PCDP-defined console devices
- *
- * For DIG64_HCDPv10a_01.pdf and DIG64_PCDPv20.pdf (v1.0a and v2.0 resp.),
- * please see <http://www.dig64.org/specifications/>
- *
- * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
- *	Khalid Aziz <khalid.aziz@hp.com>
- *	Bjorn Helgaas <bjorn.helgaas@hp.com>
- */
-
-#define PCDP_CONSOLE			0
-#define PCDP_DEBUG			1
-#define PCDP_CONSOLE_OUTPUT		2
-#define PCDP_CONSOLE_INPUT		3
-
-#define PCDP_UART			(0 << 3)
-#define PCDP_VGA			(1 << 3)
-#define PCDP_USB			(2 << 3)
-
-/* pcdp_uart.type and pcdp_device.type */
-#define PCDP_CONSOLE_UART		(PCDP_UART | PCDP_CONSOLE)
-#define PCDP_DEBUG_UART			(PCDP_UART | PCDP_DEBUG)
-#define PCDP_CONSOLE_VGA		(PCDP_VGA  | PCDP_CONSOLE_OUTPUT)
-#define PCDP_CONSOLE_USB		(PCDP_USB  | PCDP_CONSOLE_INPUT)
-
-/* pcdp_uart.flags */
-#define PCDP_UART_EDGE_SENSITIVE	(1 << 0)
-#define PCDP_UART_ACTIVE_LOW		(1 << 1)
-#define PCDP_UART_PRIMARY_CONSOLE	(1 << 2)
-#define PCDP_UART_IRQ			(1 << 6) /* in pci_func for rev < 3 */
-#define PCDP_UART_PCI			(1 << 7) /* in pci_func for rev < 3 */
-
-struct pcdp_uart {
-	u8				type;
-	u8				bits;
-	u8				parity;
-	u8				stop_bits;
-	u8				pci_seg;
-	u8				pci_bus;
-	u8				pci_dev;
-	u8				pci_func;
-	u64				baud;
-	struct acpi_generic_address	addr;
-	u16				pci_dev_id;
-	u16				pci_vendor_id;
-	u32				gsi;
-	u32				clock_rate;
-	u8				pci_prog_intfc;
-	u8				flags;
-	u16				conout_index;
-	u32				reserved;
-} __attribute__((packed));
-
-#define PCDP_IF_PCI	1
-
-/* pcdp_if_pci.trans */
-#define PCDP_PCI_TRANS_IOPORT	0x02
-#define PCDP_PCI_TRANS_MMIO	0x01
-
-struct pcdp_if_pci {
-	u8			interconnect;
-	u8			reserved;
-	u16			length;
-	u8			segment;
-	u8			bus;
-	u8			dev;
-	u8			fun;
-	u16			dev_id;
-	u16			vendor_id;
-	u32			acpi_interrupt;
-	u64			mmio_tra;
-	u64			ioport_tra;
-	u8			flags;
-	u8			trans;
-} __attribute__((packed));
-
-struct pcdp_vga {
-	u8			count;		/* address space descriptors */
-} __attribute__((packed));
-
-/* pcdp_device.flags */
-#define PCDP_PRIMARY_CONSOLE	1
-
-struct pcdp_device {
-	u8			type;
-	u8			flags;
-	u16			length;
-	u16			efi_index;
-	/* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
-	/* next data is device specific type (currently only pcdp_vga) */
-} __attribute__((packed));
-
-struct pcdp {
-	u8			signature[4];
-	u32			length;
-	u8			rev;		/* PCDP v2.0 is rev 3 */
-	u8			chksum;
-	u8			oemid[6];
-	u8			oem_tabid[8];
-	u32			oem_rev;
-	u8			creator_id[4];
-	u32			creator_rev;
-	u32			num_uarts;
-	struct pcdp_uart	uart[];	/* actual size is num_uarts */
-	/* remainder of table is pcdp_device structures */
-} __attribute__((packed));
diff --git a/drivers/gpu/drm/drm_ioc32.c b/drivers/gpu/drm/drm_ioc32.c
index 49a743f62b4a..025dc558c94e 100644
--- a/drivers/gpu/drm/drm_ioc32.c
+++ b/drivers/gpu/drm/drm_ioc32.c
@@ -945,11 +945,11 @@ static struct {
 	DRM_IOCTL32_DEF(DRM_IOCTL_SG_ALLOC, compat_drm_sg_alloc),
 	DRM_IOCTL32_DEF(DRM_IOCTL_SG_FREE, compat_drm_sg_free),
 #endif
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86)
 	DRM_IOCTL32_DEF(DRM_IOCTL_UPDATE_DRAW, compat_drm_update_draw),
 #endif
 	DRM_IOCTL32_DEF(DRM_IOCTL_WAIT_VBLANK, compat_drm_wait_vblank),
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86)
 	DRM_IOCTL32_DEF(DRM_IOCTL_MODE_ADDFB2, compat_drm_mode_addfb2),
 #endif
 };
diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h
index adb5173372d3..5f61672d55b7 100644
--- a/drivers/input/serio/i8042.h
+++ b/drivers/input/serio/i8042.h
@@ -19,7 +19,7 @@
 #include "i8042-snirm.h"
 #elif defined(CONFIG_SPARC)
 #include "i8042-sparcio.h"
-#elif defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH)
+#elif defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
 #include "i8042-acpipnpio.h"
 #else
 #include "i8042-io.h"
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 2b12b583ef4b..7f04491ca5f0 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -91,7 +91,7 @@ config IOMMU_DEBUGFS
 choice
 	prompt "IOMMU default domain type"
 	depends on IOMMU_API
-	default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
+	default IOMMU_DEFAULT_DMA_LAZY if X86
 	default IOMMU_DEFAULT_DMA_STRICT
 	help
 	  Choose the type of IOMMU domain used to manage DMA API usage by
@@ -146,7 +146,7 @@ config OF_IOMMU
 
 # IOMMU-agnostic DMA-mapping layer
 config IOMMU_DMA
-	def_bool ARM64 || IA64 || X86
+	def_bool ARM64 || X86
 	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 2e56bd79f589..119d2c57a48e 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -11,7 +11,7 @@ config DMAR_DEBUG
 
 config INTEL_IOMMU
 	bool "Support for Intel IOMMU using DMA Remapping Devices"
-	depends on PCI_MSI && ACPI && (X86 || IA64)
+	depends on PCI_MSI && ACPI && X86
 	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
diff --git a/drivers/media/cec/platform/Kconfig b/drivers/media/cec/platform/Kconfig
index b672d3142eb7..ede81fe331b0 100644
--- a/drivers/media/cec/platform/Kconfig
+++ b/drivers/media/cec/platform/Kconfig
@@ -99,7 +99,7 @@ config CEC_TEGRA
 
 config CEC_SECO
 	tristate "SECO Boards HDMI CEC driver"
-	depends on (X86 || IA64) || COMPILE_TEST
+	depends on X86 || COMPILE_TEST
 	depends on PCI && DMI
 	select CEC_CORE
 	select CEC_NOTIFIER
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index cadd4a820c03..f37c4b8380ae 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -166,7 +166,7 @@ config ENCLOSURE_SERVICES
 config SGI_XP
 	tristate "Support communication between SGI SSIs"
 	depends on NET
-	depends on (IA64_SGI_UV || X86_UV) && SMP
+	depends on X86_UV && SMP
 	depends on X86_64 || BROKEN
 	select SGI_GRU if X86_64 && SMP
 	help
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
index 3ad76cd18b4b..6ae045037219 100644
--- a/drivers/misc/sgi-gru/gru.h
+++ b/drivers/misc/sgi-gru/gru.h
@@ -30,9 +30,7 @@
 /*
  * Size used to map GRU GSeg
  */
-#if defined(CONFIG_IA64)
-#define GRU_GSEG_PAGESIZE	(256 * 1024UL)
-#elif defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
 #define GRU_GSEG_PAGESIZE	(256 * 1024UL)		/* ZZZ 2MB ??? */
 #else
 #error "Unsupported architecture"
diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h
index 04d5170ac149..da5eb9edf9ec 100644
--- a/drivers/misc/sgi-gru/gru_instructions.h
+++ b/drivers/misc/sgi-gru/gru_instructions.h
@@ -29,17 +29,7 @@ extern void gru_wait_abort_proc(void *cb);
  * Architecture dependent functions
  */
 
-#if defined(CONFIG_IA64)
-#include <linux/compiler.h>
-#include <asm/intrinsics.h>
-#define __flush_cache(p)		ia64_fc((unsigned long)p)
-/* Use volatile on IA64 to ensure ordering via st4.rel */
-#define gru_ordered_store_ulong(p, v)					\
-		do {							\
-			barrier();					\
-			*((volatile unsigned long *)(p)) = v; /* force st.rel */	\
-		} while (0)
-#elif defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
 #include <asm/cacheflush.h>
 #define __flush_cache(p)		clflush(p)
 #define gru_ordered_store_ulong(p, v)					\
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index a3d659c11cc4..e755690c9805 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -337,72 +337,6 @@ static unsigned long gru_chiplet_cpu_to_mmr(int chiplet, int cpu, int *corep)
 	return mmr;
 }
 
-#ifdef CONFIG_IA64
-
-static int gru_irq_count[GRU_CHIPLETS_PER_BLADE];
-
-static void gru_noop(struct irq_data *d)
-{
-}
-
-static struct irq_chip gru_chip[GRU_CHIPLETS_PER_BLADE] = {
-	[0 ... GRU_CHIPLETS_PER_BLADE - 1] {
-		.irq_mask	= gru_noop,
-		.irq_unmask	= gru_noop,
-		.irq_ack	= gru_noop
-	}
-};
-
-static int gru_chiplet_setup_tlb_irq(int chiplet, char *irq_name,
-			irq_handler_t irq_handler, int cpu, int blade)
-{
-	unsigned long mmr;
-	int irq = IRQ_GRU + chiplet;
-	int ret, core;
-
-	mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
-	if (mmr == 0)
-		return 0;
-
-	if (gru_irq_count[chiplet] == 0) {
-		gru_chip[chiplet].name = irq_name;
-		ret = irq_set_chip(irq, &gru_chip[chiplet]);
-		if (ret) {
-			printk(KERN_ERR "%s: set_irq_chip failed, errno=%d\n",
-			       GRU_DRIVER_ID_STR, -ret);
-			return ret;
-		}
-
-		ret = request_irq(irq, irq_handler, 0, irq_name, NULL);
-		if (ret) {
-			printk(KERN_ERR "%s: request_irq failed, errno=%d\n",
-			       GRU_DRIVER_ID_STR, -ret);
-			return ret;
-		}
-	}
-	gru_irq_count[chiplet]++;
-
-	return 0;
-}
-
-static void gru_chiplet_teardown_tlb_irq(int chiplet, int cpu, int blade)
-{
-	unsigned long mmr;
-	int core, irq = IRQ_GRU + chiplet;
-
-	if (gru_irq_count[chiplet] == 0)
-		return;
-
-	mmr = gru_chiplet_cpu_to_mmr(chiplet, cpu, &core);
-	if (mmr == 0)
-		return;
-
-	if (--gru_irq_count[chiplet] == 0)
-		free_irq(irq, NULL);
-}
-
-#elif defined CONFIG_X86_64
-
 static int gru_chiplet_setup_tlb_irq(int chiplet, char *irq_name,
 			irq_handler_t irq_handler, int cpu, int blade)
 {
@@ -447,8 +381,6 @@ static void gru_chiplet_teardown_tlb_irq(int chiplet, int cpu, int blade)
 	}
 }
 
-#endif
-
 static void gru_teardown_tlb_irqs(void)
 {
 	int blade;
@@ -514,12 +446,8 @@ static int __init gru_init(void)
 	if (!gru_supported())
 		return 0;
 
-#if defined CONFIG_IA64
-	gru_start_paddr = 0xd000000000UL; /* ZZZZZZZZZZZZZZZZZZZ fixme */
-#else
 	gru_start_paddr = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG) &
 				0x7fffffffffffUL;
-#endif
 	gru_start_vaddr = __va(gru_start_paddr);
 	gru_end_paddr = gru_start_paddr + GRU_MAX_BLADES * GRU_SIZE;
 	printk(KERN_INFO "GRU space: 0x%lx - 0x%lx\n",
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 1d75d5e540bc..695316a83b01 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -11,16 +11,10 @@
 #include "grutables.h"
 
 /* 10 sec */
-#ifdef CONFIG_IA64
-#include <asm/processor.h>
-#define GRU_OPERATION_TIMEOUT	(((cycles_t) local_cpu_data->itc_freq)*10)
-#define CLKS2NSEC(c)		((c) *1000000000 / local_cpu_data->itc_freq)
-#else
 #include <linux/sync_core.h>
 #include <asm/tsc.h>
 #define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
 #define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
-#endif
 
 /* Extract the status field from a kernel handle */
 #define GET_MSEG_HANDLE_STATUS(h)	(((*(unsigned long *)(h)) >> 16) & 3)
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 4eb4b9455139..0f5b09e290c8 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -41,16 +41,12 @@ struct device *grudev = &gru_device;
  */
 int gru_cpu_fault_map_id(void)
 {
-#ifdef CONFIG_IA64
-	return uv_blade_processor_id() % GRU_NUM_TFM;
-#else
 	int cpu = smp_processor_id();
 	int id, core;
 
 	core = uv_cpu_core_number(cpu);
 	id = core + UV_MAX_INT_CORES * uv_cpu_socket_number(cpu);
 	return id;
-#endif
 }
 
 /*--------- ASID Management -------------------------------------------
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
index f1336f43d3bd..3185711beb07 100644
--- a/drivers/misc/sgi-xp/xp.h
+++ b/drivers/misc/sgi-xp/xp.h
@@ -16,7 +16,7 @@
 
 #include <linux/mutex.h>
 
-#if defined CONFIG_X86_UV || defined CONFIG_IA64_SGI_UV
+#if defined CONFIG_X86_UV
 #include <asm/uv/uv.h>
 #endif
 
diff --git a/drivers/misc/sgi-xp/xp_uv.c b/drivers/misc/sgi-xp/xp_uv.c
index 19fc7076af27..3faa7eadf679 100644
--- a/drivers/misc/sgi-xp/xp_uv.c
+++ b/drivers/misc/sgi-xp/xp_uv.c
@@ -18,8 +18,6 @@
 #include <asm/uv/uv_hub.h>
 #if defined CONFIG_X86_64
 #include <asm/uv/bios.h>
-#elif defined CONFIG_IA64_SGI_UV
-#include <asm/sn/sn_sal.h>
 #endif
 #include "../sgi-gru/grukservices.h"
 #include "xp.h"
@@ -99,17 +97,6 @@ xp_expand_memprotect_uv(unsigned long phys_addr, unsigned long size)
 			"UV_MEMPROT_ALLOW_RW) failed, ret=%d\n", ret);
 		return xpBiosError;
 	}
-
-#elif defined CONFIG_IA64_SGI_UV
-	u64 nasid_array;
-
-	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_1,
-				   &nasid_array);
-	if (ret != 0) {
-		dev_err(xp, "sn_change_memprotect(,, "
-			"SN_MEMPROT_ACCESS_CLASS_1,) failed ret=%d\n", ret);
-		return xpSalError;
-	}
 #else
 	#error not a supported configuration
 #endif
@@ -129,17 +116,6 @@ xp_restrict_memprotect_uv(unsigned long phys_addr, unsigned long size)
 			"UV_MEMPROT_RESTRICT_ACCESS) failed, ret=%d\n", ret);
 		return xpBiosError;
 	}
-
-#elif defined CONFIG_IA64_SGI_UV
-	u64 nasid_array;
-
-	ret = sn_change_memprotect(phys_addr, size, SN_MEMPROT_ACCESS_CLASS_0,
-				   &nasid_array);
-	if (ret != 0) {
-		dev_err(xp, "sn_change_memprotect(,, "
-			"SN_MEMPROT_ACCESS_CLASS_0,) failed ret=%d\n", ret);
-		return xpSalError;
-	}
 #else
 	#error not a supported configuration
 #endif
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 6da509d692bb..cc71395782b6 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -1155,36 +1155,6 @@ xpc_die_deactivate(void)
 static int
 xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
 {
-#ifdef CONFIG_IA64		/* !!! temporary kludge */
-	switch (event) {
-	case DIE_MACHINE_RESTART:
-	case DIE_MACHINE_HALT:
-		xpc_die_deactivate();
-		break;
-
-	case DIE_KDEBUG_ENTER:
-		/* Should lack of heartbeat be ignored by other partitions? */
-		if (!xpc_kdebug_ignore)
-			break;
-
-		fallthrough;
-	case DIE_MCA_MONARCH_ENTER:
-	case DIE_INIT_MONARCH_ENTER:
-		xpc_arch_ops.offline_heartbeat();
-		break;
-
-	case DIE_KDEBUG_LEAVE:
-		/* Is lack of heartbeat being ignored by other partitions? */
-		if (!xpc_kdebug_ignore)
-			break;
-
-		fallthrough;
-	case DIE_MCA_MONARCH_LEAVE:
-	case DIE_INIT_MONARCH_LEAVE:
-		xpc_arch_ops.online_heartbeat();
-		break;
-	}
-#else
 	struct die_args *die_args = _die_args;
 
 	switch (event) {
@@ -1206,7 +1176,6 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
 	default:
 		xpc_die_deactivate();
 	}
-#endif
 
 	return NOTIFY_DONE;
 }
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index fff522d347e3..2f03a7080d96 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -24,34 +24,12 @@
 #include <linux/slab.h>
 #include <linux/numa.h>
 #include <asm/uv/uv_hub.h>
-#if defined CONFIG_X86_64
 #include <asm/uv/bios.h>
 #include <asm/uv/uv_irq.h>
-#elif defined CONFIG_IA64_SGI_UV
-#include <asm/sn/intr.h>
-#include <asm/sn/sn_sal.h>
-#endif
 #include "../sgi-gru/gru.h"
 #include "../sgi-gru/grukservices.h"
 #include "xpc.h"
 
-#if defined CONFIG_IA64_SGI_UV
-struct uv_IO_APIC_route_entry {
-	__u64	vector		:  8,
-		delivery_mode	:  3,
-		dest_mode	:  1,
-		delivery_status	:  1,
-		polarity	:  1,
-		__reserved_1	:  1,
-		trigger		:  1,
-		mask		:  1,
-		__reserved_2	: 15,
-		dest		: 32;
-};
-
-#define sn_partition_id 0
-#endif
-
 static struct xpc_heartbeat_uv *xpc_heartbeat_uv;
 
 #define XPC_ACTIVATE_MSG_SIZE_UV	(1 * GRU_CACHE_LINE_BYTES)
@@ -113,7 +91,6 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
 {
 	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
 
-#if defined CONFIG_X86_64
 	mq->irq = uv_setup_irq(irq_name, cpu, mq->mmr_blade, mq->mmr_offset,
 			UV_AFFINITY_CPU);
 	if (mq->irq < 0)
@@ -121,40 +98,13 @@ xpc_get_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq, int cpu, char *irq_name)
 
 	mq->mmr_value = uv_read_global_mmr64(mmr_pnode, mq->mmr_offset);
 
-#elif defined CONFIG_IA64_SGI_UV
-	if (strcmp(irq_name, XPC_ACTIVATE_IRQ_NAME) == 0)
-		mq->irq = SGI_XPC_ACTIVATE;
-	else if (strcmp(irq_name, XPC_NOTIFY_IRQ_NAME) == 0)
-		mq->irq = SGI_XPC_NOTIFY;
-	else
-		return -EINVAL;
-
-	mq->mmr_value = (unsigned long)cpu_physical_id(cpu) << 32 | mq->irq;
-	uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mq->mmr_value);
-#else
-	#error not a supported configuration
-#endif
-
 	return 0;
 }
 
 static void
 xpc_release_gru_mq_irq_uv(struct xpc_gru_mq_uv *mq)
 {
-#if defined CONFIG_X86_64
 	uv_teardown_irq(mq->irq);
-
-#elif defined CONFIG_IA64_SGI_UV
-	int mmr_pnode;
-	unsigned long mmr_value;
-
-	mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
-	mmr_value = 1UL << 16;
-
-	uv_write_global_mmr64(mmr_pnode, mq->mmr_offset, mmr_value);
-#else
-	#error not a supported configuration
-#endif
 }
 
 static int
@@ -162,17 +112,6 @@ xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq)
 {
 	int ret;
 
-#if defined CONFIG_IA64_SGI_UV
-	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
-
-	ret = sn_mq_watchlist_alloc(mmr_pnode, (void *)uv_gpa(mq->address),
-				    mq->order, &mq->mmr_offset);
-	if (ret < 0) {
-		dev_err(xpc_part, "sn_mq_watchlist_alloc() failed, ret=%d\n",
-			ret);
-		return -EBUSY;
-	}
-#elif defined CONFIG_X86_64
 	ret = uv_bios_mq_watchlist_alloc(uv_gpa(mq->address),
 					 mq->order, &mq->mmr_offset);
 	if (ret < 0) {
@@ -180,9 +119,6 @@ xpc_gru_mq_watchlist_alloc_uv(struct xpc_gru_mq_uv *mq)
 			"ret=%d\n", ret);
 		return ret;
 	}
-#else
-	#error not a supported configuration
-#endif
 
 	mq->watchlist_num = ret;
 	return 0;
@@ -194,15 +130,8 @@ xpc_gru_mq_watchlist_free_uv(struct xpc_gru_mq_uv *mq)
 	int ret;
 	int mmr_pnode = uv_blade_to_pnode(mq->mmr_blade);
 
-#if defined CONFIG_X86_64
 	ret = uv_bios_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
 	BUG_ON(ret != BIOS_STATUS_SUCCESS);
-#elif defined CONFIG_IA64_SGI_UV
-	ret = sn_mq_watchlist_free(mmr_pnode, mq->watchlist_num);
-	BUG_ON(ret != SALRET_OK);
-#else
-	#error not a supported configuration
-#endif
 }
 
 static struct xpc_gru_mq_uv *
@@ -786,7 +715,6 @@ xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
 	s64 status;
 	enum xp_retval ret;
 
-#if defined CONFIG_X86_64
 	status = uv_bios_reserved_page_pa((u64)buf, cookie, (u64 *)rp_pa,
 					  (u64 *)len);
 	if (status == BIOS_STATUS_SUCCESS)
@@ -796,19 +724,6 @@ xpc_get_partition_rsvd_page_pa_uv(void *buf, u64 *cookie, unsigned long *rp_pa,
 	else
 		ret = xpBiosError;
 
-#elif defined CONFIG_IA64_SGI_UV
-	status = sn_partition_reserved_page_pa((u64)buf, cookie, rp_pa, len);
-	if (status == SALRET_OK)
-		ret = xpSuccess;
-	else if (status == SALRET_MORE_PASSES)
-		ret = xpNeedMoreInfo;
-	else
-		ret = xpSalError;
-
-#else
-	#error not a supported configuration
-#endif
-
 	return ret;
 }
 
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 14b311196b8f..c7c1ff38ea33 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -17005,7 +17005,7 @@ static u32 tg3_calc_dma_bndry(struct tg3 *tp, u32 val)
 	    !tg3_flag(tp, PCI_EXPRESS))
 		goto out;
 
-#if defined(CONFIG_PPC64) || defined(CONFIG_IA64) || defined(CONFIG_PARISC)
+#if defined(CONFIG_PPC64) || defined(CONFIG_PARISC)
 	goal = BOUNDARY_MULTI_CACHELINE;
 #else
 #if defined(CONFIG_SPARC64) || defined(CONFIG_ALPHA)
diff --git a/drivers/net/ethernet/brocade/bna/bnad.h b/drivers/net/ethernet/brocade/bna/bnad.h
index 627a93ce38ab..10b1e534030e 100644
--- a/drivers/net/ethernet/brocade/bna/bnad.h
+++ b/drivers/net/ethernet/brocade/bna/bnad.h
@@ -19,7 +19,6 @@
 #include <linux/firmware.h>
 #include <linux/if_vlan.h>
 
-/* Fix for IA64 */
 #include <asm/checksum.h>
 #include <net/ip6_checksum.h>
 
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 1d1e183d3a8b..ed24d6af7487 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -233,9 +233,7 @@ static int nx_set_dma_mask(struct netxen_adapter *adapter)
 	cmask = DMA_BIT_MASK(32);
 
 	if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) {
-#ifndef CONFIG_IA64
 		mask = DMA_BIT_MASK(35);
-#endif
 	} else {
 		mask = DMA_BIT_MASK(39);
 		cmask = mask;
diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
index 5e6b1eb54c64..a771b2259f21 100644
--- a/drivers/pci/vgaarb.c
+++ b/drivers/pci/vgaarb.c
@@ -556,7 +556,7 @@ EXPORT_SYMBOL(vga_put);
 
 static bool vga_is_firmware_default(struct pci_dev *pdev)
 {
-#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#if defined(CONFIG_X86)
 	u64 base = screen_info.lfb_base;
 	u64 size = screen_info.lfb_size;
 	struct resource *r;
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index ee17cf5c44c6..c12978311a09 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -222,7 +222,7 @@ config SERIAL_8250_EXTENDED
 
 config SERIAL_8250_MANY_PORTS
 	bool "Support more than 4 legacy serial ports"
-	depends on SERIAL_8250_EXTENDED && !IA64
+	depends on SERIAL_8250_EXTENDED
 	help
 	  Say Y here if you have dumb serial boards other than the four
 	  standard COM 1/2/3/4 ports. This may happen if you have an AST
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index 358f216c6cd6..1fe6107b539b 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -1273,7 +1273,7 @@ static void kbd_bh(struct tasklet_struct *unused)
 	}
 }
 
-#if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) ||\
+#if defined(CONFIG_X86) || defined(CONFIG_ALPHA) ||\
     defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\
     defined(CONFIG_PARISC) || defined(CONFIG_SUPERH) ||\
     (defined(CONFIG_ARM) && defined(CONFIG_KEYBOARD_ATKBD) && !defined(CONFIG_ARCH_RPC))
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index eac0ba39581e..747053f529a5 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -463,7 +463,7 @@ config FB_VESA
 
 config FB_EFI
 	bool "EFI-based Framebuffer Support"
-	depends on (FB = y) && !IA64 && EFI
+	depends on (FB = y) && EFI
 	select APERTURE_HELPERS
 	select DRM_PANEL_ORIENTATION_QUIRKS
 	select FB_IOMEM_HELPERS
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 751458959411..8cb6fa45d599 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1287,7 +1287,7 @@ config INTEL_MID_WATCHDOG
 
 config ITCO_WDT
 	tristate "Intel TCO Timer/Watchdog"
-	depends on (X86 || IA64) && PCI
+	depends on X86 && PCI
 	select WATCHDOG_CORE
 	depends on I2C || I2C=n
 	depends on MFD_INTEL_PMC_BXT || !MFD_INTEL_PMC_BXT
diff --git a/fs/Kconfig b/fs/Kconfig
index aa7e03cc1941..421f68fed1ba 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -255,7 +255,7 @@ config ARCH_SUPPORTS_HUGETLBFS
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+	depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
 	depends on (SYSFS || SYSCTL)
 	select MEMFD_CREATE
 	help
diff --git a/fs/afs/main.c b/fs/afs/main.c
index eae288c8d40a..6425c81d07de 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -41,8 +41,6 @@ const char afs_init_sysname[] = "arm_linux26";
 const char afs_init_sysname[] = "aarch64_linux26";
 #elif defined(CONFIG_X86_32)
 const char afs_init_sysname[] = "i386_linux26";
-#elif defined(CONFIG_IA64)
-const char afs_init_sysname[] = "ia64_linux26";
 #elif defined(CONFIG_PPC64)
 const char afs_init_sysname[] = "ppc64_linux26";
 #elif defined(CONFIG_PPC32)
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index c14852362fce..052d0e888c27 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -22,7 +22,7 @@
 /*
  * On intel, even if sizes match, alignment and/or padding may differ.
  */
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64)
 #define BROKEN_X86_ALIGNMENT
 #define __compat_packed __attribute__((packed))
 #else
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..9bcf5641a7cf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -259,7 +259,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 /* the following numa functions are architecture-dependent */
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 
-#if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH)
+#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
 void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
 #else
 static inline void
@@ -1114,15 +1114,8 @@ static inline int acpi_get_lps0_constraint(struct device *dev)
 	return ACPI_STATE_UNKNOWN;
 }
 #endif /* CONFIG_SUSPEND && CONFIG_X86 */
-#ifndef CONFIG_IA64
 void arch_reserve_mem_area(acpi_physical_address addr, size_t size);
 #else
-static inline void arch_reserve_mem_area(acpi_physical_address addr,
-					  size_t size)
-{
-}
-#endif /* CONFIG_X86 */
-#else
 #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0)
 #endif
 
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 80b21d1c6eaf..9cc5bf32f6f2 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -358,13 +358,10 @@ void efi_native_runtime_setup(void);
  * where the UEFI SPEC breaks the line.
  */
 #define NULL_GUID				EFI_GUID(0x00000000, 0x0000, 0x0000,  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00)
-#define MPS_TABLE_GUID				EFI_GUID(0xeb9d2d2f, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define ACPI_TABLE_GUID				EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define ACPI_20_TABLE_GUID			EFI_GUID(0x8868e871, 0xe4f1, 0x11d3,  0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
 #define SMBIOS_TABLE_GUID			EFI_GUID(0xeb9d2d31, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define SMBIOS3_TABLE_GUID			EFI_GUID(0xf2fd1544, 0x9794, 0x4a2c,  0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94)
-#define SAL_SYSTEM_TABLE_GUID			EFI_GUID(0xeb9d2d32, 0x2d88, 0x11d3,  0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
-#define HCDP_TABLE_GUID				EFI_GUID(0xf951938d, 0x620b, 0x42ef,  0x82, 0x79, 0xa8, 0x4b, 0x79, 0x61, 0x78, 0x98)
 #define UGA_IO_PROTOCOL_GUID			EFI_GUID(0x61a4d49e, 0x6f68, 0x4f1b,  0xb9, 0x22, 0xa8, 0x6e, 0xed, 0x0b, 0x07, 0xa2)
 #define EFI_GLOBAL_VARIABLE_GUID		EFI_GUID(0x8be4df61, 0x93ca, 0x11d2,  0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c)
 #define UV_SYSTEM_TABLE_GUID			EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd,  0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93)
@@ -851,10 +848,6 @@ static inline int efi_range_is_wc(unsigned long start, unsigned long len)
 	return 1;
 }
 
-#ifdef CONFIG_EFI_PCDP
-extern int __init efi_setup_pcdp_console(char *);
-#endif
-
 /*
  * We play games with efi_enabled so that the compiler will, if
  * possible, remove EFI-related code altogether.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..17d530e12f70 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -362,8 +362,6 @@ extern unsigned int kobjsize(const void *objp);
 # define VM_SAO		VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
 # define VM_GROWSUP	VM_ARCH_1
-#elif defined(CONFIG_IA64)
-# define VM_GROWSUP	VM_ARCH_1
 #elif defined(CONFIG_SPARC64)
 # define VM_SPARC_ADI	VM_ARCH_1	/* Uses ADI tag for access control */
 # define VM_ARCH_CLEAR	VM_SPARC_ADI
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 962cd41a2cb5..99e4f1f718c7 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -276,7 +276,7 @@ struct kparam_array
    read-only sections (which is part of respective UNIX ABI on these
    platforms). So 'const' makes no sense and even causes compile failures
    with some compilers. */
-#if defined(CONFIG_ALPHA) || defined(CONFIG_IA64) || defined(CONFIG_PPC64)
+#if defined(CONFIG_ALPHA) || defined(CONFIG_PPC64)
 #define __moduleparam_const
 #else
 #define __moduleparam_const const
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 1478b9dd05fa..d801409b33cf 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -144,7 +144,7 @@ IF_HAVE_PG_ARCH_X(arch_3)
 #define __VM_ARCH_SPECIFIC_1 {VM_PAT,     "pat"           }
 #elif defined(CONFIG_PPC)
 #define __VM_ARCH_SPECIFIC_1 {VM_SAO,     "sao"           }
-#elif defined(CONFIG_PARISC) || defined(CONFIG_IA64)
+#elif defined(CONFIG_PARISC)
 #define __VM_ARCH_SPECIFIC_1 {VM_GROWSUP,	"growsup"	}
 #elif !defined(CONFIG_MMU)
 #define __VM_ARCH_SPECIFIC_1 {VM_MAPPED_COPY,"mappedcopy"	}
diff --git a/init/Kconfig b/init/Kconfig
index 6d35728b94b2..9ffb103fc927 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1723,7 +1723,7 @@ config KALLSYMS_ABSOLUTE_PERCPU
 config KALLSYMS_BASE_RELATIVE
 	bool
 	depends on KALLSYMS
-	default !IA64
+	default y
 	help
 	  Instead of emitting them as absolute values in the native word size,
 	  emit the symbol references in the kallsyms table as 32-bit entries,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..234361530007 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1725,9 +1725,6 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
 	if (!cpu_possible(cpu)) {
 		pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
 		       cpu);
-#if defined(CONFIG_IA64)
-		pr_err("please check additional_cpus= boot parameter\n");
-#endif
 		return -EINVAL;
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..faa921ecbe67 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3144,7 +3144,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 		if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
 			return false;
 
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
 		kargs->stack += kargs->stack_size;
 #endif
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2299a5cfbfb9..caab7cd26790 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10289,9 +10289,9 @@ void normalize_rt_tasks(void)
 
 #endif /* CONFIG_MAGIC_SYSRQ */
 
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+#if defined(CONFIG_KGDB_KDB)
 /*
- * These functions are only useful for the IA64 MCA handling, or kdb.
+ * These functions are only useful for kdb.
  *
  * They can only be called when the whole system has been
  * stopped - every CPU needs to be quiescent, and no scheduling
@@ -10313,30 +10313,7 @@ struct task_struct *curr_task(int cpu)
 	return cpu_curr(cpu);
 }
 
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * ia64_set_curr_task - set the current task for a given CPU.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a CPU in a non-blocking manner. This function
- * must be called with all CPU's synchronized, and interrupts disabled, the
- * and caller must save the original value of the current task (see
- * curr_task() above) and restore that value before reenabling interrupts and
- * re-starting the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void ia64_set_curr_task(int cpu, struct task_struct *p)
-{
-	cpu_curr(cpu) = p;
-}
-
-#endif
+#endif /* defined(CONFIG_KGDB_KDB) */
 
 #ifdef CONFIG_CGROUP_SCHED
 /* task_group_lock serializes the addition/removal of task groups */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..dca7a8f735cd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1939,15 +1939,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_IA64
-	{
-		.procname	= "unaligned-dump-stack",
-		.data		= &unaligned_dump_stack,
-		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 #ifdef CONFIG_RT_MUTEXES
 	{
 		.procname	= "max_lock_depth",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fa307f93fa2e..2caf73c9f623 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -759,7 +759,7 @@ config SHRINKER_DEBUG
 
 config DEBUG_STACK_USAGE
 	bool "Stack utilization instrumentation"
-	depends on DEBUG_KERNEL && !IA64
+	depends on DEBUG_KERNEL
 	help
 	  Enables the display of the minimum amount of free stack which each
 	  task has ever had available in the sysrq-T and sysrq-P debug output.
diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c
index 353268b9f129..842894158944 100644
--- a/lib/decompress_unxz.c
+++ b/lib/decompress_unxz.c
@@ -133,9 +133,6 @@
 #ifdef CONFIG_ARM
 #	define XZ_DEC_ARM
 #endif
-#ifdef CONFIG_IA64
-#	define XZ_DEC_IA64
-#endif
 #ifdef CONFIG_SPARC
 #	define XZ_DEC_SPARC
 #endif
diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig
index adce22ac18d6..aef086a6bf2f 100644
--- a/lib/xz/Kconfig
+++ b/lib/xz/Kconfig
@@ -19,11 +19,6 @@ config XZ_DEC_POWERPC
 	default y
 	select XZ_DEC_BCJ
 
-config XZ_DEC_IA64
-	bool "IA-64 BCJ filter decoder" if EXPERT
-	default y
-	select XZ_DEC_BCJ
-
 config XZ_DEC_ARM
 	bool "ARM BCJ filter decoder" if EXPERT
 	default y
diff --git a/mm/mmap.c b/mm/mmap.c
index b56a7f0c9f85..4b0eaf6427c4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1924,9 +1924,9 @@ static int acct_stack_growth(struct vm_area_struct *vma,
 	return 0;
 }
 
-#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+#if defined(CONFIG_STACK_GROWSUP)
 /*
- * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * PA-RISC uses this for its stack.
  * vma is the last one with address > vma->vm_end.  Have to extend vma.
  */
 static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
@@ -2023,7 +2023,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	validate_mm(mm);
 	return error;
 }
-#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+#endif /* CONFIG_STACK_GROWSUP */
 
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h
deleted file mode 100644
index 6fffe5682713..000000000000
--- a/tools/arch/ia64/include/asm/barrier.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copied from the kernel sources to tools/:
- *
- * Memory barrier definitions.  This is based on information published
- * in the Processor Abstraction Layer and the System Abstraction Layer
- * manual.
- *
- * Copyright (C) 1998-2003 Hewlett-Packard Co
- *	David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
- * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
- */
-#ifndef _TOOLS_LINUX_ASM_IA64_BARRIER_H
-#define _TOOLS_LINUX_ASM_IA64_BARRIER_H
-
-#include <linux/compiler.h>
-
-/*
- * Macros to force memory ordering.  In these descriptions, "previous"
- * and "subsequent" refer to program order; "visible" means that all
- * architecturally visible effects of a memory access have occurred
- * (at a minimum, this means the memory has been read or written).
- *
- *   wmb():	Guarantees that all preceding stores to memory-
- *		like regions are visible before any subsequent
- *		stores and that all following stores will be
- *		visible only after all previous stores.
- *   rmb():	Like wmb(), but for reads.
- *   mb():	wmb()/rmb() combo, i.e., all previous memory
- *		accesses are visible before all subsequent
- *		accesses and vice versa.  This is also known as
- *		a "fence."
- *
- * Note: "mb()" and its variants cannot be used as a fence to order
- * accesses to memory mapped I/O registers.  For that, mf.a needs to
- * be used.  However, we don't want to always use mf.a because (a)
- * it's (presumably) much slower than mf and (b) mf.a is supported for
- * sequential memory pages only.
- */
-
-#define mb()		ia64_mf()
-#define rmb()		mb()
-#define wmb()		mb()
-
-#define smp_store_release(p, v)			\
-do {						\
-	barrier();				\
-	WRITE_ONCE(*p, v);			\
-} while (0)
-
-#define smp_load_acquire(p)			\
-({						\
-	typeof(*p) ___p1 = READ_ONCE(*p);	\
-	barrier();				\
-	___p1;					\
-})
-
-#endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
diff --git a/tools/arch/ia64/include/uapi/asm/bitsperlong.h b/tools/arch/ia64/include/uapi/asm/bitsperlong.h
deleted file mode 100644
index 1146d55563db..000000000000
--- a/tools/arch/ia64/include/uapi/asm/bitsperlong.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __ASM_IA64_BITSPERLONG_H
-#define __ASM_IA64_BITSPERLONG_H
-
-#define __BITS_PER_LONG 64
-
-#include <asm-generic/bitsperlong.h>
-
-#endif /* __ASM_IA64_BITSPERLONG_H */
diff --git a/tools/arch/ia64/include/uapi/asm/mman.h b/tools/arch/ia64/include/uapi/asm/mman.h
deleted file mode 100644
index 2a19bb1db4ab..000000000000
--- a/tools/arch/ia64/include/uapi/asm/mman.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef TOOLS_ARCH_IA64_UAPI_ASM_MMAN_FIX_H
-#define TOOLS_ARCH_IA64_UAPI_ASM_MMAN_FIX_H
-#include <uapi/asm-generic/mman.h>
-/* MAP_32BIT is undefined on ia64, fix it for perf */
-#define MAP_32BIT	0
-#endif
diff --git a/usr/include/Makefile b/usr/include/Makefile
index 07796df0a295..338c81f1fcf3 100644
--- a/usr/include/Makefile
+++ b/usr/include/Makefile
@@ -59,12 +59,6 @@ ifeq ($(SRCARCH),arc)
 no-header-test += linux/bpf_perf_event.h
 endif
 
-ifeq ($(SRCARCH),ia64)
-no-header-test += asm/setup.h
-no-header-test += asm/sigcontext.h
-no-header-test += linux/if_bonding.h
-endif
-
 ifeq ($(SRCARCH),powerpc)
 no-header-test += linux/bpf_perf_event.h
 endif
-- 
cgit v1.2.3


From f5e836884d8e55b416dfad55c29481ec1b65c1f0 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 13 Jan 2023 17:57:47 +0100
Subject: kernel: Drop IA64 support from sig_fault handlers

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 include/linux/sched/signal.h       | 17 ++++-------------
 include/uapi/asm-generic/siginfo.h |  5 -----
 kernel/signal.c                    | 25 +++++--------------------
 3 files changed, 9 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..155332977239 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -303,20 +303,11 @@ static inline void kernel_signal_stop(void)
 
 	schedule();
 }
-#ifdef __ia64__
-# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
-#else
-# define ___ARCH_SI_IA64(_a1, _a2, _a3)
-#endif
 
-int force_sig_fault_to_task(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
-int force_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
-int send_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t);
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+			    struct task_struct *t);
+int force_sig_fault(int sig, int code, void __user *addr);
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
 
 int force_sig_mceerr(int code, void __user *, short);
 int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 0f52d0ac47c5..b7bc545ec3b2 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -68,11 +68,6 @@ union __sifields {
 	/* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */
 	struct {
 		void __user *_addr; /* faulting insn/memory ref. */
-#ifdef __ia64__
-		int _imm;		/* immediate value for "break" */
-		unsigned int _flags;	/* see ia64 si_flags */
-		unsigned long _isr;	/* isr */
-#endif
 
 #define __ADDR_BND_PKEY_PAD  (__alignof__(void *) < sizeof(short) ? \
 			      sizeof(short) : __alignof__(void *))
diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d669..26d9f66e5364 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1718,9 +1718,8 @@ void force_sigsegv(int sig)
 		force_sig(SIGSEGV);
 }
 
-int force_sig_fault_to_task(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t)
+int force_sig_fault_to_task(int sig, int code, void __user *addr,
+			    struct task_struct *t)
 {
 	struct kernel_siginfo info;
 
@@ -1729,24 +1728,15 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr
 	info.si_errno = 0;
 	info.si_code  = code;
 	info.si_addr  = addr;
-#ifdef __ia64__
-	info.si_imm = imm;
-	info.si_flags = flags;
-	info.si_isr = isr;
-#endif
 	return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
 }
 
-int force_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+int force_sig_fault(int sig, int code, void __user *addr)
 {
-	return force_sig_fault_to_task(sig, code, addr
-				       ___ARCH_SI_IA64(imm, flags, isr), current);
+	return force_sig_fault_to_task(sig, code, addr, current);
 }
 
-int send_sig_fault(int sig, int code, void __user *addr
-	___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
-	, struct task_struct *t)
+int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
 {
 	struct kernel_siginfo info;
 
@@ -1755,11 +1745,6 @@ int send_sig_fault(int sig, int code, void __user *addr
 	info.si_errno = 0;
 	info.si_code  = code;
 	info.si_addr  = addr;
-#ifdef __ia64__
-	info.si_imm = imm;
-	info.si_flags = flags;
-	info.si_isr = isr;
-#endif
 	return send_sig_info(info.si_signo, &info, t);
 }
 
-- 
cgit v1.2.3


From 3e15dcf77b23b8e9b9b7f3c0d4def8fe9c12c534 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 8 Sep 2023 16:28:59 +0300
Subject: fs: rename __mnt_{want,drop}_write*() helpers

Before exporting these helpers to modules, make their names more
meaningful.

The names mnt_{get,put)_write_access*() were chosen, because they rhyme
with the inode {get,put)_write_access() helpers, which have a very close
meaning for the inode object.

Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20230817-anfechtbar-ruhelosigkeit-8c6cca8443fc@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230908132900.2983519-2-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c            |  8 ++++----
 fs/internal.h         | 12 ++++++------
 fs/namespace.c        | 34 +++++++++++++++++-----------------
 fs/open.c             |  2 +-
 include/linux/mount.h |  4 ++--
 kernel/acct.c         |  4 ++--
 6 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/fs/inode.c b/fs/inode.c
index 35fd688168c5..7febdc9fd1a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2006,7 +2006,7 @@ void touch_atime(const struct path *path)
 	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	if (__mnt_want_write(mnt) != 0)
+	if (mnt_get_write_access(mnt) != 0)
 		goto skip_update;
 	/*
 	 * File systems can error out when updating inodes if they need to
@@ -2018,7 +2018,7 @@ void touch_atime(const struct path *path)
 	 * of the fs read only, e.g. subvolumes in Btrfs.
 	 */
 	inode_update_time(inode, S_ATIME);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 skip_update:
 	sb_end_write(inode->i_sb);
 }
@@ -2173,9 +2173,9 @@ static int __file_update_time(struct file *file, int sync_mode)
 	struct inode *inode = file_inode(file);
 
 	/* try to update time settings */
-	if (!__mnt_want_write_file(file)) {
+	if (!mnt_get_write_access_file(file)) {
 		ret = inode_update_time(inode, sync_mode);
-		__mnt_drop_write_file(file);
+		mnt_put_write_access_file(file);
 	}
 
 	return ret;
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..8260c738980c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
 
 extern void dissolve_on_fput(struct vfsmount *);
 extern bool may_mount(void);
@@ -101,7 +101,7 @@ static inline void put_file_access(struct file *file)
 		i_readcount_dec(file->f_inode);
 	} else if (file->f_mode & FMODE_WRITER) {
 		put_write_access(file->f_inode);
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 	}
 }
 
@@ -130,9 +130,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
 	 * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
 	 * cleared, it will see s_readonly_remount set.
 	 * For RW->RO transition, the barrier pairs with the barrier in
-	 * __mnt_want_write() before the mnt_is_readonly() check. The barrier
-	 * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
-	 * cleared, it will see s_readonly_remount set.
+	 * mnt_get_write_access() before the mnt_is_readonly() check.
+	 * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+	 * already cleared, it will see s_readonly_remount set.
 	 */
 	smp_wmb();
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..3fe7c0484e6a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
  * can determine when writes are able to occur to a filesystem.
  */
 /**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
  * @m: the mount on which to take a write
  *
  * This tells the low-level filesystem that a write is about to be performed to
  * it, and makes sure that writes are allowed (mnt it read-write) before
  * returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
  * called. This is effectively a refcount.
  */
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
 	int ret = 0;
@@ -401,7 +401,7 @@ int mnt_want_write(struct vfsmount *m)
 	int ret;
 
 	sb_start_write(m->mnt_sb);
-	ret = __mnt_want_write(m);
+	ret = mnt_get_write_access(m);
 	if (ret)
 		sb_end_write(m->mnt_sb);
 	return ret;
@@ -409,15 +409,15 @@ int mnt_want_write(struct vfsmount *m)
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
  * skips incrementing mnt_writers (since the open file already has a reference)
  * and instead only does the check for emergency r/o remounts.  This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
  */
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
 {
 	if (file->f_mode & FMODE_WRITER) {
 		/*
@@ -428,7 +428,7 @@ int __mnt_want_write_file(struct file *file)
 			return -EROFS;
 		return 0;
 	}
-	return __mnt_want_write(file->f_path.mnt);
+	return mnt_get_write_access(file->f_path.mnt);
 }
 
 /**
@@ -445,7 +445,7 @@ int mnt_want_write_file(struct file *file)
 	int ret;
 
 	sb_start_write(file_inode(file)->i_sb);
-	ret = __mnt_want_write_file(file);
+	ret = mnt_get_write_access_file(file);
 	if (ret)
 		sb_end_write(file_inode(file)->i_sb);
 	return ret;
@@ -453,14 +453,14 @@ int mnt_want_write_file(struct file *file)
 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done
  * performing writes to it.  Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
  */
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
 {
 	preempt_disable();
 	mnt_dec_writers(real_mount(mnt));
@@ -477,20 +477,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	sb_end_write(mnt->mnt_sb);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
 {
 	if (!(file->f_mode & FMODE_WRITER))
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write_file(file);
+	mnt_put_write_access_file(file);
 	sb_end_write(file_inode(file)->i_sb);
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..a65ce47810cf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -895,7 +895,7 @@ static int do_dentry_open(struct file *f,
 		error = get_write_access(inode);
 		if (unlikely(error))
 			goto cleanup_file;
-		error = __mnt_want_write(f->f_path.mnt);
+		error = mnt_get_write_access(f->f_path.mnt);
 		if (unlikely(error)) {
 			put_write_access(inode);
 			goto cleanup_file;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..ac3dd2876197 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt);
 extern bool mnt_may_suid(struct vfsmount *mnt);
 
 extern struct vfsmount *clone_private_mount(const struct path *path);
-extern int __mnt_want_write(struct vfsmount *);
-extern void __mnt_drop_write(struct vfsmount *);
+int mnt_get_write_access(struct vfsmount *mnt);
+void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
diff --git a/kernel/acct.c b/kernel/acct.c
index 1a9f929fe629..986c8214dabf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname)
 		filp_close(file, NULL);
 		return PTR_ERR(internal);
 	}
-	err = __mnt_want_write(internal);
+	err = mnt_get_write_access(internal);
 	if (err) {
 		mntput(internal);
 		kfree(acct);
@@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname)
 	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
 	pin_kill(old);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	mntput(mnt);
 	return 0;
 }
-- 
cgit v1.2.3


From d52b59315bf5e86e83c00bfae47cedd388dad6a8 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Sep 2023 21:39:20 +0800
Subject: bpf: Adjust size_index according to the value of KMALLOC_MIN_SIZE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following warning was reported when running "./test_progs -a
link_api -a linked_list" on a RISC-V QEMU VM:

  ------------[ cut here ]------------
  WARNING: CPU: 3 PID: 261 at kernel/bpf/memalloc.c:342 bpf_mem_refill
  Modules linked in: bpf_testmod(OE)
  CPU: 3 PID: 261 Comm: test_progs- ... 6.5.0-rc5-01743-gdcb152bb8328 #2
  Hardware name: riscv-virtio,qemu (DT)
  epc : bpf_mem_refill+0x1fc/0x206
   ra : irq_work_single+0x68/0x70
  epc : ffffffff801b1bc4 ra : ffffffff8015fe84 sp : ff2000000001be20
   gp : ffffffff82d26138 tp : ff6000008477a800 t0 : 0000000000046600
   t1 : ffffffff812b6ddc t2 : 0000000000000000 s0 : ff2000000001be70
   s1 : ff5ffffffffe8998 a0 : ff5ffffffffe8998 a1 : ff600003fef4b000
   a2 : 000000000000003f a3 : ffffffff80008250 a4 : 0000000000000060
   a5 : 0000000000000080 a6 : 0000000000000000 a7 : 0000000000735049
   s2 : ff5ffffffffe8998 s3 : 0000000000000022 s4 : 0000000000001000
   s5 : 0000000000000007 s6 : ff5ffffffffe8570 s7 : ffffffff82d6bd30
   s8 : 000000000000003f s9 : ffffffff82d2c5e8 s10: 000000000000ffff
   s11: ffffffff82d2c5d8 t3 : ffffffff81ea8f28 t4 : 0000000000000000
   t5 : ff6000008fd28278 t6 : 0000000000040000
  [<ffffffff801b1bc4>] bpf_mem_refill+0x1fc/0x206
  [<ffffffff8015fe84>] irq_work_single+0x68/0x70
  [<ffffffff8015feb4>] irq_work_run_list+0x28/0x36
  [<ffffffff8015fefa>] irq_work_run+0x38/0x66
  [<ffffffff8000828a>] handle_IPI+0x3a/0xb4
  [<ffffffff800a5c3a>] handle_percpu_devid_irq+0xa4/0x1f8
  [<ffffffff8009fafa>] generic_handle_domain_irq+0x28/0x36
  [<ffffffff800ae570>] ipi_mux_process+0xac/0xfa
  [<ffffffff8000a8ea>] sbi_ipi_handle+0x2e/0x88
  [<ffffffff8009fafa>] generic_handle_domain_irq+0x28/0x36
  [<ffffffff807ee70e>] riscv_intc_irq+0x36/0x4e
  [<ffffffff812b5d3a>] handle_riscv_irq+0x54/0x86
  [<ffffffff812b6904>] do_irq+0x66/0x98
  ---[ end trace 0000000000000000 ]---

The warning is due to WARN_ON_ONCE(tgt->unit_size != c->unit_size) in
free_bulk(). The direct reason is that a object is allocated and
freed by bpf_mem_caches with different unit_size.

The root cause is that KMALLOC_MIN_SIZE is 64 and there is no 96-bytes
slab cache in the specific VM. When linked_list test allocates a
72-bytes object through bpf_obj_new(), bpf_global_ma will allocate it
from a bpf_mem_cache with 96-bytes unit_size, but this bpf_mem_cache is
backed by 128-bytes slab cache. When the object is freed, bpf_mem_free()
uses ksize() to choose the corresponding bpf_mem_cache. Because the
object is allocated from 128-bytes slab cache, ksize() returns 128,
bpf_mem_free() chooses a 128-bytes bpf_mem_cache to free the object and
triggers the warning.

A similar warning will also be reported when using CONFIG_SLAB instead
of CONFIG_SLUB in a x86-64 kernel. Because CONFIG_SLUB defines
KMALLOC_MIN_SIZE as 8 but CONFIG_SLAB defines KMALLOC_MIN_SIZE as 32.

An alternative fix is to use kmalloc_size_round() in bpf_mem_alloc() to
choose a bpf_mem_cache which has the same unit_size with the backing
slab cache, but it may introduce performance degradation, so fix the
warning by adjusting the indexes in size_index according to the value of
KMALLOC_MIN_SIZE just like setup_kmalloc_cache_index_table() does.

Fixes: 822fb26bdb55 ("bpf: Add a hint to allocated objects.")
Reported-by: Björn Töpel <bjorn@kernel.org>
Closes: https://lore.kernel.org/bpf/87jztjmmy4.fsf@all.your.base.are.belong.to.us
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230908133923.2675053-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9c49ae53deaf..98d9e96fba3c 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -916,3 +916,41 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
+
+/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
+static __init int bpf_mem_cache_adjust_size(void)
+{
+	unsigned int size, index;
+
+	/* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
+	 * up-to 256-bytes.
+	 */
+	size = KMALLOC_MIN_SIZE;
+	if (size <= 192)
+		index = size_index[(size - 1) / 8];
+	else
+		index = fls(size - 1) - 1;
+	for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
+		size_index[(size - 1) / 8] = index;
+
+	/* The minimal alignment is 64-bytes, so disable 96-bytes cache and
+	 * use 128-bytes cache instead.
+	 */
+	if (KMALLOC_MIN_SIZE >= 64) {
+		index = size_index[(128 - 1) / 8];
+		for (size = 64 + 8; size <= 96; size += 8)
+			size_index[(size - 1) / 8] = index;
+	}
+
+	/* The minimal alignment is 128-bytes, so disable 192-bytes cache and
+	 * use 256-bytes cache instead.
+	 */
+	if (KMALLOC_MIN_SIZE >= 128) {
+		index = fls(256 - 1) - 1;
+		for (size = 128 + 8; size <= 192; size += 8)
+			size_index[(size - 1) / 8] = index;
+	}
+
+	return 0;
+}
+subsys_initcall(bpf_mem_cache_adjust_size);
-- 
cgit v1.2.3


From b1d53958b69312e43c118d4093d8f93d3f6f80af Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Sep 2023 21:39:21 +0800
Subject: bpf: Don't prefill for unused bpf_mem_cache

When the unit_size of a bpf_mem_cache is unmatched with the object_size
of the underlying slab cache, the bpf_mem_cache will not be used, and
the allocation will be redirected to a bpf_mem_cache with a bigger
unit_size instead, so there is no need to prefill for these
unused bpf_mem_caches.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230908133923.2675053-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 98d9e96fba3c..90c1ed8210a2 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * Typical case will be between 11K and 116K closer to 11K.
  * bpf progs can and should share bpf_mem_cache when possible.
  */
-
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void init_refill_work(struct bpf_mem_cache *c)
 {
 	init_irq_work(&c->refill_work, bpf_mem_refill);
 	if (c->unit_size <= 256) {
@@ -476,7 +475,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 		c->high_watermark = max(96 * 256 / c->unit_size, 3);
 	}
 	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
 
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
 	/* To avoid consuming memory assume that 1st run of bpf
 	 * prog won't be doing more than 4 map_update_elem from
 	 * irq disabled region
@@ -521,6 +523,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->objcg = objcg;
 			c->percpu_size = percpu_size;
 			c->tgt = c;
+			init_refill_work(c);
 			prefill_mem_cache(c, cpu);
 		}
 		ma->cache = pc;
@@ -544,6 +547,15 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
 			c->tgt = c;
+
+			init_refill_work(c);
+			/* Another bpf_mem_cache will be used when allocating
+			 * c->unit_size in bpf_mem_alloc(), so doesn't prefill
+			 * for the bpf_mem_cache because these free objects will
+			 * never be used.
+			 */
+			if (i != bpf_mem_cache_idx(c->unit_size))
+				continue;
 			prefill_mem_cache(c, cpu);
 		}
 	}
-- 
cgit v1.2.3


From c930472552022bd09aab3cd946ba3f243070d5c7 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Sep 2023 21:39:22 +0800
Subject: bpf: Ensure unit_size is matched with slab cache object size

Add extra check in bpf_mem_alloc_init() to ensure the unit_size of
bpf_mem_cache is matched with the object_size of underlying slab cache.
If these two sizes are unmatched, print a warning once and return
-EINVAL in bpf_mem_alloc_init(), so the mismatch can be found early and
the potential issue can be prevented.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20230908133923.2675053-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 90c1ed8210a2..1c22b90e754a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -486,6 +486,24 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
 }
 
+static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
+{
+	struct llist_node *first;
+	unsigned int obj_size;
+
+	first = c->free_llist.first;
+	if (!first)
+		return 0;
+
+	obj_size = ksize(first);
+	if (obj_size != c->unit_size) {
+		WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
+			  idx, obj_size, c->unit_size);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* When size != 0 bpf_mem_cache for each cpu.
  * This is typical bpf hash map use case when all elements have equal size.
  *
@@ -496,10 +514,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 {
 	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+	int cpu, i, err, unit_size, percpu_size = 0;
 	struct bpf_mem_caches *cc, __percpu *pcc;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
-	int cpu, i, unit_size, percpu_size = 0;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -537,6 +555,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
 	if (!pcc)
 		return -ENOMEM;
+	err = 0;
 #ifdef CONFIG_MEMCG_KMEM
 	objcg = get_obj_cgroup_from_current();
 #endif
@@ -557,10 +576,20 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			if (i != bpf_mem_cache_idx(c->unit_size))
 				continue;
 			prefill_mem_cache(c, cpu);
+			err = check_obj_size(c, i);
+			if (err)
+				goto out;
 		}
 	}
+
+out:
 	ma->caches = pcc;
-	return 0;
+	/* refill_work is either zeroed or initialized, so it is safe to
+	 * call irq_work_sync().
+	 */
+	if (err)
+		bpf_mem_alloc_destroy(ma);
+	return err;
 }
 
 static void drain_mem_cache(struct bpf_mem_cache *c)
-- 
cgit v1.2.3


From b934b7ff5ea755473d9737d79ab303722ceba22e Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 12 Jul 2023 23:15:56 +0800
Subject: rcu: Delete a redundant check in rcu_check_gp_kthread_starvation()

The rcu_check_gp_kthread_starvation() function uses task_cpu() to sample
the last CPU that the grace-period kthread ran on, and task_cpu() samples
the thread_info structure's ->cpu field.  But this field will always
contain a number corresponding to a CPU that was online some time in
the past, thus never a negative number.  This invariant is checked by
a WARN_ON_ONCE() in set_task_cpu().

This means that if the grace-period kthread exists, that is, if the "gpk"
local variable is non-NULL, the "cpu" local variable will be non-negative.
This in turn means that the existing check for non-negative "cpu" is
redundant with the enclosing check for non-NULL "gpk".

This commit threefore removes the redundant check of "cpu".

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree_stall.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 6f06dc12904a..b5ce0580074e 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -537,13 +537,11 @@ static void rcu_check_gp_kthread_starvation(void)
 			pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
 			pr_err("RCU grace-period kthread stack dump:\n");
 			sched_show_task(gpk);
-			if (cpu >= 0) {
-				if (cpu_is_offline(cpu)) {
-					pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
-				} else  {
-					pr_err("Stack dump where RCU GP kthread last ran:\n");
-					dump_cpu_task(cpu);
-				}
+			if (cpu_is_offline(cpu)) {
+				pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+			} else  {
+				pr_err("Stack dump where RCU GP kthread last ran:\n");
+				dump_cpu_task(cpu);
 			}
 			wake_up_process(gpk);
 		}
-- 
cgit v1.2.3


From f3efe02fd56e2498ad8da4c4f444a34aa1456a46 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 12 Jul 2023 23:15:57 +0800
Subject: rcu: Don't redump the stalled CPU where RCU GP kthread last ran

The stacks of all stalled CPUs will be dumped in rcu_dump_cpu_stacks().
If the CPU on where RCU GP kthread last ran is stalled, its stack does
not need to be dumped again. We can search the corresponding backtrace
based on the printed CPU ID.

For example:
[   87.328275] rcu: rcu_sched kthread starved for ... ->cpu=3  <--------|
... ...                                                                 |
[   89.385007] NMI backtrace for cpu 3                         <--------|
[   89.385179] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.10.0+ #22 <--|
[   89.385188] Hardware name: linux,dummy-virt (DT)
[   89.385196] pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--)
[   89.385204] pc : arch_cpu_idle+0x40/0xc0
[   89.385211] lr : arch_cpu_idle+0x2c/0xc0
... ...
[   89.385566] Call trace:
[   89.385574]  arch_cpu_idle+0x40/0xc0
[   89.385581]  default_idle_call+0x100/0x450
[   89.385589]  cpuidle_idle_call+0x2f8/0x460
[   89.385596]  do_idle+0x1dc/0x3d0
[   89.385604]  cpu_startup_entry+0x5c/0xb0
[   89.385613]  secondary_start_kernel+0x35c/0x520

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree_stall.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b5ce0580074e..fc04a8d7ce96 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -534,12 +534,14 @@ static void rcu_check_gp_kthread_starvation(void)
 		       data_race(READ_ONCE(rcu_state.gp_state)),
 		       gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
 		if (gpk) {
+			struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
 			pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
 			pr_err("RCU grace-period kthread stack dump:\n");
 			sched_show_task(gpk);
 			if (cpu_is_offline(cpu)) {
 				pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
-			} else  {
+			} else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) {
 				pr_err("Stack dump where RCU GP kthread last ran:\n");
 				dump_cpu_task(cpu);
 			}
-- 
cgit v1.2.3


From 243d5ab34446854ceca55146fc0d837655657f8e Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Mon, 24 Jul 2023 10:26:51 +0800
Subject: rcu: Eliminate check_cpu_stall() duplicate code

The code and comments of self-detected and other-detected RCU CPU stall
warnings are identical except the output function.  This commit therefore
refactors so as to consolidate the duplicate code.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree_stall.h | 42 +++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index fc04a8d7ce96..2443d1d4a6dc 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -711,7 +711,7 @@ static void print_cpu_stall(unsigned long gps)
 
 static void check_cpu_stall(struct rcu_data *rdp)
 {
-	bool didstall = false;
+	bool self_detected;
 	unsigned long gs1;
 	unsigned long gs2;
 	unsigned long gps;
@@ -758,10 +758,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
 		return; /* No stall or GP completed since entering function. */
 	rnp = rdp->mynode;
 	jn = jiffies + ULONG_MAX / 2;
+	self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
 	if (rcu_gp_in_progress() &&
-	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+	    (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
 	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
 		/*
 		 * If a virtual machine is stopped by the host it can look to
 		 * the watchdog like an RCU stall. Check to see if the host
@@ -770,33 +770,21 @@ static void check_cpu_stall(struct rcu_data *rdp)
 		if (kvm_check_and_clear_guest_paused())
 			return;
 
-		/* We haven't checked in, so go dump stack. */
-		print_cpu_stall(gps);
-		if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
-			rcu_ftrace_dump(DUMP_ALL);
-		didstall = true;
-
-	} else if (rcu_gp_in_progress() &&
-		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
-		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
-		/*
-		 * If a virtual machine is stopped by the host it can look to
-		 * the watchdog like an RCU stall. Check to see if the host
-		 * stopped the vm.
-		 */
-		if (kvm_check_and_clear_guest_paused())
-			return;
+		if (self_detected) {
+			/* We haven't checked in, so go dump stack. */
+			print_cpu_stall(gps);
+		} else {
+			/* They had a few time units to dump stack, so complain. */
+			print_other_cpu_stall(gs2, gps);
+		}
 
-		/* They had a few time units to dump stack, so complain. */
-		print_other_cpu_stall(gs2, gps);
 		if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
 			rcu_ftrace_dump(DUMP_ALL);
-		didstall = true;
-	}
-	if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
-		jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
-		WRITE_ONCE(rcu_state.jiffies_stall, jn);
+
+		if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+			jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+			WRITE_ONCE(rcu_state.jiffies_stall, jn);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From 5b404fdabacf4bee92d8c66013402a85f18a6a10 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Aug 2023 15:46:32 -0700
Subject: rcu: Add RCU CPU stall notifier

It is sometimes helpful to have a way for the subsystem causing
the stall to dump its state when an RCU CPU stall occurs.  This
commit therefore bases rcu_stall_chain_notifier_register() and
rcu_stall_chain_notifier_unregister() on atomic notifiers in order to
provide this functionality.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcu_notifier.h | 32 ++++++++++++++++++++++++
 kernel/rcu/rcu.h             |  6 +++++
 kernel/rcu/tree_exp.h        |  6 ++++-
 kernel/rcu/tree_stall.h      | 59 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/rcu_notifier.h

(limited to 'kernel')

diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
new file mode 100644
index 000000000000..ebf371364581
--- /dev/null
+++ b/include/linux/rcu_notifier.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update notifiers, initially RCU CPU stall notifier.
+ * Separate from rcupdate.h to avoid #include loops.
+ *
+ * Copyright (C) 2023 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_RCU_NOTIFIER_H
+#define __LINUX_RCU_NOTIFIER_H
+
+// Actions for RCU CPU stall notifier calls.
+#define RCU_STALL_NOTIFY_NORM	1
+#define RCU_STALL_NOTIFY_EXP	2
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#include <linux/notifier.h>
+#include <linux/types.h>
+
+int rcu_stall_chain_notifier_register(struct notifier_block *n);
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
+
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+
+// No RCU CPU stall warnings in Tiny RCU.
+static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
+static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
+
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
+#endif /* __LINUX_RCU_NOTIFIER_H */
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..ef3bab977407 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -654,4 +654,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
 bool rcu_cpu_beenfullyonline(int cpu);
 #endif
 
+#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #ifdef CONFIG_RCU_STALL_COMMON
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8239b39d945b..6d7cea5d591f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -621,10 +621,14 @@ static void synchronize_rcu_expedited_wait(void)
 	}
 
 	for (;;) {
+		unsigned long j;
+
 		if (synchronize_rcu_expedited_wait_once(jiffies_stall))
 			return;
 		if (rcu_stall_is_suppressed())
 			continue;
+		j = jiffies;
+		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
 		trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
 		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
 		       rcu_state.name);
@@ -647,7 +651,7 @@ static void synchronize_rcu_expedited_wait(void)
 			}
 		}
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
-			jiffies - jiffies_start, rcu_state.expedited_sequence,
+			j - jiffies_start, rcu_state.expedited_sequence,
 			data_race(rnp_root->expmask),
 			".T"[!!data_race(rnp_root->exp_tasks)]);
 		if (ndetected) {
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2443d1d4a6dc..49544f932279 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -770,6 +771,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
 		if (kvm_check_and_clear_guest_paused())
 			return;
 
+		rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
 		if (self_detected) {
 			/* We haven't checked in, so go dump stack. */
 			print_cpu_stall(gps);
@@ -790,7 +792,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// RCU forward-progress mechanisms, including of callback invocation.
+// RCU forward-progress mechanisms, including for callback invocation.
 
 
 /*
@@ -1042,3 +1044,58 @@ static int __init rcu_sysrq_init(void)
 	return 0;
 }
 early_initcall(rcu_sysrq_init);
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends.  The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+	return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+	return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain.  See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+	return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
-- 
cgit v1.2.3


From 7c1b3e0c988f2902695ef6175ab8ad00c0e8b65f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 15 Aug 2023 15:53:32 -0700
Subject: rcutorture: Add test of RCU CPU stall notifiers

This commit registers an RCU CPU stall notifier when testing RCU CPU
stalls.  The notifier logs a message similar to the following:

	rcu_torture_stall_nf: v=1, duration=21001.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcutorture.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ade42d6a9d9b..1830cacac01d 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -21,6 +21,7 @@
 #include <linux/spinlock.h>
 #include <linux/smp.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/rcu_notifier.h>
 #include <linux/interrupt.h>
 #include <linux/sched/signal.h>
 #include <uapi/linux/sched/types.h>
@@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu)
 	return 0;
 }
 
+static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr)
+{
+	pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_torture_stall_block = {
+	.notifier_call = rcu_torture_stall_nf,
+};
+
 /*
  * CPU-stall kthread.  It waits as specified by stall_cpu_holdoff, then
  * induces a CPU stall for the time specified by stall_cpu.
@@ -2435,9 +2446,14 @@ static int rcutorture_booster_init(unsigned int cpu)
 static int rcu_torture_stall(void *args)
 {
 	int idx;
+	int ret;
 	unsigned long stop_at;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+	ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+	if (ret)
+		pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+			__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
 	if (stall_cpu_holdoff > 0) {
 		VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
 		schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2481,6 +2497,11 @@ static int rcu_torture_stall(void *args)
 		cur_ops->readunlock(idx);
 	}
 	pr_alert("%s end.\n", __func__);
+	if (!ret) {
+		ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
+		if (ret)
+			pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
+	}
 	torture_shutdown_absorb("rcu_torture_stall");
 	while (!kthread_should_stop())
 		schedule_timeout_interruptible(10 * HZ);
-- 
cgit v1.2.3


From b96e7a5fa0ba9cda32888e04f8f4bac42d49a7f8 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Tue, 5 Sep 2023 00:02:11 +0000
Subject: rcu/tree: Defer setting of jiffies during stall reset

There are instances where rcu_cpu_stall_reset() is called when jiffies
did not get a chance to update for a long time. Before jiffies is
updated, the CPU stall detector can go off triggering false-positives
where a just-started grace period appears to be ages old. In the past,
we disabled stall detection in rcu_cpu_stall_reset() however this got
changed [1]. This is resulting in false-positives in KGDB usecase [2].

Fix this by deferring the update of jiffies to the third run of the FQS
loop. This is more robust, as, even if rcu_cpu_stall_reset() is called
just before jiffies is read, we would end up pushing out the jiffies
read by 3 more FQS loops. Meanwhile the CPU stall detection will be
delayed and we will not get any false positives.

[1] https://lore.kernel.org/all/20210521155624.174524-2-senozhatsky@chromium.org/
[2] https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/

Tested with rcutorture.cpu_stall option as well to verify stall behavior
with/without patch.

Tested-by: Huacai Chen <chenhuacai@loongson.cn>
Reported-by: Binbin Zhou <zhoubinbin@loongson.cn>
Closes: https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/
Suggested-by: Paul  McKenney <paulmck@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: a80be428fbc1 ("rcu: Do not disable GP stall detection in rcu_cpu_stall_reset()")
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c       | 12 ++++++++++++
 kernel/rcu/tree.h       |  4 ++++
 kernel/rcu/tree_stall.h | 20 ++++++++++++++++++--
 3 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb1caefa8bd0..d85779f67aea 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1556,10 +1556,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
  */
 static void rcu_gp_fqs(bool first_time)
 {
+	int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
 	struct rcu_node *rnp = rcu_get_root();
 
 	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
+
+	WARN_ON_ONCE(nr_fqs > 3);
+	/* Only countdown nr_fqs for stall purposes if jiffies moves. */
+	if (nr_fqs) {
+		if (nr_fqs == 1) {
+			WRITE_ONCE(rcu_state.jiffies_stall,
+				   jiffies + rcu_jiffies_till_stall_check());
+		}
+		WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
+	}
+
 	if (first_time) {
 		/* Collect dyntick-idle snapshots. */
 		force_qs_rnp(dyntick_save_progress_counter);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 192536916f9a..e9821a8422db 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -386,6 +386,10 @@ struct rcu_state {
 						/*  in jiffies. */
 	unsigned long jiffies_stall;		/* Time at which to check */
 						/*  for CPU stalls. */
+	int nr_fqs_jiffies_stall;		/* Number of fqs loops after
+						 * which read jiffies and set
+						 * jiffies_stall. Stall
+						 * warnings disabled if !0. */
 	unsigned long jiffies_resched;		/* Time at which to resched */
 						/*  a reluctant CPU. */
 	unsigned long n_force_qs_gpstart;	/* Snapshot of n_force_qs at */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 49544f932279..ac8e86babe44 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -150,12 +150,17 @@ static void panic_on_rcu_stall(void)
 /**
  * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
  *
+ * To perform the reset request from the caller, disable stall detection until
+ * 3 fqs loops have passed. This is required to ensure a fresh jiffies is
+ * loaded.  It should be safe to do from the fqs loop as enough timer
+ * interrupts and context switches should have passed.
+ *
  * The caller must disable hard irqs.
  */
 void rcu_cpu_stall_reset(void)
 {
-	WRITE_ONCE(rcu_state.jiffies_stall,
-		   jiffies + rcu_jiffies_till_stall_check());
+	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
+	WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -171,6 +176,7 @@ static void record_gp_stall_check_time(void)
 	WRITE_ONCE(rcu_state.gp_start, j);
 	j1 = rcu_jiffies_till_stall_check();
 	smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
+	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
 	WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
 	rcu_state.jiffies_resched = j + j1 / 2;
 	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
@@ -726,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp)
 	    !rcu_gp_in_progress())
 		return;
 	rcu_stall_kick_kthreads();
+
+	/*
+	 * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+	 * loop has to set jiffies to ensure a non-stale jiffies value. This
+	 * is required to have good jiffies value after coming out of long
+	 * breaks of jiffies updates. Not doing so can cause false positives.
+	 */
+	if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+		return;
+
 	j = jiffies;
 
 	/*
-- 
cgit v1.2.3


From 92a708dc1fb8c33b0017ad77dc7ff6e434f96ee2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jul 2023 13:13:46 -0700
Subject: rcu-tasks: Add printk()s to localize boot-time self-test hang

Currently, rcu_tasks_initiate_self_tests() prints a message and then
initiates self tests on up to three different RCU Tasks flavors.  If one
of the flavors has a grace-period hang, it is not easy to work out which
of the three hung.  This commit therefore prints a message prior to each
individual test.

Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tasks.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8d65f7d576a3..83049a893de5 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1979,20 +1979,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
 
 static void rcu_tasks_initiate_self_tests(void)
 {
-	pr_info("Running RCU-tasks wait API self tests\n");
 #ifdef CONFIG_TASKS_RCU
+	pr_info("Running RCU Tasks wait API self tests\n");
 	tests[0].runstart = jiffies;
 	synchronize_rcu_tasks();
 	call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_RUDE_RCU
+	pr_info("Running RCU Tasks Rude wait API self tests\n");
 	tests[1].runstart = jiffies;
 	synchronize_rcu_tasks_rude();
 	call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_TRACE_RCU
+	pr_info("Running RCU Tasks Trace wait API self tests\n");
 	tests[2].runstart = jiffies;
 	synchronize_rcu_tasks_trace();
 	call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
-- 
cgit v1.2.3


From e62d8ae4620865411d1b2347980aa28ccf891a3d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Aug 2023 13:42:00 -0700
Subject: rcu-tasks: Pull sampling of ->percpu_dequeue_lim out of loop

The rcu_tasks_need_gpcb() samples ->percpu_dequeue_lim as part of the
condition clause of a "for" loop, which is a bit confusing.  This commit
therefore hoists this sampling out of the loop, using the result loaded
in the condition clause.

So why does this work in the face of a concurrent switch from single-CPU
queueing to per-CPU queueing?

o	The call_rcu_tasks_generic() that makes the change has already
	enqueued its callback, which means that all of the other CPU's
	callback queues are empty.

o	For the call_rcu_tasks_generic() that first notices
	the switch to per-CPU queues, the smp_store_release()
	used to update ->percpu_enqueue_lim pairs with the
	raw_spin_trylock_rcu_node()'s full barrier that is
	between the READ_ONCE(rtp->percpu_enqueue_shift) and the
	rcu_segcblist_enqueue() that enqueues the callback.

o	Because this CPU's queue is empty (unless it happens to
	be the original single queue, in which case there is no
	need for synchronization), this call_rcu_tasks_generic()
	will do an irq_work_queue() to schedule a handler for the
	needed rcuwait_wake_up() call.	This call will be ordered
	after the first call_rcu_tasks_generic() function's change to
	->percpu_dequeue_lim.

o	This rcuwait_wake_up() will either happen before or after the
	set_current_state() in rcuwait_wait_event().  If it happens
	before, the "condition" argument's call to rcu_tasks_need_gpcb()
	will be ordered after the original change, and all callbacks on
	all CPUs will be visible.  Otherwise, if it happens after, then
	the grace-period kthread's state will be set back to running,
	which will result in a later call to rcuwait_wait_event() and
	thus to rcu_tasks_need_gpcb(), which will again see the change.

So it all works out.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tasks.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 83049a893de5..94bb5abdbb37 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
 static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 {
 	int cpu;
+	int dequeue_limit;
 	unsigned long flags;
 	bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
 	long n;
@@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 	long ncbsnz = 0;
 	int needgpcb = 0;
 
-	for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+	dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+	for (cpu = 0; cpu < dequeue_limit; cpu++) {
 		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
 
 		/* Advance and accelerate any new callbacks. */
-- 
cgit v1.2.3


From 0325e8a1282dfd30c3e44928c6384bb978649c63 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Thu, 3 Aug 2023 16:06:59 +0800
Subject: rcu-tasks: Make rcu_tasks_lazy_ms static

The rcu_tasks_lazy_ms variable is not used outside the file tasks.h,
so this commit marks it static.

kernel/rcu/tasks.h:1085:5: warning: symbol 'rcu_tasks_lazy_ms' was not declared. Should it be static?

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6086
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 94bb5abdbb37..018f03f20629 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1086,7 +1086,7 @@ void rcu_barrier_tasks(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
 
-int rcu_tasks_lazy_ms = -1;
+static int rcu_tasks_lazy_ms = -1;
 module_param(rcu_tasks_lazy_ms, int, 0444);
 
 static int __init rcu_spawn_tasks_kthread(void)
-- 
cgit v1.2.3


From 730c3ed4ba30feadfb20b4d4d18e869bc00348f6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 1 Aug 2023 09:30:18 -0700
Subject: refscale: Fix misplaced data re-read

This commit fixes a misplaced data re-read in the typesafe code.
The reason that this was not noticed is that this is a performance test
with no writers, so a mismatch could not occur.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/refscale.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 91a0fd0d4d9a..750a63e99539 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -655,12 +655,12 @@ retry:
 			goto retry;
 		}
 		un_delay(udl, ndl);
+		b = READ_ONCE(rtsp->a);
 		// Remember, seqlock read-side release can fail.
 		if (!rts_release(rtsp, start)) {
 			rcu_read_unlock();
 			goto retry;
 		}
-		b = READ_ONCE(rtsp->a);
 		WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
 		b = rtsp->b;
 		rcu_read_unlock();
-- 
cgit v1.2.3


From d6fea1dde2064c8f298dbe872587c7ace7701b3f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 16 Aug 2023 11:27:26 -0700
Subject: refscale: Print out additional module parameters

The refscale.verbose_batched and refscale.lookup_instances module
parameters are omitted from the ref_scale_print_module_parms()
beginning-of-test output.  This commit therefore adds them.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/refscale.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 750a63e99539..2c2648a3ad30 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -1025,8 +1025,8 @@ static void
 ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" SCALE_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+		 "--- %s:  verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+		 verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
 }
 
 static void
-- 
cgit v1.2.3


From 62663b849662c1a5126b6274d91671b90566ef13 Mon Sep 17 00:00:00 2001
From: Tero Kristo <tero.kristo@linux.intel.com>
Date: Mon, 11 Sep 2023 17:17:04 +0300
Subject: tracing/synthetic: Print out u64 values properly

The synth traces incorrectly print pointer to the synthetic event values
instead of the actual value when using u64 type. Fix by addressing the
contents of the union properly.

Link: https://lore.kernel.org/linux-trace-kernel/20230911141704.3585965-1-tero.kristo@linux.intel.com

Fixes: ddeea494a16f ("tracing/synthetic: Use union instead of casts")
Cc: stable@vger.kernel.org
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 9897d0bfcab7..14cb275a0bab 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -337,7 +337,7 @@ static void print_synth_event_num_val(struct trace_seq *s,
 		break;
 
 	default:
-		trace_seq_printf(s, print_fmt, name, val, space);
+		trace_seq_printf(s, print_fmt, name, val->as_u64, space);
 		break;
 	}
 }
-- 
cgit v1.2.3


From a34a9f1a19afe9c60ca0ea61dfeee63a1c2baac8 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Mon, 11 Sep 2023 15:28:14 +0200
Subject: bpf: Avoid deadlock when using queue and stack maps from NMI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sysbot discovered that the queue and stack maps can deadlock if they are
being used from a BPF program that can be called from NMI context (such as
one that is attached to a perf HW counter event). To fix this, add an
in_nmi() check and use raw_spin_trylock() in NMI context, erroring out if
grabbing the lock fails.

Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
Reported-by: Hsin-Wei Hung <hsinweih@uci.edu>
Tested-by: Hsin-Wei Hung <hsinweih@uci.edu>
Co-developed-by: Hsin-Wei Hung <hsinweih@uci.edu>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20230911132815.717240-1-toke@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/queue_stack_maps.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8d2ddcb7566b..d869f51ea93a 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 	int err = 0;
 	void *ptr;
 
-	raw_spin_lock_irqsave(&qs->lock, flags);
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&qs->lock, flags);
+	}
 
 	if (queue_stack_map_is_empty(qs)) {
 		memset(value, 0, qs->map.value_size);
@@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 	void *ptr;
 	u32 index;
 
-	raw_spin_lock_irqsave(&qs->lock, flags);
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&qs->lock, flags);
+	}
 
 	if (queue_stack_map_is_empty(qs)) {
 		memset(value, 0, qs->map.value_size);
@@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
 	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&qs->lock, irq_flags);
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&qs->lock, irq_flags);
+	}
 
 	if (queue_stack_map_is_full(qs)) {
 		if (!replace) {
-- 
cgit v1.2.3


From 7d672f40941a549f72aa5500f9d32b26b3f6ab34 Mon Sep 17 00:00:00 2001
From: Christopher James Halse Rogers <raof@ubuntu.com>
Date: Mon, 27 Jun 2022 10:45:12 +1000
Subject: stacktrace: Export stack_trace_save_tsk

The bcachefs module wants it, and there doesn't seem to be any
reason it shouldn't be exported like the other functions.

Signed-off-by: Christopher James Halse Rogers <raof@ubuntu.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 kernel/stacktrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9ed5ce989415..4f65824879ab 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
 	put_task_stack(tsk);
 	return c.len;
 }
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
 
 /**
  * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
 	save_stack_trace_tsk(task, &trace);
 	return trace.nr_entries;
 }
+EXPORT_SYMBOL_GPL(stack_trace_save_tsk);
 
 /**
  * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
-- 
cgit v1.2.3


From 1a49f4195d3498fe458a7f5ff7ec5385da70d92e Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 12 Sep 2023 03:55:37 +0300
Subject: bpf: Avoid dummy bpf_offload_netdev in __bpf_prog_dev_bound_init

Fix for a bug observable under the following sequence of events:
1. Create a network device that does not support XDP offload.
2. Load a device bound XDP program with BPF_F_XDP_DEV_BOUND_ONLY flag
   (such programs are not offloaded).
3. Load a device bound XDP program with zero flags
   (such programs are offloaded).

At step (2) __bpf_prog_dev_bound_init() associates with device (1)
a dummy bpf_offload_netdev struct with .offdev field set to NULL.
At step (3) __bpf_prog_dev_bound_init() would reuse dummy struct
allocated at step (2).
However, downstream usage of the bpf_offload_netdev assumes that
.offdev field can't be NULL, e.g. in bpf_prog_offload_verifier_prep().

Adjust __bpf_prog_dev_bound_init() to require bpf_offload_netdev
with non-NULL .offdev for offloaded BPF programs.

Fixes: 2b3486bc2d23 ("bpf: Introduce device-bound XDP programs")
Reported-by: syzbot+291100dcb32190ec02a8@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/bpf/000000000000d97f3c060479c4f8@google.com/
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20230912005539.2248244-2-eddyz87@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/offload.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 3e4f2ec1af06..87d6693d8233 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n
 	offload->netdev = netdev;
 
 	ondev = bpf_offload_find_netdev(offload->netdev);
+	/* When program is offloaded require presence of "true"
+	 * bpf_offload_netdev, avoid the one created for !ondev case below.
+	 */
+	if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
+		err = -EINVAL;
+		goto err_free;
+	}
 	if (!ondev) {
-		if (bpf_prog_is_offloaded(prog->aux)) {
-			err = -EINVAL;
-			goto err_free;
-		}
-
 		/* When only binding to the device, explicitly
 		 * create an entry in the hashtable.
 		 */
-- 
cgit v1.2.3


From 40d84e198b0ae64df71ac0e70675b16900b90bde Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 6 Sep 2023 12:18:41 +0800
Subject: PM: hibernate: Rename function parameter from snapshot_test to
 exclusive

Several functions reply on snapshot_test to decide whether to
open the resume device exclusively. However there is no strict
connection between the snapshot_test and the open mode. Rename
the 'snapshot_test' input parameter to 'exclusive' to better reflect
the use case.

No functional change is expected.

Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/power.h |  4 ++--
 kernel/power/swap.c  | 14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46eb14dc50c3..a98f95e309a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -168,11 +168,11 @@ extern int swsusp_swap_in_use(void);
 #define SF_HW_SIG		8
 
 /* kernel/power/hibernate.c */
-int swsusp_check(bool snapshot_test);
+int swsusp_check(bool exclusive);
 extern void swsusp_free(void);
 extern int swsusp_read(unsigned int *flags_p);
 extern int swsusp_write(unsigned int flags);
-void swsusp_close(bool snapshot_test);
+void swsusp_close(bool exclusive);
 #ifdef CONFIG_SUSPEND
 extern int swsusp_unmark(void);
 #endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f6ebcd00c410..74edbce2320b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1513,12 +1513,13 @@ end:
 static void *swsusp_holder;
 
 /**
- *      swsusp_check - Check for swsusp signature in the resume device
+ * swsusp_check - Check for swsusp signature in the resume device
+ * @exclusive: Open the resume device exclusively.
  */
 
-int swsusp_check(bool snapshot_test)
+int swsusp_check(bool exclusive)
 {
-	void *holder = snapshot_test ? &swsusp_holder : NULL;
+	void *holder = exclusive ? &swsusp_holder : NULL;
 	int error;
 
 	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
@@ -1563,17 +1564,18 @@ put:
 }
 
 /**
- *	swsusp_close - close swap device.
+ * swsusp_close - close swap device.
+ * @exclusive: Close the resume device which is exclusively opened.
  */
 
-void swsusp_close(bool snapshot_test)
+void swsusp_close(bool exclusive)
 {
 	if (IS_ERR(hib_resume_bdev)) {
 		pr_debug("Image device not initialised\n");
 		return;
 	}
 
-	blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL);
+	blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL);
 }
 
 /**
-- 
cgit v1.2.3


From 148b6f4cc3920e563094540fe1a12d00d3bbccae Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Wed, 6 Sep 2023 12:18:52 +0800
Subject: PM: hibernate: Fix the exclusive get block device in test_resume mode

Commit 5904de0d735b ("PM: hibernate: Do not get block device exclusively
in test_resume mode") fixes a hibernation issue under test_resume mode.
That commit is supposed to open the block device in non-exclusive mode
when in test_resume. However the code does the opposite, which is against
its description.

In summary, the swap device is only opened exclusively by swsusp_check()
with its corresponding *close(), and must be in non test_resume mode.
This is to avoid the race condition that different processes scribble the
device at the same time. All the other cases should use non-exclusive mode.

Fix it by really disabling exclusive mode under test_resume.

Fixes: 5904de0d735b ("PM: hibernate: Do not get block device exclusively in test_resume mode")
Closes: https://lore.kernel.org/lkml/000000000000761f5f0603324129@google.com/
Reported-by: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Chenzhou Feng <chenzhoux.feng@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 2b4a946a6ff5..8d35b9f9aaa3 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -786,9 +786,9 @@ int hibernate(void)
 	unlock_device_hotplug();
 	if (snapshot_test) {
 		pm_pr_dbg("Checking hibernation image\n");
-		error = swsusp_check(snapshot_test);
+		error = swsusp_check(false);
 		if (!error)
-			error = load_image_and_restore(snapshot_test);
+			error = load_image_and_restore(false);
 	}
 	thaw_processes();
 
@@ -945,14 +945,14 @@ static int software_resume(void)
 	pm_pr_dbg("Looking for hibernation image.\n");
 
 	mutex_lock(&system_transition_mutex);
-	error = swsusp_check(false);
+	error = swsusp_check(true);
 	if (error)
 		goto Unlock;
 
 	/* The snapshot device should not be opened while we're running */
 	if (!hibernate_acquire()) {
 		error = -EBUSY;
-		swsusp_close(false);
+		swsusp_close(true);
 		goto Unlock;
 	}
 
@@ -973,7 +973,7 @@ static int software_resume(void)
 		goto Close_Finish;
 	}
 
-	error = load_image_and_restore(false);
+	error = load_image_and_restore(true);
 	thaw_processes();
  Finish:
 	pm_notifier_call_chain(PM_POST_RESTORE);
@@ -987,7 +987,7 @@ static int software_resume(void)
 	pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
 	return error;
  Close_Finish:
-	swsusp_close(false);
+	swsusp_close(true);
 	goto Finish;
 }
 
-- 
cgit v1.2.3


From c8414dab164a74bd3bb859a2d836cb537d6b9298 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie@huawei.com>
Date: Tue, 12 Sep 2023 21:47:52 +0800
Subject: eventfs: Fix the NULL pointer dereference bug in eventfs_remove_rec()

Inject fault while probing btrfs.ko, if kstrdup() fails in
eventfs_prepare_ef() in eventfs_add_dir(), it will return ERR_PTR
to assign file->ef. But the eventfs_remove() check NULL in
trace_module_remove_events(), which causes the below NULL
pointer dereference.

As both Masami and Steven suggest, allocater side should handle the
error carefully and remove it, so fix the places where it failed.

 Could not create tracefs 'raid56_write' directory
 Btrfs loaded, zoned=no, fsverity=no
 Unable to handle kernel NULL pointer dereference at virtual address 000000000000001c
 Mem abort info:
   ESR = 0x0000000096000004
   EC = 0x25: DABT (current EL), IL = 32 bits
   SET = 0, FnV = 0
   EA = 0, S1PTW = 0
   FSC = 0x04: level 0 translation fault
 Data abort info:
   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
 user pgtable: 4k pages, 48-bit VAs, pgdp=0000000102544000
 [000000000000001c] pgd=0000000000000000, p4d=0000000000000000
 Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP
 Dumping ftrace buffer:
    (ftrace buffer empty)
 Modules linked in: btrfs(-) libcrc32c xor xor_neon raid6_pq cfg80211 rfkill 8021q garp mrp stp llc ipv6 [last unloaded: btrfs]
 CPU: 15 PID: 1343 Comm: rmmod Tainted: G                 N 6.5.0+ #40
 Hardware name: linux,dummy-virt (DT)
 pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
 pc : eventfs_remove_rec+0x24/0xc0
 lr : eventfs_remove+0x68/0x1d8
 sp : ffff800082d63b60
 x29: ffff800082d63b60 x28: ffffb84b80ddd00c x27: ffffb84b3054ba40
 x26: 0000000000000002 x25: ffff800082d63bf8 x24: ffffb84b8398e440
 x23: ffffb84b82af3000 x22: dead000000000100 x21: dead000000000122
 x20: ffff800082d63bf8 x19: fffffffffffffff4 x18: ffffb84b82508820
 x17: 0000000000000000 x16: 0000000000000000 x15: 000083bc876a3166
 x14: 000000000000006d x13: 000000000000006d x12: 0000000000000000
 x11: 0000000000000001 x10: 00000000000017e0 x9 : 0000000000000001
 x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffffb84b84289804
 x5 : 0000000000000000 x4 : 9696969696969697 x3 : ffff33a5b7601f38
 x2 : 0000000000000000 x1 : ffff800082d63bf8 x0 : fffffffffffffff4
 Call trace:
  eventfs_remove_rec+0x24/0xc0
  eventfs_remove+0x68/0x1d8
  remove_event_file_dir+0x88/0x100
  event_remove+0x140/0x15c
  trace_module_notify+0x1fc/0x230
  notifier_call_chain+0x98/0x17c
  blocking_notifier_call_chain+0x4c/0x74
  __arm64_sys_delete_module+0x1a4/0x298
  invoke_syscall+0x44/0x100
  el0_svc_common.constprop.1+0x68/0xe0
  do_el0_svc+0x1c/0x28
  el0_svc+0x3c/0xc4
  el0t_64_sync_handler+0xa0/0xc4
  el0t_64_sync+0x174/0x178
 Code: 5400052c a90153b3 aa0003f3 aa0103f4 (f9401400)
 ---[ end trace 0000000000000000 ]---
 Kernel panic - not syncing: Oops: Fatal exception
 SMP: stopping secondary CPUs
 Dumping ftrace buffer:
    (ftrace buffer empty)
 Kernel Offset: 0x384b00c00000 from 0xffff800080000000
 PHYS_OFFSET: 0xffffcc5b80000000
 CPU features: 0x88000203,3c020000,1000421b
 Memory Limit: none
 Rebooting in 1 seconds..

Link: https://lore.kernel.org/linux-trace-kernel/20230912134752.1838524-1-ruanjinjie@huawei.com
Link: https://lore.kernel.org/all/20230912025808.668187-1-ruanjinjie@huawei.com/
Link: https://lore.kernel.org/all/20230911052818.1020547-1-ruanjinjie@huawei.com/
Link: https://lore.kernel.org/all/20230909072817.182846-1-ruanjinjie@huawei.com/
Link: https://lore.kernel.org/all/20230908074816.3724716-1-ruanjinjie@huawei.com/

Cc: Ajay Kaher <akaher@vmware.com>
Fixes: 5bdcd5f5331a ("eventfs: Implement removal of meta data from eventfs")
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 065c63991858..91951d038ba4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2286,6 +2286,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 {
 	struct event_subsystem *system, *iter;
 	struct trace_subsystem_dir *dir;
+	struct eventfs_file *ef;
 	int res;
 
 	/* First see if we did not already create this dir */
@@ -2318,13 +2319,14 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	} else
 		__get_system(system);
 
-	dir->ef = eventfs_add_subsystem_dir(name, parent);
-	if (IS_ERR(dir->ef)) {
+	ef = eventfs_add_subsystem_dir(name, parent);
+	if (IS_ERR(ef)) {
 		pr_warn("Failed to create system directory %s\n", name);
 		__put_system(system);
 		goto out_free;
 	}
 
+	dir->ef = ef;
 	dir->tr = tr;
 	dir->ref_count = 1;
 	dir->nr_events = 1;
@@ -2404,6 +2406,7 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	struct trace_event_call *call = file->event_call;
 	struct eventfs_file *ef_subsystem = NULL;
 	struct trace_array *tr = file->tr;
+	struct eventfs_file *ef;
 	const char *name;
 	int ret;
 
@@ -2420,12 +2423,14 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 		return -ENOMEM;
 
 	name = trace_event_name(call);
-	file->ef = eventfs_add_dir(name, ef_subsystem);
-	if (IS_ERR(file->ef)) {
+	ef = eventfs_add_dir(name, ef_subsystem);
+	if (IS_ERR(ef)) {
 		pr_warn("Could not create tracefs '%s' directory\n", name);
 		return -1;
 	}
 
+	file->ef = ef;
+
 	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
 		eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
 				  &ftrace_enable_fops);
-- 
cgit v1.2.3


From 54aee5f15b83437f23b2b2469bcf21bdd9823916 Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Thu, 7 Sep 2023 08:43:07 +0800
Subject: perf/core: Bail out early if the request AUX area is out of bound

When perf-record with a large AUX area, e.g 4GB, it fails with:

    #perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
    failed to mmap with 12 (Cannot allocate memory)

and it reveals a WARNING with __alloc_pages():

	------------[ cut here ]------------
	WARNING: CPU: 44 PID: 17573 at mm/page_alloc.c:5568 __alloc_pages+0x1ec/0x248
	Call trace:
	 __alloc_pages+0x1ec/0x248
	 __kmalloc_large_node+0xc0/0x1f8
	 __kmalloc_node+0x134/0x1e8
	 rb_alloc_aux+0xe0/0x298
	 perf_mmap+0x440/0x660
	 mmap_region+0x308/0x8a8
	 do_mmap+0x3c0/0x528
	 vm_mmap_pgoff+0xf4/0x1b8
	 ksys_mmap_pgoff+0x18c/0x218
	 __arm64_sys_mmap+0x38/0x58
	 invoke_syscall+0x50/0x128
	 el0_svc_common.constprop.0+0x58/0x188
	 do_el0_svc+0x34/0x50
	 el0_svc+0x34/0x108
	 el0t_64_sync_handler+0xb8/0xc0
	 el0t_64_sync+0x1a4/0x1a8

'rb->aux_pages' allocated by kcalloc() is a pointer array which is used to
maintains AUX trace pages. The allocated page for this array is physically
contiguous (and virtually contiguous) with an order of 0..MAX_ORDER. If the
size of pointer array crosses the limitation set by MAX_ORDER, it reveals a
WARNING.

So bail out early with -ENOMEM if the request AUX area is out of bound,
e.g.:

    #perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
    failed to mmap with 12 (Cannot allocate memory)

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb1e180b5f0a..e8d82c2f07d0 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 		watermark = 0;
 	}
 
+	/*
+	 * kcalloc_node() is unable to allocate buffer if the size is larger
+	 * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+	 */
+	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+		return -ENOMEM;
 	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
 				     node);
 	if (!rb->aux_pages)
-- 
cgit v1.2.3


From 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 Mon Sep 17 00:00:00 2001
From: Leon Hwang <hffilwlqm@gmail.com>
Date: Tue, 12 Sep 2023 23:04:41 +0800
Subject: bpf, x64: Fix tailcall infinite loop

From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall
handling in JIT"), the tailcall on x64 works better than before.

From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms
for x64 JIT"), tailcall is able to run in BPF subprograms on x64.

From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program
to other BPF programs"), BPF program is able to trace other BPF programs.

How about combining them all together?

1. FENTRY/FEXIT on a BPF subprogram.
2. A tailcall runs in the BPF subprogram.
3. The tailcall calls the subprogram's caller.

As a result, a tailcall infinite loop comes up. And the loop would halt
the machine.

As we know, in tail call context, the tail_call_cnt propagates by stack
and rax register between BPF subprograms. So do in trampolines.

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------
 include/linux/bpf.h         |  5 +++++
 kernel/bpf/trampoline.c     |  4 ++--
 kernel/bpf/verifier.c       |  3 +++
 4 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bcca1c9b9a02..2846c21d75bf 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1022,6 +1022,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
+/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
+#define RESTORE_TAIL_CALL_CNT(stack)				\
+	EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1627,9 +1631,7 @@ st:			if (is_imm8(insn->off))
 
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
-				/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
-				EMIT3_off32(0x48, 0x8B, 0x85,
-					    -round_up(bpf_prog->aux->stack_depth, 8) - 8);
+				RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
 				if (!imm32)
 					return -EINVAL;
 				offs = 7 + x86_call_depth_emit_accounting(&prog, func);
@@ -2404,6 +2406,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	 *                     [ ...        ]
 	 *                     [ stack_arg2 ]
 	 * RBP - arg_stack_off [ stack_arg1 ]
+	 * RSP                 [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
 	 */
 
 	/* room for return value of orig_call or fentry prog */
@@ -2468,6 +2471,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	else
 		/* sub rsp, stack_size */
 		EMIT4(0x48, 0x83, 0xEC, stack_size);
+	if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		EMIT1(0x50);		/* push rax */
 	/* mov QWORD PTR [rbp - rbx_off], rbx */
 	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
 
@@ -2520,9 +2525,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 		restore_regs(m, &prog, regs_off);
 		save_args(m, &prog, arg_stack_off, true);
 
+		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+			/* Before calling the original function, restore the
+			 * tail_call_cnt from stack to rax.
+			 */
+			RESTORE_TAIL_CALL_CNT(stack_size);
+
 		if (flags & BPF_TRAMP_F_ORIG_STACK) {
-			emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
-			EMIT2(0xff, 0xd0); /* call *rax */
+			emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
+			EMIT2(0xff, 0xd3); /* call *rbx */
 		} else {
 			/* call original function */
 			if (emit_rsb_call(&prog, orig_call, prog)) {
@@ -2573,7 +2584,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 			ret = -EINVAL;
 			goto cleanup;
 		}
-	}
+	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		/* Before running the original function, restore the
+		 * tail_call_cnt from stack to rax.
+		 */
+		RESTORE_TAIL_CALL_CNT(stack_size);
+
 	/* restore return value of orig_call or fentry prog back into RAX */
 	if (save_ret)
 		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 87eeb3a46a1d..b9e573159432 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1035,6 +1035,11 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_SHARE_IPMODIFY	BIT(6)
 
+/* Indicate that current trampoline is in a tail call context. Then, it has to
+ * cache and restore tail_call_cnt to avoid infinite tail call loop.
+ */
+#define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 53ff50cac61e..e97aeda3a86b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -415,8 +415,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		goto out;
 	}
 
-	/* clear all bits except SHARE_IPMODIFY */
-	tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+	/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
+	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
 
 	if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
 	    tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dbba2b806017..18e673c0ac15 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19774,6 +19774,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	if (!tr)
 		return -ENOMEM;
 
+	if (tgt_prog && tgt_prog->aux->tail_call_reachable)
+		tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
+
 	prog->aux->dst_trampoline = tr;
 	return 0;
 }
-- 
cgit v1.2.3


From 97f576eb38ae219a6fb337d13586d8acbf01d3f8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 17 Aug 2023 13:35:02 -0700
Subject: audit: Annotate struct audit_chunk with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct audit_chunk.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Paul Moore <paul@paul-moore.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: audit@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e867c17d3f84..85a5b306733b 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -34,7 +34,7 @@ struct audit_chunk {
 		struct list_head list;
 		struct audit_tree *owner;
 		unsigned index;		/* index; upper bit indicates 'will prune' */
-	} owners[];
+	} owners[] __counted_by(count);
 };
 
 struct audit_tree_mark {
-- 
cgit v1.2.3


From a8f12572860ad8ba659d96eee9cf09e181f6ebcc Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 8 Sep 2023 18:33:35 +0200
Subject: bpf: Fix a erroneous check after snprintf()

snprintf() does not return negative error code on error, it returns the
number of characters which *would* be generated for the given input.

Fix the error handling check.

Fixes: 57539b1c0ac2 ("bpf: Enable annotating trusted nested pointers")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/393bdebc87b22563c08ace094defa7160eb7a6c0.1694190795.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1095bbe29859..8090d7fb11ef 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8501,7 +8501,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 	tname = btf_name_by_offset(btf, walk_type->name_off);
 
 	ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
-	if (ret < 0)
+	if (ret >= sizeof(safe_tname))
 		return false;
 
 	safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
-- 
cgit v1.2.3


From 214bfd267f4929722b374b43fda456c21cd6f016 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 11 Sep 2023 23:08:12 -0700
Subject: bpf, cgroup: fix multiple kernel-doc warnings

Fix missing or extra function parameter kernel-doc warnings
in cgroup.c:

kernel/bpf/cgroup.c:1359: warning: Excess function parameter 'type' description in '__cgroup_bpf_run_filter_skb'
kernel/bpf/cgroup.c:1359: warning: Function parameter or member 'atype' not described in '__cgroup_bpf_run_filter_skb'
kernel/bpf/cgroup.c:1439: warning: Excess function parameter 'type' description in '__cgroup_bpf_run_filter_sk'
kernel/bpf/cgroup.c:1439: warning: Function parameter or member 'atype' not described in '__cgroup_bpf_run_filter_sk'
kernel/bpf/cgroup.c:1467: warning: Excess function parameter 'type' description in '__cgroup_bpf_run_filter_sock_addr'
kernel/bpf/cgroup.c:1467: warning: Function parameter or member 'atype' not described in '__cgroup_bpf_run_filter_sock_addr'
kernel/bpf/cgroup.c:1512: warning: Excess function parameter 'type' description in '__cgroup_bpf_run_filter_sock_ops'
kernel/bpf/cgroup.c:1512: warning: Function parameter or member 'atype' not described in '__cgroup_bpf_run_filter_sock_ops'
kernel/bpf/cgroup.c:1685: warning: Excess function parameter 'type' description in '__cgroup_bpf_run_filter_sysctl'
kernel/bpf/cgroup.c:1685: warning: Function parameter or member 'atype' not described in '__cgroup_bpf_run_filter_sysctl'
kernel/bpf/cgroup.c:795: warning: Excess function parameter 'type' description in '__cgroup_bpf_replace'
kernel/bpf/cgroup.c:795: warning: Function parameter or member 'new_prog' not described in '__cgroup_bpf_replace'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: bpf@vger.kernel.org
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230912060812.1715-1-rdunlap@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..03b3d4492980 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -785,7 +785,8 @@ found:
  *                          to descendants
  * @cgrp: The cgroup which descendants to traverse
  * @link: A link for which to replace BPF program
- * @type: Type of attach operation
+ * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
+ *            incremented
  *
  * Must be called with cgroup_mutex held.
  */
@@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
  * @skb: The skb that is being sent or received
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
  *
  * If no socket is passed, or the socket is not of type INET or INET6,
  * this function does nothing and returns 0.
@@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 /**
  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
  * @sk: sock structure to manipulate
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
  *
  * socket is passed is expected to be of type INET or INET6.
  *
@@ -1449,7 +1450,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  *                                       provided by user sockaddr
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
  * @t_ctx: Pointer to attach type specific context
  * @flags: Pointer to u32 which contains higher bits of BPF program
  *         return value (OR'ed together).
@@ -1496,7 +1497,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
  * sk with connection information (IP addresses, etc.) May not contain
  * cgroup info if it is a req sock.
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
  *
  * socket passed is expected to be of type INET or INET6.
  *
@@ -1670,7 +1671,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
  * @ppos: value-result argument: value is position at which read from or write
  *	to sysctl is happening, result is new position if program overrode it,
  *	initial value otherwise
- * @type: type of program to be executed
+ * @atype: type of program to be executed
  *
  * Program is run when sysctl is being accessed, either read or written, and
  * can allow or deny such access.
-- 
cgit v1.2.3


From a6a241764f69c62d23fc6960920cc662ae4069e9 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Mon, 11 Sep 2023 11:32:51 +0100
Subject: swiotlb: use the calculated number of areas

Commit 8ac04063354a ("swiotlb: reduce the number of areas to match
actual memory pool size") calculated the reduced number of areas in
swiotlb_init_remap() but didn't actually use the value. Replace usage of
default_nareas accordingly.

Fixes: 8ac04063354a ("swiotlb: reduce the number of areas to match actual memory pool size")
Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 394494a6b1f3..85dd94323b98 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -399,14 +399,13 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
 	}
 
 	mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area),
-		default_nareas), SMP_CACHE_BYTES);
+		nareas), SMP_CACHE_BYTES);
 	if (!mem->areas) {
 		pr_warn("%s: Failed to allocate mem->areas.\n", __func__);
 		return;
 	}
 
-	swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false,
-				 default_nareas);
+	swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas);
 	add_mem_pool(&io_tlb_default_mem, mem);
 
 	if (flags & SWIOTLB_VERBOSE)
-- 
cgit v1.2.3


From d090ec0df81e56556af3a2bf04a7e89347ae5784 Mon Sep 17 00:00:00 2001
From: Leonardo Bras <leobras@redhat.com>
Date: Thu, 31 Aug 2023 03:31:28 -0300
Subject: smp: Change function signatures to use call_single_data_t

call_single_data_t is a size-aligned typedef of struct __call_single_data.

This alignment is desirable in order to have smp_call_function*() avoid
bouncing an extra cacheline in case of an unaligned csd, given this
would hurt performance.

Since the removal of struct request->csd in commit 660e802c76c8
("blk-mq: use percpu csd to remote complete instead of per-rq csd") there
are no current users of smp_call_function*() with unaligned csd.

Change every 'struct __call_single_data' function parameter to
'call_single_data_t', so we have warnings if any new code tries to
introduce an smp_call_function*() call with unaligned csd.

Signed-off-by: Leonardo Bras <leobras@redhat.com>
Reviewed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230831063129.335425-1-leobras@redhat.com
---
 include/linux/smp.h        |  2 +-
 include/trace/events/csd.h |  8 ++++----
 kernel/smp.c               | 26 +++++++++++++-------------
 kernel/up.c                |  2 +-
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 91ea4a67f8ca..e87520dc2959 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd);
+int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 /*
  * Cpus stopping functions in panic. All have default weak definitions.
diff --git a/include/trace/events/csd.h b/include/trace/events/csd.h
index 67e9d01f80c2..58cc83b99c34 100644
--- a/include/trace/events/csd.h
+++ b/include/trace/events/csd.h
@@ -12,7 +12,7 @@ TRACE_EVENT(csd_queue_cpu,
 	TP_PROTO(const unsigned int cpu,
 		unsigned long callsite,
 		smp_call_func_t func,
-		struct __call_single_data *csd),
+		call_single_data_t *csd),
 
 	TP_ARGS(cpu, callsite, func, csd),
 
@@ -39,7 +39,7 @@ TRACE_EVENT(csd_queue_cpu,
  */
 DECLARE_EVENT_CLASS(csd_function,
 
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 
 	TP_ARGS(func, csd),
 
@@ -57,12 +57,12 @@ DECLARE_EVENT_CLASS(csd_function,
 );
 
 DEFINE_EVENT(csd_function, csd_function_entry,
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 	TP_ARGS(func, csd)
 );
 
 DEFINE_EVENT(csd_function, csd_function_exit,
-	TP_PROTO(smp_call_func_t func, struct __call_single_data *csd),
+	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
 	TP_ARGS(func, csd)
 );
 
diff --git a/kernel/smp.c b/kernel/smp.c
index 385179dae360..822fabb7e3e1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -125,7 +125,7 @@ send_call_function_ipi_mask(struct cpumask *mask)
 }
 
 static __always_inline void
-csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
+csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
 {
 	trace_csd_function_entry(func, csd);
 	func(info);
@@ -172,7 +172,7 @@ module_param(csd_lock_timeout, ulong, 0444);
 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 
 /* Record current CSD work for current CPU, NULL to erase. */
-static void __csd_lock_record(struct __call_single_data *csd)
+static void __csd_lock_record(call_single_data_t *csd)
 {
 	if (!csd) {
 		smp_mb(); /* NULL cur_csd after unlock. */
@@ -187,13 +187,13 @@ static void __csd_lock_record(struct __call_single_data *csd)
 		  /* Or before unlock, as the case may be. */
 }
 
-static __always_inline void csd_lock_record(struct __call_single_data *csd)
+static __always_inline void csd_lock_record(call_single_data_t *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled))
 		__csd_lock_record(csd);
 }
 
-static int csd_lock_wait_getcpu(struct __call_single_data *csd)
+static int csd_lock_wait_getcpu(call_single_data_t *csd)
 {
 	unsigned int csd_type;
 
@@ -208,7 +208,7 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd)
  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
  * so waiting on other types gets much less information.
  */
-static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
+static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
 {
 	int cpu = -1;
 	int cpux;
@@ -272,7 +272,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
  * previous function call. For multi-cpu calls its even more interesting
  * as we'll have to ensure no other cpu is observing our csd.
  */
-static void __csd_lock_wait(struct __call_single_data *csd)
+static void __csd_lock_wait(call_single_data_t *csd)
 {
 	int bug_id = 0;
 	u64 ts0, ts1;
@@ -286,7 +286,7 @@ static void __csd_lock_wait(struct __call_single_data *csd)
 	smp_acquire__after_ctrl_dep();
 }
 
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	if (static_branch_unlikely(&csdlock_debug_enabled)) {
 		__csd_lock_wait(csd);
@@ -296,17 +296,17 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #else
-static void csd_lock_record(struct __call_single_data *csd)
+static void csd_lock_record(call_single_data_t *csd)
 {
 }
 
-static __always_inline void csd_lock_wait(struct __call_single_data *csd)
+static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
 	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 #endif
 
-static __always_inline void csd_lock(struct __call_single_data *csd)
+static __always_inline void csd_lock(call_single_data_t *csd)
 {
 	csd_lock_wait(csd);
 	csd->node.u_flags |= CSD_FLAG_LOCK;
@@ -319,7 +319,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd)
 	smp_wmb();
 }
 
-static __always_inline void csd_unlock(struct __call_single_data *csd)
+static __always_inline void csd_unlock(call_single_data_t *csd)
 {
 	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 
@@ -372,7 +372,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
  * for execution on the given CPU. data must already have
  * ->func, ->info, and ->flags set.
  */
-static int generic_exec_single(int cpu, struct __call_single_data *csd)
+static int generic_exec_single(int cpu, call_single_data_t *csd)
 {
 	if (cpu == smp_processor_id()) {
 		smp_call_func_t func = csd->func;
@@ -658,7 +658,7 @@ EXPORT_SYMBOL(smp_call_function_single);
  *
  * Return: %0 on success or negative errno value on error
  */
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	int err = 0;
 
diff --git a/kernel/up.c b/kernel/up.c
index a38b8b095251..df50828cc2f0 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
+int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
 	unsigned long flags;
 
-- 
cgit v1.2.3


From 94b548a15e8ec47dfbf6925bdfb64bb5657dce0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 20:52:55 +0200
Subject: sched: Simplify set_user_nice()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 13 ++++++-------
 kernel/sched/sched.h |  5 +++++
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2299a5cfbfb9..fa57a560c52a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7187,9 +7187,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 void set_user_nice(struct task_struct *p, long nice)
 {
 	bool queued, running;
-	int old_prio;
-	struct rq_flags rf;
 	struct rq *rq;
+	int old_prio;
 
 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
 		return;
@@ -7197,7 +7196,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * We have to be careful, if called from sys_setpriority(),
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
-	rq = task_rq_lock(p, &rf);
+	CLASS(task_rq_lock, rq_guard)(p);
+	rq = rq_guard.rq;
+
 	update_rq_clock(rq);
 
 	/*
@@ -7208,8 +7209,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 */
 	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
-		goto out_unlock;
+		return;
 	}
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7232,9 +7234,6 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * lowered its priority, then reschedule its CPU:
 	 */
 	p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
-	task_rq_unlock(rq, p, &rf);
 }
 EXPORT_SYMBOL(set_user_nice);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..68768f47ccb7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1658,6 +1658,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+		    _T->rq = task_rq_lock(_T->lock, &_T->rf),
+		    task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+		    struct rq *rq; struct rq_flags rf)
+
 static inline void
 rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
 	__acquires(rq->lock)
-- 
cgit v1.2.3


From febe162d4d9158cf2b5d48fdd440db7bb55dd622 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 16:54:54 +0200
Subject: sched: Simplify syscalls

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 154 +++++++++++++++++++++++-----------------------------
 1 file changed, 68 insertions(+), 86 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa57a560c52a..67c32c43a94b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7506,6 +7506,21 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
+static struct task_struct *find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+	guard(rcu)();
+
+	p = find_process_by_pid(pid);
+	if (likely(p))
+		get_task_struct(p);
+
+	return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+	     find_get_task(pid), pid_t pid)
+
 /*
  * sched_setparam() passes in -1 for its policy, to let the functions
  * it calls know not to change it.
@@ -7543,14 +7558,11 @@ static void __setscheduler_params(struct task_struct *p,
 static bool check_same_owner(struct task_struct *p)
 {
 	const struct cred *cred = current_cred(), *pcred;
-	bool match;
+	guard(rcu)();
 
-	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (uid_eq(cred->euid, pcred->euid) ||
-		 uid_eq(cred->euid, pcred->uid));
-	rcu_read_unlock();
-	return match;
+	return (uid_eq(cred->euid, pcred->euid) ||
+		uid_eq(cred->euid, pcred->uid));
 }
 
 /*
@@ -7962,27 +7974,17 @@ static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
 	struct sched_param lparam;
-	struct task_struct *p;
-	int retval;
 
 	if (!param || pid < 0)
 		return -EINVAL;
 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
 		return -EFAULT;
 
-	rcu_read_lock();
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (likely(p))
-		get_task_struct(p);
-	rcu_read_unlock();
-
-	if (likely(p)) {
-		retval = sched_setscheduler(p, policy, &lparam);
-		put_task_struct(p);
-	}
+	CLASS(find_get_task, p)(pid);
+	if (!p)
+		return -ESRCH;
 
-	return retval;
+	return sched_setscheduler(p, policy, &lparam);
 }
 
 /*
@@ -8078,7 +8080,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 			       unsigned int, flags)
 {
 	struct sched_attr attr;
-	struct task_struct *p;
 	int retval;
 
 	if (!uattr || pid < 0 || flags)
@@ -8093,21 +8094,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
 		attr.sched_policy = SETPARAM_POLICY;
 
-	rcu_read_lock();
-	retval = -ESRCH;
-	p = find_process_by_pid(pid);
-	if (likely(p))
-		get_task_struct(p);
-	rcu_read_unlock();
+	CLASS(find_get_task, p)(pid);
+	if (!p)
+		return -ESRCH;
 
-	if (likely(p)) {
-		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
-			get_params(p, &attr);
-		retval = sched_setattr(p, &attr);
-		put_task_struct(p);
-	}
+	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+		get_params(p, &attr);
 
-	return retval;
+	return sched_setattr(p, &attr);
 }
 
 /**
@@ -8125,16 +8119,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 	if (pid < 0)
 		return -EINVAL;
 
-	retval = -ESRCH;
-	rcu_read_lock();
+	guard(rcu)();
 	p = find_process_by_pid(pid);
-	if (p) {
-		retval = security_task_getscheduler(p);
-		if (!retval)
-			retval = p->policy
-				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+	if (!p)
+		return -ESRCH;
+
+	retval = security_task_getscheduler(p);
+	if (!retval) {
+		retval = p->policy;
+		if (p->sched_reset_on_fork)
+			retval |= SCHED_RESET_ON_FORK;
 	}
-	rcu_read_unlock();
 	return retval;
 }
 
@@ -8155,30 +8150,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (!param || pid < 0)
 		return -EINVAL;
 
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
+	scoped_guard (rcu) {
+		p = find_process_by_pid(pid);
+		if (!p)
+			return -ESRCH;
 
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
+		retval = security_task_getscheduler(p);
+		if (retval)
+			return retval;
 
-	if (task_has_rt_policy(p))
-		lp.sched_priority = p->rt_priority;
-	rcu_read_unlock();
+		if (task_has_rt_policy(p))
+			lp.sched_priority = p->rt_priority;
+	}
 
 	/*
 	 * This one might sleep, we cannot do it with a spinlock held ...
 	 */
-	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-	return retval;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
+	return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
 }
 
 /*
@@ -8238,39 +8226,33 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	    usize < SCHED_ATTR_SIZE_VER0 || flags)
 		return -EINVAL;
 
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	retval = -ESRCH;
-	if (!p)
-		goto out_unlock;
+	scoped_guard (rcu) {
+		p = find_process_by_pid(pid);
+		if (!p)
+			return -ESRCH;
 
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
+		retval = security_task_getscheduler(p);
+		if (retval)
+			return retval;
 
-	kattr.sched_policy = p->policy;
-	if (p->sched_reset_on_fork)
-		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-	get_params(p, &kattr);
-	kattr.sched_flags &= SCHED_FLAG_ALL;
+		kattr.sched_policy = p->policy;
+		if (p->sched_reset_on_fork)
+			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+		get_params(p, &kattr);
+		kattr.sched_flags &= SCHED_FLAG_ALL;
 
 #ifdef CONFIG_UCLAMP_TASK
-	/*
-	 * This could race with another potential updater, but this is fine
-	 * because it'll correctly read the old or the new value. We don't need
-	 * to guarantee who wins the race as long as it doesn't return garbage.
-	 */
-	kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
-	kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+		/*
+		 * This could race with another potential updater, but this is fine
+		 * because it'll correctly read the old or the new value. We don't need
+		 * to guarantee who wins the race as long as it doesn't return garbage.
+		 */
+		kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+		kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 #endif
-
-	rcu_read_unlock();
+	}
 
 	return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 92c2ec5bc1081e6bbbe172bcfb1a566ad7b4f809 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 16:57:35 +0200
Subject: sched: Simplify sched_{set,get}affinity()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 53 ++++++++++++++---------------------------------------
 1 file changed, 14 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67c32c43a94b..1d5cbb305057 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8347,39 +8347,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	struct affinity_context ac;
 	struct cpumask *user_mask;
-	struct task_struct *p;
 	int retval;
 
-	rcu_read_lock();
-
-	p = find_process_by_pid(pid);
-	if (!p) {
-		rcu_read_unlock();
+	CLASS(find_get_task, p)(pid);
+	if (!p)
 		return -ESRCH;
-	}
-
-	/* Prevent p going away */
-	get_task_struct(p);
-	rcu_read_unlock();
 
-	if (p->flags & PF_NO_SETAFFINITY) {
-		retval = -EINVAL;
-		goto out_put_task;
-	}
+	if (p->flags & PF_NO_SETAFFINITY)
+		return -EINVAL;
 
 	if (!check_same_owner(p)) {
-		rcu_read_lock();
-		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
-			rcu_read_unlock();
-			retval = -EPERM;
-			goto out_put_task;
-		}
-		rcu_read_unlock();
+		guard(rcu)();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+			return -EPERM;
 	}
 
 	retval = security_task_setscheduler(p);
 	if (retval)
-		goto out_put_task;
+		return retval;
 
 	/*
 	 * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
@@ -8389,8 +8374,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (user_mask) {
 		cpumask_copy(user_mask, in_mask);
 	} else if (IS_ENABLED(CONFIG_SMP)) {
-		retval = -ENOMEM;
-		goto out_put_task;
+		return -ENOMEM;
 	}
 
 	ac = (struct affinity_context){
@@ -8402,8 +8386,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	retval = __sched_setaffinity(p, &ac);
 	kfree(ac.user_mask);
 
-out_put_task:
-	put_task_struct(p);
 	return retval;
 }
 
@@ -8445,28 +8427,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
-	unsigned long flags;
 	int retval;
 
-	rcu_read_lock();
-
-	retval = -ESRCH;
+	guard(rcu)();
 	p = find_process_by_pid(pid);
 	if (!p)
-		goto out_unlock;
+		return -ESRCH;
 
 	retval = security_task_getscheduler(p);
 	if (retval)
-		goto out_unlock;
+		return retval;
 
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	guard(raw_spinlock_irqsave)(&p->pi_lock);
 	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
-	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
-out_unlock:
-	rcu_read_unlock();
-
-	return retval;
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From 7a50f76674f8b6f4f30a1cec954179f10e20110c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 16:58:23 +0200
Subject: sched: Simplify yield_to()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 67 +++++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1d5cbb305057..6c8c40a54560 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8888,55 +8888,46 @@ int __sched yield_to(struct task_struct *p, bool preempt)
 {
 	struct task_struct *curr = current;
 	struct rq *rq, *p_rq;
-	unsigned long flags;
 	int yielded = 0;
 
-	local_irq_save(flags);
-	rq = this_rq();
+	scoped_guard (irqsave) {
+		rq = this_rq();
 
 again:
-	p_rq = task_rq(p);
-	/*
-	 * If we're the only runnable task on the rq and target rq also
-	 * has only one task, there's absolutely no point in yielding.
-	 */
-	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
-		yielded = -ESRCH;
-		goto out_irq;
-	}
+		p_rq = task_rq(p);
+		/*
+		 * If we're the only runnable task on the rq and target rq also
+		 * has only one task, there's absolutely no point in yielding.
+		 */
+		if (rq->nr_running == 1 && p_rq->nr_running == 1)
+			return -ESRCH;
 
-	double_rq_lock(rq, p_rq);
-	if (task_rq(p) != p_rq) {
-		double_rq_unlock(rq, p_rq);
-		goto again;
-	}
+		guard(double_rq_lock)(rq, p_rq);
+		if (task_rq(p) != p_rq)
+			goto again;
 
-	if (!curr->sched_class->yield_to_task)
-		goto out_unlock;
+		if (!curr->sched_class->yield_to_task)
+			return 0;
 
-	if (curr->sched_class != p->sched_class)
-		goto out_unlock;
+		if (curr->sched_class != p->sched_class)
+			return 0;
 
-	if (task_on_cpu(p_rq, p) || !task_is_running(p))
-		goto out_unlock;
+		if (task_on_cpu(p_rq, p) || !task_is_running(p))
+			return 0;
 
-	yielded = curr->sched_class->yield_to_task(rq, p);
-	if (yielded) {
-		schedstat_inc(rq->yld_count);
-		/*
-		 * Make p's CPU reschedule; pick_next_entity takes care of
-		 * fairness.
-		 */
-		if (preempt && rq != p_rq)
-			resched_curr(p_rq);
+		yielded = curr->sched_class->yield_to_task(rq, p);
+		if (yielded) {
+			schedstat_inc(rq->yld_count);
+			/*
+			 * Make p's CPU reschedule; pick_next_entity
+			 * takes care of fairness.
+			 */
+			if (preempt && rq != p_rq)
+				resched_curr(p_rq);
+		}
 	}
 
-out_unlock:
-	double_rq_unlock(rq, p_rq);
-out_irq:
-	local_irq_restore(flags);
-
-	if (yielded > 0)
+	if (yielded)
 		schedule();
 
 	return yielded;
-- 
cgit v1.2.3


From af7c5763f5e8bc1b3f827354a283ccaf6a8c8098 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 16:59:05 +0200
Subject: sched: Simplify sched_rr_get_interval()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6c8c40a54560..d298176367f7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9030,38 +9030,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 
 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
 {
-	struct task_struct *p;
-	unsigned int time_slice;
-	struct rq_flags rf;
-	struct rq *rq;
+	unsigned int time_slice = 0;
 	int retval;
 
 	if (pid < 0)
 		return -EINVAL;
 
-	retval = -ESRCH;
-	rcu_read_lock();
-	p = find_process_by_pid(pid);
-	if (!p)
-		goto out_unlock;
+	scoped_guard (rcu) {
+		struct task_struct *p = find_process_by_pid(pid);
+		if (!p)
+			return -ESRCH;
 
-	retval = security_task_getscheduler(p);
-	if (retval)
-		goto out_unlock;
+		retval = security_task_getscheduler(p);
+		if (retval)
+			return retval;
 
-	rq = task_rq_lock(p, &rf);
-	time_slice = 0;
-	if (p->sched_class->get_rr_interval)
-		time_slice = p->sched_class->get_rr_interval(rq, p);
-	task_rq_unlock(rq, p, &rf);
+		scoped_guard (task_rq_lock, p) {
+			struct rq *rq = scope.rq;
+			if (p->sched_class->get_rr_interval)
+				time_slice = p->sched_class->get_rr_interval(rq, p);
+		}
+	}
 
-	rcu_read_unlock();
 	jiffies_to_timespec64(time_slice, t);
 	return 0;
-
-out_unlock:
-	rcu_read_unlock();
-	return retval;
 }
 
 /**
-- 
cgit v1.2.3


From fa614b4feb5a246474ac71b45e520a8ddefc809c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 20:41:09 +0200
Subject: sched: Simplify sched_move_task()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d298176367f7..a3f4fb8a6841 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10437,17 +10437,18 @@ void sched_move_task(struct task_struct *tsk)
 	int queued, running, queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	struct task_group *group;
-	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(tsk, &rf);
+	CLASS(task_rq_lock, rq_guard)(tsk);
+	rq = rq_guard.rq;
+
 	/*
 	 * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
 	 * group changes.
 	 */
 	group = sched_get_task_group(tsk);
 	if (group == tsk->sched_task_group)
-		goto unlock;
+		return;
 
 	update_rq_clock(rq);
 
@@ -10472,9 +10473,6 @@ void sched_move_task(struct task_struct *tsk)
 		 */
 		resched_curr(rq);
 	}
-
-unlock:
-	task_rq_unlock(rq, tsk, &rf);
 }
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
-- 
cgit v1.2.3


From 6fb45460615358157a6d3c990e74f9c1395247e2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 20:45:16 +0200
Subject: sched: Simplify tg_set_cfs_bandwidth()

Use guards to reduce gotos and simplify control flow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpu.h |  2 ++
 kernel/sched/core.c | 38 +++++++++++++++++++-------------------
 2 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0abd60a7987b..f19f56501809 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -153,6 +153,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
 static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
 #endif	/* !CONFIG_HOTPLUG_CPU */
 
+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
 extern void thaw_secondary_cpus(void);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a3f4fb8a6841..5d9f36359461 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10802,11 +10802,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 * Prevent race between setting of cfs_rq->runtime_enabled and
 	 * unthrottle_offline_cfs_rqs().
 	 */
-	cpus_read_lock();
-	mutex_lock(&cfs_constraints_mutex);
+	guard(cpus_read_lock)();
+	guard(mutex)(&cfs_constraints_mutex);
+
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -10816,39 +10817,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	 */
 	if (runtime_enabled && !runtime_was_enabled)
 		cfs_bandwidth_usage_inc();
-	raw_spin_lock_irq(&cfs_b->lock);
-	cfs_b->period = ns_to_ktime(period);
-	cfs_b->quota = quota;
-	cfs_b->burst = burst;
 
-	__refill_cfs_bandwidth_runtime(cfs_b);
+	scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+		cfs_b->period = ns_to_ktime(period);
+		cfs_b->quota = quota;
+		cfs_b->burst = burst;
 
-	/* Restart the period timer (if active) to handle new period expiry: */
-	if (runtime_enabled)
-		start_cfs_bandwidth(cfs_b);
+		__refill_cfs_bandwidth_runtime(cfs_b);
 
-	raw_spin_unlock_irq(&cfs_b->lock);
+		/*
+		 * Restart the period timer (if active) to handle new
+		 * period expiry:
+		 */
+		if (runtime_enabled)
+			start_cfs_bandwidth(cfs_b);
+	}
 
 	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
-		struct rq_flags rf;
 
-		rq_lock_irq(rq, &rf);
+		guard(rq_lock_irq)(rq);
 		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
-		rq_unlock_irq(rq, &rf);
 	}
+
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
-out_unlock:
-	mutex_unlock(&cfs_constraints_mutex);
-	cpus_read_unlock();
 
-	return ret;
+	return 0;
 }
 
 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
-- 
cgit v1.2.3


From 0e34600ac9317dbe5f0a7bfaa3d7187d757572ed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 20:52:49 +0200
Subject: sched: Misc cleanups

Random remaining guard use...

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 167 ++++++++++++++++++++--------------------------------
 1 file changed, 63 insertions(+), 104 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5d9f36359461..76662d809183 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1480,16 +1480,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p)
 
 static void uclamp_update_util_min_rt_default(struct task_struct *p)
 {
-	struct rq_flags rf;
-	struct rq *rq;
-
 	if (!rt_task(p))
 		return;
 
 	/* Protect updates to p->uclamp_* */
-	rq = task_rq_lock(p, &rf);
+	guard(task_rq_lock)(p);
 	__uclamp_update_util_min_rt_default(p);
-	task_rq_unlock(rq, p, &rf);
 }
 
 static inline struct uclamp_se
@@ -1785,9 +1781,8 @@ static void uclamp_update_root_tg(void)
 	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
 		      sysctl_sched_uclamp_util_max, false);
 
-	rcu_read_lock();
+	guard(rcu)();
 	cpu_util_update_eff(&root_task_group.css);
-	rcu_read_unlock();
 }
 #else
 static void uclamp_update_root_tg(void) { }
@@ -1814,10 +1809,9 @@ static void uclamp_sync_util_min_rt_default(void)
 	smp_mb__after_spinlock();
 	read_unlock(&tasklist_lock);
 
-	rcu_read_lock();
+	guard(rcu)();
 	for_each_process_thread(g, p)
 		uclamp_update_util_min_rt_default(p);
-	rcu_read_unlock();
 }
 
 static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@ -2250,20 +2244,13 @@ static __always_inline
 int task_state_match(struct task_struct *p, unsigned int state)
 {
 #ifdef CONFIG_PREEMPT_RT
-	int match;
-
 	/*
 	 * Serialize against current_save_and_set_rtlock_wait_state() and
 	 * current_restore_rtlock_saved_state().
 	 */
-	raw_spin_lock_irq(&p->pi_lock);
-	match = __task_state_match(p, state);
-	raw_spin_unlock_irq(&p->pi_lock);
-
-	return match;
-#else
-	return __task_state_match(p, state);
+	guard(raw_spinlock_irq)(&p->pi_lock);
 #endif
+	return __task_state_match(p, state);
 }
 
 /*
@@ -2417,10 +2404,9 @@ void migrate_disable(void)
 		return;
 	}
 
-	preempt_disable();
+	guard(preempt)();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
 
@@ -2444,7 +2430,7 @@ void migrate_enable(void)
 	 * Ensure stop_task runs either before or after this, and that
 	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
 	 */
-	preempt_disable();
+	guard(preempt)();
 	if (p->cpus_ptr != &p->cpus_mask)
 		__set_cpus_allowed_ptr(p, &ac);
 	/*
@@ -2455,7 +2441,6 @@ void migrate_enable(void)
 	barrier();
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
@@ -3516,13 +3501,11 @@ out:
  */
 void kick_process(struct task_struct *p)
 {
-	int cpu;
+	guard(preempt)();
+	int cpu = task_cpu(p);
 
-	preempt_disable();
-	cpu = task_cpu(p);
 	if ((cpu != smp_processor_id()) && task_curr(p))
 		smp_send_reschedule(cpu);
-	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
 
@@ -6368,8 +6351,9 @@ static void sched_core_balance(struct rq *rq)
 	struct sched_domain *sd;
 	int cpu = cpu_of(rq);
 
-	preempt_disable();
-	rcu_read_lock();
+	guard(preempt)();
+	guard(rcu)();
+
 	raw_spin_rq_unlock_irq(rq);
 	for_each_domain(cpu, sd) {
 		if (need_resched())
@@ -6379,8 +6363,6 @@ static void sched_core_balance(struct rq *rq)
 			break;
 	}
 	raw_spin_rq_lock_irq(rq);
-	rcu_read_unlock();
-	preempt_enable();
 }
 
 static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@ -8258,8 +8240,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 #ifdef CONFIG_SMP
 int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 {
-	int ret = 0;
-
 	/*
 	 * If the task isn't a deadline task or admission control is
 	 * disabled then we don't care about affinity changes.
@@ -8273,11 +8253,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 	 * tasks allowed to run on all the CPUs in the task's
 	 * root_domain.
 	 */
-	rcu_read_lock();
+	guard(rcu)();
 	if (!cpumask_subset(task_rq(p)->rd->span, mask))
-		ret = -EBUSY;
-	rcu_read_unlock();
-	return ret;
+		return -EBUSY;
+
+	return 0;
 }
 #endif
 
@@ -10509,11 +10489,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 	/* Propagate the effective uclamp value for the new group */
-	mutex_lock(&uclamp_mutex);
-	rcu_read_lock();
+	guard(mutex)(&uclamp_mutex);
+	guard(rcu)();
 	cpu_util_update_eff(css);
-	rcu_read_unlock();
-	mutex_unlock(&uclamp_mutex);
 #endif
 
 	return 0;
@@ -10664,8 +10642,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 
 	static_branch_enable(&sched_uclamp_used);
 
-	mutex_lock(&uclamp_mutex);
-	rcu_read_lock();
+	guard(mutex)(&uclamp_mutex);
+	guard(rcu)();
 
 	tg = css_tg(of_css(of));
 	if (tg->uclamp_req[clamp_id].value != req.util)
@@ -10680,9 +10658,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
 	/* Update effective clamps to track the most restrictive value */
 	cpu_util_update_eff(of_css(of));
 
-	rcu_read_unlock();
-	mutex_unlock(&uclamp_mutex);
-
 	return nbytes;
 }
 
@@ -10708,10 +10683,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
 	u64 percent;
 	u32 rem;
 
-	rcu_read_lock();
-	tg = css_tg(seq_css(sf));
-	util_clamp = tg->uclamp_req[clamp_id].value;
-	rcu_read_unlock();
+	scoped_guard (rcu) {
+		tg = css_tg(seq_css(sf));
+		util_clamp = tg->uclamp_req[clamp_id].value;
+	}
 
 	if (util_clamp == SCHED_CAPACITY_SCALE) {
 		seq_puts(sf, "max\n");
@@ -11033,7 +11008,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 {
-	int ret;
 	struct cfs_schedulable_data data = {
 		.tg = tg,
 		.period = period,
@@ -11045,11 +11019,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 		do_div(data.quota, NSEC_PER_USEC);
 	}
 
-	rcu_read_lock();
-	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
-	rcu_read_unlock();
-
-	return ret;
+	guard(rcu)();
+	return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
 }
 
 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -11654,14 +11625,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
 	 * are not the last task to be migrated from this cpu for this mm, so
 	 * there is no need to move src_cid to the destination cpu.
 	 */
-	rcu_read_lock();
+	guard(rcu)();
 	src_task = rcu_dereference(src_rq->curr);
 	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
-		rcu_read_unlock();
 		t->last_mm_cid = -1;
 		return -1;
 	}
-	rcu_read_unlock();
 
 	return src_cid;
 }
@@ -11705,18 +11674,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
 	 * the lazy-put flag, this task will be responsible for transitioning
 	 * from lazy-put flag set to MM_CID_UNSET.
 	 */
-	rcu_read_lock();
-	src_task = rcu_dereference(src_rq->curr);
-	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
-		rcu_read_unlock();
-		/*
-		 * We observed an active task for this mm, there is therefore
-		 * no point in moving this cid to the destination cpu.
-		 */
-		t->last_mm_cid = -1;
-		return -1;
+	scoped_guard (rcu) {
+		src_task = rcu_dereference(src_rq->curr);
+		if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+			/*
+			 * We observed an active task for this mm, there is therefore
+			 * no point in moving this cid to the destination cpu.
+			 */
+			t->last_mm_cid = -1;
+			return -1;
+		}
 	}
-	rcu_read_unlock();
 
 	/*
 	 * The src_cid is unused, so it can be unset.
@@ -11789,7 +11757,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *t;
-	unsigned long flags;
 	int cid, lazy_cid;
 
 	cid = READ_ONCE(pcpu_cid->cid);
@@ -11824,23 +11791,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
 	 * the lazy-put flag, that task will be responsible for transitioning
 	 * from lazy-put flag set to MM_CID_UNSET.
 	 */
-	rcu_read_lock();
-	t = rcu_dereference(rq->curr);
-	if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
-		rcu_read_unlock();
-		return;
+	scoped_guard (rcu) {
+		t = rcu_dereference(rq->curr);
+		if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+			return;
 	}
-	rcu_read_unlock();
 
 	/*
 	 * The cid is unused, so it can be unset.
 	 * Disable interrupts to keep the window of cid ownership without rq
 	 * lock small.
 	 */
-	local_irq_save(flags);
-	if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
-		__mm_cid_put(mm, cid);
-	local_irq_restore(flags);
+	scoped_guard (irqsave) {
+		if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+			__mm_cid_put(mm, cid);
+	}
 }
 
 static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
@@ -11862,14 +11827,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
 	 * snapshot associated with this cid if an active task using the mm is
 	 * observed on this rq.
 	 */
-	rcu_read_lock();
-	curr = rcu_dereference(rq->curr);
-	if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
-		WRITE_ONCE(pcpu_cid->time, rq_clock);
-		rcu_read_unlock();
-		return;
+	scoped_guard (rcu) {
+		curr = rcu_dereference(rq->curr);
+		if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+			WRITE_ONCE(pcpu_cid->time, rq_clock);
+			return;
+		}
 	}
-	rcu_read_unlock();
 
 	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
 		return;
@@ -11963,7 +11927,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
 void sched_mm_cid_exit_signals(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq_flags rf;
 	struct rq *rq;
 
 	if (!mm)
@@ -11971,7 +11934,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 
 	preempt_disable();
 	rq = this_rq();
-	rq_lock_irqsave(rq, &rf);
+	guard(rq_lock_irqsave)(rq);
 	preempt_enable_no_resched();	/* holding spinlock */
 	WRITE_ONCE(t->mm_cid_active, 0);
 	/*
@@ -11981,13 +11944,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
 	smp_mb();
 	mm_cid_put(mm);
 	t->last_mm_cid = t->mm_cid = -1;
-	rq_unlock_irqrestore(rq, &rf);
 }
 
 void sched_mm_cid_before_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq_flags rf;
 	struct rq *rq;
 
 	if (!mm)
@@ -11995,7 +11956,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 
 	preempt_disable();
 	rq = this_rq();
-	rq_lock_irqsave(rq, &rf);
+	guard(rq_lock_irqsave)(rq);
 	preempt_enable_no_resched();	/* holding spinlock */
 	WRITE_ONCE(t->mm_cid_active, 0);
 	/*
@@ -12005,13 +11966,11 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 	smp_mb();
 	mm_cid_put(mm);
 	t->last_mm_cid = t->mm_cid = -1;
-	rq_unlock_irqrestore(rq, &rf);
 }
 
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
-	struct rq_flags rf;
 	struct rq *rq;
 
 	if (!mm)
@@ -12019,16 +11978,16 @@ void sched_mm_cid_after_execve(struct task_struct *t)
 
 	preempt_disable();
 	rq = this_rq();
-	rq_lock_irqsave(rq, &rf);
-	preempt_enable_no_resched();	/* holding spinlock */
-	WRITE_ONCE(t->mm_cid_active, 1);
-	/*
-	 * Store t->mm_cid_active before loading per-mm/cpu cid.
-	 * Matches barrier in sched_mm_cid_remote_clear_old().
-	 */
-	smp_mb();
-	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
-	rq_unlock_irqrestore(rq, &rf);
+	scoped_guard (rq_lock_irqsave, rq) {
+		preempt_enable_no_resched();	/* holding spinlock */
+		WRITE_ONCE(t->mm_cid_active, 1);
+		/*
+		 * Store t->mm_cid_active before loading per-mm/cpu cid.
+		 * Matches barrier in sched_mm_cid_remote_clear_old().
+		 */
+		smp_mb();
+		t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+	}
 	rseq_set_notify_resume(t);
 }
 
-- 
cgit v1.2.3


From 450e749707bc1755f22b505d9cd942d4869dc535 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Thu, 7 Sep 2023 10:42:21 -0700
Subject: sched/fair: Fix SMT4 group_smt_balance handling

For SMT4, any group with more than 2 tasks will be marked as
group_smt_balance. Retain the behaviour of group_has_spare by marking
the busiest group as the group which has the least number of idle_cpus.

Also, handle rounding effect of adding (ncores_local + ncores_busy) when
the local is fully idle and busy group imbalance is less than 2 tasks.
Local group should try to pull at least 1 task in this case so imbalance
should be set to 2 instead.

Fixes: fee1759e4f04 ("sched/fair: Determine active load balance for SMT sched groups")
Acked-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/6cd1633036bb6b651af575c32c2a9608a106702c.camel@linux.intel.com
---
 kernel/sched/fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33a2b6bba676..cb225921bbca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9580,7 +9580,7 @@ static inline long sibling_imbalance(struct lb_env *env,
 	imbalance /= ncores_local + ncores_busiest;
 
 	/* Take advantage of resource in an empty sched group */
-	if (imbalance == 0 && local->sum_nr_running == 0 &&
+	if (imbalance <= 1 && local->sum_nr_running == 0 &&
 	    busiest->sum_nr_running > 1)
 		imbalance = 2;
 
@@ -9768,6 +9768,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		break;
 
 	case group_smt_balance:
+		/*
+		 * Check if we have spare CPUs on either SMT group to
+		 * choose has spare or fully busy handling.
+		 */
+		if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+			goto has_spare;
+
+		fallthrough;
+
 	case group_fully_busy:
 		/*
 		 * Select the fully busy group with highest avg_load. In
@@ -9807,6 +9816,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 			else
 				return true;
 		}
+has_spare:
 
 		/*
 		 * Select not overloaded group with lowest number of idle cpus
-- 
cgit v1.2.3


From f0a31b26be1f725e3f56beed76486c1b034120b3 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 29 Jul 2023 14:27:32 +0000
Subject: srcu: Fix error handling in init_srcu_struct_fields()

The current error handling in init_srcu_struct_fields() is a bit
inconsistent.  If init_srcu_struct_nodes() fails, the function either
returns -ENOMEM or 0 depending on whether ssp->sda_is_static is true or
false. This can make init_srcu_struct_fields() return 0 even if memory
allocation failed!

Simplify the error handling by always returning -ENOMEM if either
init_srcu_struct_nodes() or the per-CPU allocation fails. This makes the
control flow easier to follow and avoids the inconsistent return values.

Add goto labels to avoid duplicating the error cleanup code.

Link: https://lore.kernel.org/r/20230404003508.GA254019@google.com
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/srcutree.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 20d7a238d675..f1a905200fc2 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 	ssp->srcu_sup->sda_is_static = is_static;
 	if (!is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
-	if (!ssp->sda) {
-		if (!is_static)
-			kfree(ssp->srcu_sup);
-		return -ENOMEM;
-	}
+	if (!ssp->sda)
+		goto err_free_sup;
 	init_srcu_struct_data(ssp);
 	ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
-		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
-			if (!ssp->srcu_sup->sda_is_static) {
-				free_percpu(ssp->sda);
-				ssp->sda = NULL;
-				kfree(ssp->srcu_sup);
-				return -ENOMEM;
-			}
-		} else {
-			WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
-		}
+		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+			goto err_free_sda;
+		WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
 	}
 	ssp->srcu_sup->srcu_ssp = ssp;
 	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
 	return 0;
+
+err_free_sda:
+	if (!is_static) {
+		free_percpu(ssp->sda);
+		ssp->sda = NULL;
+	}
+err_free_sup:
+	if (!is_static) {
+		kfree(ssp->srcu_sup);
+		ssp->srcu_sup = NULL;
+	}
+	return -ENOMEM;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-- 
cgit v1.2.3


From 4502138acc8f4139c94ce0ec0eee926f8805fbbc Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 29 Jul 2023 14:27:36 +0000
Subject: rcu/tree: Remove superfluous return from void call_rcu* functions

The return keyword is not needed here.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb1caefa8bd0..7c79480bfaa0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2713,7 +2713,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
  */
 void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 {
-	return __call_rcu_common(head, func, false);
+	__call_rcu_common(head, func, false);
 }
 EXPORT_SYMBOL_GPL(call_rcu_hurry);
 #endif
@@ -2764,7 +2764,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
+	__call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
-- 
cgit v1.2.3


From 16128b1f8c823438dcd3f3b4a57cbe7267bcf82f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 1 Aug 2023 17:15:25 -0700
Subject: rcu: Add sysfs to provide throttled access to rcu_barrier()

When running a series of stress tests all making heavy use of RCU,
it is all too possible to OOM the system when the prior test's RCU
callbacks don't get invoked until after the subsequent test starts.
One way of handling this is just a timed wait, but this fails when a
given CPU has so many callbacks queued that they take longer to invoke
than allowed for by that timed wait.

This commit therefore adds an rcutree.do_rcu_barrier module parameter that
is accessible from sysfs.  Writing one of the many synonyms for boolean
"true" will cause an rcu_barrier() to be invoked, but will guarantee that
no more than one rcu_barrier() will be invoked per sixteenth of a second
via this mechanism.  The flip side is that a given request might wait a
second or three longer than absolutely necessary, but only when there are
multiple uses of rcutree.do_rcu_barrier within a one-second time interval.

This commit unnecessarily serializes the rcu_barrier() machinery, given
that serialization is already provided by procfs.  This has the advantage
of allowing throttled rcu_barrier() from other sources within the kernel.

Reported-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 +++
 kernel/rcu/tree.c                               | 76 +++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0a1731a0f0ef..7ec8a406d419 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4769,6 +4769,13 @@
 			Set maximum number of finished RCU callbacks to
 			process in one batch.
 
+	rcutree.do_rcu_barrier=	[KNL]
+			Request a call to rcu_barrier().  This is
+			throttled so that userspace tests can safely
+			hammer on the sysfs variable if they so choose.
+			If triggered before the RCU grace-period machinery
+			is fully active, this will error out with EAGAIN.
+
 	rcutree.dump_tree=	[KNL]
 			Dump the structure of the rcu_node combining tree
 			out at early boot.  This is used for diagnostic
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7c79480bfaa0..3c7281fc25a7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4083,6 +4083,82 @@ retry:
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
+static unsigned long rcu_barrier_last_throttle;
+
+/**
+ * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
+ *
+ * This can be thought of as guard rails around rcu_barrier() that
+ * permits unrestricted userspace use, at least assuming the hardware's
+ * try_cmpxchg() is robust.  There will be at most one call per second to
+ * rcu_barrier() system-wide from use of this function, which means that
+ * callers might needlessly wait a second or three.
+ *
+ * This is intended for use by test suites to avoid OOM by flushing RCU
+ * callbacks from the previous test before starting the next.  See the
+ * rcutree.do_rcu_barrier module parameter for more information.
+ *
+ * Why not simply make rcu_barrier() more scalable?  That might be
+ * the eventual endpoint, but let's keep it simple for the time being.
+ * Note that the module parameter infrastructure serializes calls to a
+ * given .set() function, but should concurrent .set() invocation ever be
+ * possible, we are ready!
+ */
+static void rcu_barrier_throttled(void)
+{
+	unsigned long j = jiffies;
+	unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
+	unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
+
+	while (time_in_range(j, old, old + HZ / 16) ||
+	       !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
+		schedule_timeout_idle(HZ / 16);
+		if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+			smp_mb(); /* caller's subsequent code after above check. */
+			return;
+		}
+		j = jiffies;
+		old = READ_ONCE(rcu_barrier_last_throttle);
+	}
+	rcu_barrier();
+}
+
+/*
+ * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
+ * request arrives.  We insist on a true value to allow for possible
+ * future expansion.
+ */
+static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
+{
+	bool b;
+	int ret;
+
+	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+		return -EAGAIN;
+	ret = kstrtobool(val, &b);
+	if (!ret && b) {
+		atomic_inc((atomic_t *)kp->arg);
+		rcu_barrier_throttled();
+		atomic_dec((atomic_t *)kp->arg);
+	}
+	return ret;
+}
+
+/*
+ * Output the number of outstanding rcutree.do_rcu_barrier requests.
+ */
+static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
+}
+
+static const struct kernel_param_ops do_rcu_barrier_ops = {
+	.set = param_set_do_rcu_barrier,
+	.get = param_get_do_rcu_barrier,
+};
+static atomic_t do_rcu_barrier;
+module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
+
 /*
  * Compute the mask of online CPUs for the specified rcu_node structure.
  * This will not be stable unless the rcu_node structure's ->lock is
-- 
cgit v1.2.3


From 2cbc482d325ee58001472c4359b311958c4efdd1 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Sat, 5 Aug 2023 11:17:26 +0800
Subject: rcu: Dump memory object info if callback function is invalid

When a structure containing an RCU callback rhp is (incorrectly) freed
and reallocated after rhp is passed to call_rcu(), it is not unusual for
rhp->func to be set to NULL. This defeats the debugging prints used by
__call_rcu_common() in kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y,
which expect to identify the offending code using the identity of this
function.

And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things
are even worse, as can be seen from this splat:

Unable to handle kernel NULL pointer dereference at virtual address 0
... ...
PC is at 0x0
LR is at rcu_do_batch+0x1c0/0x3b8
... ...
 (rcu_do_batch) from (rcu_core+0x1d4/0x284)
 (rcu_core) from (__do_softirq+0x24c/0x344)
 (__do_softirq) from (__irq_exit_rcu+0x64/0x108)
 (__irq_exit_rcu) from (irq_exit+0x8/0x10)
 (irq_exit) from (__handle_domain_irq+0x74/0x9c)
 (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98)
 (gic_handle_irq) from (__irq_svc+0x5c/0x94)
 (__irq_svc) from (arch_cpu_idle+0x20/0x3c)
 (arch_cpu_idle) from (default_idle_call+0x4c/0x78)
 (default_idle_call) from (do_idle+0xf8/0x150)
 (do_idle) from (cpu_startup_entry+0x18/0x20)
 (cpu_startup_entry) from (0xc01530)

This commit therefore adds calls to mem_dump_obj(rhp) to output some
information, for example:

  slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256

This provides the rough size of the memory block and the offset of the
rcu_head structure, which as least provides at least a few clues to help
locate the problem. If the problem is reproducible, additional slab
debugging can be enabled, for example, CONFIG_DEBUG_SLAB=y, which can
provide significantly more information.

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcu.h      | 7 +++++++
 kernel/rcu/srcutiny.c | 1 +
 kernel/rcu/srcutree.c | 1 +
 kernel/rcu/tasks.h    | 1 +
 kernel/rcu/tiny.c     | 1 +
 kernel/rcu/tree.c     | 1 +
 6 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..d612731feea4 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -10,6 +10,7 @@
 #ifndef __LINUX_RCU_H
 #define __LINUX_RCU_H
 
+#include <linux/slab.h>
 #include <trace/events/rcu.h>
 
 /*
@@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
 }
 #endif	/* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+	if (unlikely(!rhp->func))
+		kmem_dump_obj(rhp);
+}
+
 extern int rcu_cpu_stall_suppress_at_boot;
 
 static inline bool rcu_stall_is_suppressed_at_boot(void)
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 336af24e0fe3..c38e5933a5d6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp)
 	while (lh) {
 		rhp = lh;
 		lh = lh->next;
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index f1a905200fc2..833a8f848a90 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1710,6 +1710,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	rhp = rcu_cblist_dequeue(&ready_cbs);
 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
 		debug_rcu_head_unqueue(rhp);
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8d65f7d576a3..7c845532a50a 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -538,6 +538,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	len = rcl.len;
 	for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+		debug_rcu_head_callback(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 42f7589e51e0..fec804b79080 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 
 	trace_rcu_invoke_callback("", head);
 	f = head->func;
+	debug_rcu_head_callback(head);
 	WRITE_ONCE(head->func, (rcu_callback_t)0L);
 	f(head);
 	rcu_lock_release(&rcu_callback_map);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3c7281fc25a7..aae515071ffd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2135,6 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		trace_rcu_invoke_callback(rcu_state.name, rhp);
 
 		f = rhp->func;
+		debug_rcu_head_callback(rhp);
 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
 		f(rhp);
 
-- 
cgit v1.2.3


From 0ae9942f03d0d034fdb0a4f44fc99f62a3107987 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 18 Aug 2023 08:53:58 -0700
Subject: rcu: Eliminate rcu_gp_slow_unregister() false positive

When using rcutorture as a module, there are a number of conditions that
can abort the modprobe operation, for example, when attempting to run
both RCU CPU stall warning tests and forward-progress tests.  This can
cause rcu_torture_cleanup() to be invoked on the unwind path out of
rcu_rcu_torture_init(), which will mean that rcu_gp_slow_unregister()
is invoked without a matching rcu_gp_slow_register().  This will cause
a splat because rcu_gp_slow_unregister() is passed rcu_fwd_cb_nodelay,
which does not match a NULL pointer.

This commit therefore forgives a mismatch involving a NULL pointer, thus
avoiding this false-positive splat.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index aae515071ffd..a83ecab77917 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1260,7 +1260,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
 /* Unregister a counter, with NULL for not caring which. */
 void rcu_gp_slow_unregister(atomic_t *rgssp)
 {
-	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
 
 	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
 }
-- 
cgit v1.2.3


From e35a6cf1cc343d720ad235f678f1cd2a9876b777 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 21 Aug 2023 15:22:07 +0100
Subject: futex: Use a folio instead of a page

The futex code already handles compound pages correctly, but using a folio
tells the compiler that there is already a reference to the head page and
it doesn't need to call compound_head() again.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230821142207.2537124-1-willy@infradead.org
---
 kernel/futex/core.c | 67 ++++++++++++++++++++++++++---------------------------
 1 file changed, 33 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 514e4582b863..adf7e2c1c8f4 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -222,7 +222,8 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page, *tail;
+	struct page *page;
+	struct folio *folio;
 	struct address_space *mapping;
 	int err, ro = 0;
 
@@ -273,54 +274,52 @@ again:
 		err = 0;
 
 	/*
-	 * The treatment of mapping from this point on is critical. The page
-	 * lock protects many things but in this context the page lock
+	 * The treatment of mapping from this point on is critical. The folio
+	 * lock protects many things but in this context the folio lock
 	 * stabilizes mapping, prevents inode freeing in the shared
 	 * file-backed region case and guards against movement to swap cache.
 	 *
-	 * Strictly speaking the page lock is not needed in all cases being
-	 * considered here and page lock forces unnecessarily serialization
+	 * Strictly speaking the folio lock is not needed in all cases being
+	 * considered here and folio lock forces unnecessarily serialization.
 	 * From this point on, mapping will be re-verified if necessary and
-	 * page lock will be acquired only if it is unavoidable
+	 * folio lock will be acquired only if it is unavoidable
 	 *
-	 * Mapping checks require the head page for any compound page so the
-	 * head page and mapping is looked up now. For anonymous pages, it
-	 * does not matter if the page splits in the future as the key is
-	 * based on the address. For filesystem-backed pages, the tail is
-	 * required as the index of the page determines the key. For
-	 * base pages, there is no tail page and tail == page.
+	 * Mapping checks require the folio so it is looked up now. For
+	 * anonymous pages, it does not matter if the folio is split
+	 * in the future as the key is based on the address. For
+	 * filesystem-backed pages, the precise page is required as the
+	 * index of the page determines the key.
 	 */
-	tail = page;
-	page = compound_head(page);
-	mapping = READ_ONCE(page->mapping);
+	folio = page_folio(page);
+	mapping = READ_ONCE(folio->mapping);
 
 	/*
-	 * If page->mapping is NULL, then it cannot be a PageAnon
+	 * If folio->mapping is NULL, then it cannot be an anonymous
 	 * page; but it might be the ZERO_PAGE or in the gate area or
 	 * in a special mapping (all cases which we are happy to fail);
 	 * or it may have been a good file page when get_user_pages_fast
 	 * found it, but truncated or holepunched or subjected to
-	 * invalidate_complete_page2 before we got the page lock (also
+	 * invalidate_complete_page2 before we got the folio lock (also
 	 * cases which we are happy to fail).  And we hold a reference,
 	 * so refcount care in invalidate_inode_page's remove_mapping
 	 * prevents drop_caches from setting mapping to NULL beneath us.
 	 *
 	 * The case we do have to guard against is when memory pressure made
 	 * shmem_writepage move it from filecache to swapcache beneath us:
-	 * an unlikely race, but we do need to retry for page->mapping.
+	 * an unlikely race, but we do need to retry for folio->mapping.
 	 */
 	if (unlikely(!mapping)) {
 		int shmem_swizzled;
 
 		/*
-		 * Page lock is required to identify which special case above
-		 * applies. If this is really a shmem page then the page lock
+		 * Folio lock is required to identify which special case above
+		 * applies. If this is really a shmem page then the folio lock
 		 * will prevent unexpected transitions.
 		 */
-		lock_page(page);
-		shmem_swizzled = PageSwapCache(page) || page->mapping;
-		unlock_page(page);
-		put_page(page);
+		folio_lock(folio);
+		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
+		folio_unlock(folio);
+		folio_put(folio);
 
 		if (shmem_swizzled)
 			goto again;
@@ -331,14 +330,14 @@ again:
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
-	 * If the futex key is stored on an anonymous page, then the associated
+	 * If the futex key is stored in anonymous memory, then the associated
 	 * object is the mm which is implicitly pinned by the calling process.
 	 *
 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
 	 */
-	if (PageAnon(page)) {
+	if (folio_test_anon(folio)) {
 		/*
 		 * A RO anonymous page will never change and thus doesn't make
 		 * sense for futex operations.
@@ -357,10 +356,10 @@ again:
 
 		/*
 		 * The associated futex object in this case is the inode and
-		 * the page->mapping must be traversed. Ordinarily this should
-		 * be stabilised under page lock but it's not strictly
+		 * the folio->mapping must be traversed. Ordinarily this should
+		 * be stabilised under folio lock but it's not strictly
 		 * necessary in this case as we just want to pin the inode, not
-		 * update the radix tree or anything like that.
+		 * update i_pages or anything like that.
 		 *
 		 * The RCU read lock is taken as the inode is finally freed
 		 * under RCU. If the mapping still matches expectations then the
@@ -368,9 +367,9 @@ again:
 		 */
 		rcu_read_lock();
 
-		if (READ_ONCE(page->mapping) != mapping) {
+		if (READ_ONCE(folio->mapping) != mapping) {
 			rcu_read_unlock();
-			put_page(page);
+			folio_put(folio);
 
 			goto again;
 		}
@@ -378,19 +377,19 @@ again:
 		inode = READ_ONCE(mapping->host);
 		if (!inode) {
 			rcu_read_unlock();
-			put_page(page);
+			folio_put(folio);
 
 			goto again;
 		}
 
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.i_seq = get_inode_sequence_number(inode);
-		key->shared.pgoff = page_to_pgoff(tail);
+		key->shared.pgoff = folio->index + folio_page_idx(folio, page);
 		rcu_read_unlock();
 	}
 
 out:
-	put_page(page);
+	folio_put(folio);
 	return err;
 }
 
-- 
cgit v1.2.3


From 5c04433daf9ed8b28d4900112be1fd19e1786b25 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Thu, 14 Sep 2023 15:25:42 -0700
Subject: bpf: Charge modmem for struct_ops trampoline

Current code charges modmem for regular trampoline, but not for struct_ops
trampoline. Add bpf_jit_[charge|uncharge]_modmem() to struct_ops so the
trampoline is charged in both cases.

Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230914222542.2986059-1-song@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/bpf_struct_ops.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index fdc3e8705a3c..db6176fb64dc 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -615,7 +615,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 	if (st_map->links)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
-	bpf_jit_free_exec(st_map->image);
+	if (st_map->image) {
+		bpf_jit_free_exec(st_map->image);
+		bpf_jit_uncharge_modmem(PAGE_SIZE);
+	}
 	bpf_map_area_free(st_map->uvalue);
 	bpf_map_area_free(st_map);
 }
@@ -657,6 +660,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	struct bpf_struct_ops_map *st_map;
 	const struct btf_type *t, *vt;
 	struct bpf_map *map;
+	int ret;
 
 	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
 	if (!st_ops)
@@ -681,12 +685,27 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	st_map->st_ops = st_ops;
 	map = &st_map->map;
 
+	ret = bpf_jit_charge_modmem(PAGE_SIZE);
+	if (ret) {
+		__bpf_struct_ops_map_free(map);
+		return ERR_PTR(ret);
+	}
+
+	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!st_map->image) {
+		/* __bpf_struct_ops_map_free() uses st_map->image as flag
+		 * for "charged or not". In this case, we need to unchange
+		 * here.
+		 */
+		bpf_jit_uncharge_modmem(PAGE_SIZE);
+		__bpf_struct_ops_map_free(map);
+		return ERR_PTR(-ENOMEM);
+	}
 	st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
 	st_map->links =
 		bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_links *),
 				   NUMA_NO_NODE);
-	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
-	if (!st_map->uvalue || !st_map->links || !st_map->image) {
+	if (!st_map->uvalue || !st_map->links) {
 		__bpf_struct_ops_map_free(map);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -907,4 +926,3 @@ err_out:
 	kfree(link);
 	return err;
 }
-
-- 
cgit v1.2.3


From cccd32816506cbac3a4c65d9dff51b3125ef1a03 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Fri, 15 Sep 2023 09:55:39 +0200
Subject: panic: Reenable preemption in WARN slowpath

Commit:

  5a5d7e9badd2 ("cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG")

amended warn_slowpath_fmt() to disable preemption until the WARN splat
has been emitted.

However the commit neglected to reenable preemption in the !fmt codepath,
i.e. when a WARN splat is emitted without additional format string.

One consequence is that users may see more splats than intended.  E.g. a
WARN splat emitted in a work item results in at least two extra splats:

  BUG: workqueue leaked lock or atomic
  (emitted by process_one_work())

  BUG: scheduling while atomic
  (emitted by worker_thread() -> schedule())

Ironically the point of the commit was to *avoid* extra splats. ;)

Fix it.

Fixes: 5a5d7e9badd2 ("cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG")
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/r/3ec48fde01e4ee6505f77908ba351bad200ae3d1.1694763684.git.lukas@wunner.de
---
 kernel/panic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 07239d4ad81e..ffa037fa777d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -697,6 +697,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
 	if (!fmt) {
 		__warn(file, line, __builtin_return_address(0), taint,
 		       NULL, NULL);
+		warn_rcu_exit(rcu);
 		return;
 	}
 
-- 
cgit v1.2.3


From 8f4f68e788c3a7a696546291258bfa5fdb215523 Mon Sep 17 00:00:00 2001
From: Lu Jialin <lujialin4@huawei.com>
Date: Mon, 4 Sep 2023 13:33:41 +0000
Subject: crypto: pcrypt - Fix hungtask for PADATA_RESET

We found a hungtask bug in test_aead_vec_cfg as follows:

INFO: task cryptomgr_test:391009 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Call trace:
 __switch_to+0x98/0xe0
 __schedule+0x6c4/0xf40
 schedule+0xd8/0x1b4
 schedule_timeout+0x474/0x560
 wait_for_common+0x368/0x4e0
 wait_for_completion+0x20/0x30
 wait_for_completion+0x20/0x30
 test_aead_vec_cfg+0xab4/0xd50
 test_aead+0x144/0x1f0
 alg_test_aead+0xd8/0x1e0
 alg_test+0x634/0x890
 cryptomgr_test+0x40/0x70
 kthread+0x1e0/0x220
 ret_from_fork+0x10/0x18
 Kernel panic - not syncing: hung_task: blocked tasks

For padata_do_parallel, when the return err is 0 or -EBUSY, it will call
wait_for_completion(&wait->completion) in test_aead_vec_cfg. In normal
case, aead_request_complete() will be called in pcrypt_aead_serial and the
return err is 0 for padata_do_parallel. But, when pinst->flags is
PADATA_RESET, the return err is -EBUSY for padata_do_parallel, and it
won't call aead_request_complete(). Therefore, test_aead_vec_cfg will
hung at wait_for_completion(&wait->completion), which will cause
hungtask.

The problem comes as following:
(padata_do_parallel)                 |
    rcu_read_lock_bh();              |
    err = -EINVAL;                   |   (padata_replace)
                                     |     pinst->flags |= PADATA_RESET;
    err = -EBUSY                     |
    if (pinst->flags & PADATA_RESET) |
        rcu_read_unlock_bh()         |
        return err

In order to resolve the problem, we replace the return err -EBUSY with
-EAGAIN, which means parallel_data is changing, and the caller should call
it again.

v3:
remove retry and just change the return err.
v2:
introduce padata_try_do_parallel() in pcrypt_aead_encrypt and
pcrypt_aead_decrypt to solve the hungtask.

Signed-off-by: Lu Jialin <lujialin4@huawei.com>
Signed-off-by: Guo Zihua <guozihua@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/pcrypt.c | 4 ++++
 kernel/padata.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index 8c1d0ca41213..d0d954fe9d54 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -117,6 +117,8 @@ static int pcrypt_aead_encrypt(struct aead_request *req)
 	err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu);
 	if (!err)
 		return -EINPROGRESS;
+	if (err == -EBUSY)
+		return -EAGAIN;
 
 	return err;
 }
@@ -164,6 +166,8 @@ static int pcrypt_aead_decrypt(struct aead_request *req)
 	err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu);
 	if (!err)
 		return -EINPROGRESS;
+	if (err == -EBUSY)
+		return -EAGAIN;
 
 	return err;
 }
diff --git a/kernel/padata.c b/kernel/padata.c
index 222d60195de6..81c8183f3176 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -202,7 +202,7 @@ int padata_do_parallel(struct padata_shell *ps,
 		*cb_cpu = cpu;
 	}
 
-	err =  -EBUSY;
+	err = -EBUSY;
 	if ((pinst->flags & PADATA_RESET))
 		goto out;
 
-- 
cgit v1.2.3


From d1db9fb432d50b0eecdfdd85d17cc15a59cc093b Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:34 -0700
Subject: sched/fair: Fix open-coded numa_nearest_node()

task_numa_placement() searches for a nearest node to migrate by calling
for_each_node_state(). Now that we have numa_nearest_node(), switch to
using it.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-3-yury.norov@gmail.com
---
 kernel/sched/fair.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..41cfd61b4d6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2847,19 +2847,7 @@ static void task_numa_placement(struct task_struct *p)
 	}
 
 	/* Cannot migrate task to CPU-less node */
-	if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
-		int near_nid = max_nid;
-		int distance, near_distance = INT_MAX;
-
-		for_each_node_state(nid, N_CPU) {
-			distance = node_distance(max_nid, nid);
-			if (distance < near_distance) {
-				near_nid = nid;
-				near_distance = distance;
-			}
-		}
-		max_nid = near_nid;
-	}
+	max_nid = numa_nearest_node(max_nid, N_CPU);
 
 	if (ng) {
 		numa_group_count_active_nodes(ng);
-- 
cgit v1.2.3


From 617f2c38cb5ce60226042081c09e2ee3a90d03f8 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:35 -0700
Subject: sched/topology: Fix sched_numa_find_nth_cpu() in CPU-less case

When the node provided by user is CPU-less, corresponding record in
sched_domains_numa_masks is not set. Trying to dereference it in the
following code leads to kernel crash.

To avoid it, start searching from the nearest node with CPUs.

Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()")
Reported-by: Yicong Yang <yangyicong@hisilicon.com>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Yicong Yang <yangyicong@hisilicon.com>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-4-yury.norov@gmail.com

Closes: https://lore.kernel.org/lkml/CAAH8bW8C5humYnfpW3y5ypwx0E-09A3QxFE1JFzR66v+mO4XfA@mail.gmail.com/T/
Closes: https://lore.kernel.org/lkml/ZMHSNQfv39HN068m@yury-ThinkPad/T/#mf6431cb0b7f6f05193c41adeee444bc95bf2b1c4
---
 kernel/sched/topology.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..423d08947962 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2122,12 +2122,16 @@ static int hop_cmp(const void *a, const void *b)
  */
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+	struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
 	struct cpumask ***hop_masks;
 	int hop, ret = nr_cpu_ids;
 
 	rcu_read_lock();
 
+	/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+	node = numa_nearest_node(node, N_CPU);
+	k.node = node;
+
 	k.masks = rcu_dereference(sched_domains_numa_masks);
 	if (!k.masks)
 		goto unlock;
-- 
cgit v1.2.3


From 9ecea9ae4d3127a09fb5dfcea87f248937a39ff5 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:37 -0700
Subject: sched/topology: Handle NUMA_NO_NODE in sched_numa_find_nth_cpu()

sched_numa_find_nth_cpu() doesn't handle NUMA_NO_NODE properly, and
may crash kernel if passed with it. On the other hand, the only user
of sched_numa_find_nth_cpu() has to check NUMA_NO_NODE case explicitly.

It would be easier for users if this logic will get moved into
sched_numa_find_nth_cpu().

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-6-yury.norov@gmail.com
---
 kernel/sched/topology.c | 3 +++
 lib/cpumask.c           | 4 +---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 423d08947962..a60ecf45360c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2126,6 +2126,9 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 	struct cpumask ***hop_masks;
 	int hop, ret = nr_cpu_ids;
 
+	if (node == NUMA_NO_NODE)
+		return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
 	rcu_read_lock();
 
 	/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a7fd02b5ae26..34335c1e7265 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -146,9 +146,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
 
-	cpu = (node == NUMA_NO_NODE) ?
-		cpumask_nth(i, cpu_online_mask) :
-		sched_numa_find_nth_cpu(cpu_online_mask, i, node);
+	cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node);
 
 	WARN_ON(cpu >= nr_cpu_ids);
 	return cpu;
-- 
cgit v1.2.3


From 6d08ad2166f7770341ea56afad45fa41cd16ae62 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sat, 19 Aug 2023 07:12:38 -0700
Subject: sched/topology: Fix sched_numa_find_nth_cpu() comment

Reword sched_numa_find_nth_cpu() comment and make it kernel-doc compatible.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/20230819141239.287290-7-yury.norov@gmail.com
---
 kernel/sched/topology.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a60ecf45360c..a7b50bba7829 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2112,13 +2112,15 @@ static int hop_cmp(const void *a, const void *b)
 	return -1;
 }
 
-/*
- * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
- *                             closest to @cpu from @cpumask.
- * cpumask: cpumask to find a cpu from
- * cpu: Nth cpu to find
- *
- * returns: cpu, or nr_cpu_ids when nothing found.
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ *                             from @cpus to @cpu, taking into account distance
+ *                             from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
  */
 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
 {
-- 
cgit v1.2.3


From c0490bc9bb62d9376f3dd4ec28e03ca0fef97152 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Wed, 13 Sep 2023 13:20:31 +0000
Subject: sched/fair: Fix cfs_rq_is_decayed() on !SMP

We don't need to maintain per-queue leaf_cfs_rq_list on !SMP, since
it's used for cfs_rq load tracking & balancing on SMP.

But sched debug interface uses it to print per-cfs_rq stats.

This patch fixes the !SMP version of cfs_rq_is_decayed(), so the
per-queue leaf_cfs_rq_list is also maintained correctly on !SMP,
to fix the warning in assert_list_leaf_cfs_rq().

Fixes: 0a00a354644e ("sched/fair: Delete useless condition in tg_unthrottle_up()")
Reported-by: Leo Yu-Chi Liang <ycliang@andestech.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Leo Yu-Chi Liang <ycliang@andestech.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Closes: https://lore.kernel.org/all/ZN87UsqkWcFLDxea@swlinux02/
Link: https://lore.kernel.org/r/20230913132031.2242151-1-chengming.zhou@linux.dev
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41cfd61b4d6b..c893721ff5b1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4866,7 +4866,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
-	return true;
+	return !cfs_rq->nr_running;
 }
 
 #define UPDATE_TG	0x0
-- 
cgit v1.2.3


From 4ff34ad3d39377d9f6953f3606ccf611ce636767 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Tue, 28 Feb 2023 17:14:26 +0100
Subject: sched/core: Use do-while instead of for loop in set_nr_if_polling()

Use equivalent do-while loop instead of infinite for loop.

There are no asm code changes.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230228161426.4508-1-ubizjak@gmail.com
---
 kernel/sched/core.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 76662d809183..f39482d6a6e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p)
 	struct thread_info *ti = task_thread_info(p);
 	typeof(ti->flags) val = READ_ONCE(ti->flags);
 
-	for (;;) {
+	do {
 		if (!(val & _TIF_POLLING_NRFLAG))
 			return false;
 		if (val & _TIF_NEED_RESCHED)
 			return true;
-		if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
-			break;
-	}
+	} while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
 	return true;
 }
 
-- 
cgit v1.2.3


From dca7acd84e93f2881e3f63465bbb5d89a40b5d17 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 13 Sep 2023 21:59:43 +0800
Subject: bpf: Skip unit_size checking for global per-cpu allocator

For global per-cpu allocator, the size of free object in free list
doesn't match with unit_size and now there is no way to get the size of
per-cpu pointer saved in free object, so just skip the checking.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/bpf/20230913133436.0eeec4cb@canb.auug.org.au/
Signed-off-by: Hou Tao <houtao1@huawei.com>
Tested-by: Biju Das <biju.das.jz@bp.renesas.com>
Link: https://lore.kernel.org/r/20230913135943.3137292-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 1c22b90e754a..cf1941516643 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -491,6 +491,13 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
 	struct llist_node *first;
 	unsigned int obj_size;
 
+	/* For per-cpu allocator, the size of free objects in free list doesn't
+	 * match with unit_size and now there is no way to get the size of
+	 * per-cpu pointer saved in free object, so just skip the checking.
+	 */
+	if (c->percpu_size)
+		return 0;
+
 	first = c->free_llist.first;
 	if (!first)
 		return 0;
-- 
cgit v1.2.3


From 57eb5e1c5c57972c95e8efab6bc81b87161b0b07 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 15 Sep 2023 12:14:20 +0200
Subject: bpf: Fix uprobe_multi get_pid_task error path

Dan reported Smatch static checker warning due to missing error
value set in uprobe multi link's get_pid_task error path.

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/bpf/c5ffa7c0-6b06-40d5-aca2-63833b5cd9af@moroto.mountain/
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230915101420.1193800-1-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c1c1af63ced2..868008f56fec 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3223,8 +3223,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 		rcu_read_lock();
 		task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
 		rcu_read_unlock();
-		if (!task)
+		if (!task) {
+			err = -ESRCH;
 			goto error_path_put;
+		}
 	}
 
 	err = -ENOMEM;
-- 
cgit v1.2.3


From fc45c5b642dbcac3bb10f4f904e4b863233e5369 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 13 Sep 2023 10:13:48 -0700
Subject: bpf: make it easier to add new metadata kfunc

No functional changes.

Instead of having hand-crafted code in bpf_dev_bound_resolve_kfunc,
move kfunc <> xmo handler relationship into XDP_METADATA_KFUNC_xxx.
This way, any time new kfunc is added, we don't have to touch
bpf_dev_bound_resolve_kfunc.

Also document XDP_METADATA_KFUNC_xxx arguments since we now have
more than two and it might be confusing what is what.

Cc: netdev@vger.kernel.org
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230913171350.369987-2-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/xdp.h    | 16 ++++++++++++----
 kernel/bpf/offload.c |  9 +++++----
 net/core/xdp.c       |  4 ++--
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index de08c8e0d134..d59e12f8f311 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -383,14 +383,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
 
 #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
 
+/* Define the relationship between xdp-rx-metadata kfunc and
+ * various other entities:
+ * - xdp_rx_metadata enum
+ * - kfunc name
+ * - xdp_metadata_ops field
+ */
 #define XDP_METADATA_KFUNC_xxx	\
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
-			   bpf_xdp_metadata_rx_timestamp) \
+			   bpf_xdp_metadata_rx_timestamp, \
+			   xmo_rx_timestamp) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
-			   bpf_xdp_metadata_rx_hash) \
+			   bpf_xdp_metadata_rx_hash, \
+			   xmo_rx_hash) \
 
-enum {
-#define XDP_METADATA_KFUNC(name, _) name,
+enum xdp_rx_metadata {
+#define XDP_METADATA_KFUNC(name, _, __) name,
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 MAX_XDP_METADATA_KFUNC,
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 3e4f2ec1af06..6aa6de8d715d 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -845,10 +845,11 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
 	if (!ops)
 		goto out;
 
-	if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP))
-		p = ops->xmo_rx_timestamp;
-	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH))
-		p = ops->xmo_rx_hash;
+#define XDP_METADATA_KFUNC(name, _, xmo) \
+	if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
+	XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
 out:
 	up_read(&bpf_devs_lock);
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a70670fe9a2d..bab563b2f812 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
 __diag_pop();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 };
 
 BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 
-- 
cgit v1.2.3


From a9c2a608549bb1a2363d289d63907640afcf22af Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 13 Sep 2023 10:13:49 -0700
Subject: bpf: expose information about supported xdp metadata kfunc

Add new xdp-rx-metadata-features member to netdev netlink
which exports a bitmask of supported kfuncs. Most of the patch
is autogenerated (headers), the only relevant part is netdev.yaml
and the changes in netdev-genl.c to marshal into netlink.

Example output on veth:

$ ip link add veth0 type veth peer name veth1 # ifndex == 12
$ ./tools/net/ynl/samples/netdev 12

Select ifc ($ifindex; or 0 = dump; or -2 ntf check): 12
   veth1[12]    xdp-features (23): basic redirect rx-sg xdp-rx-metadata-features (3): timestamp hash xdp-zc-max-segs=0

Cc: netdev@vger.kernel.org
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230913171350.369987-3-sdf@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml      | 21 +++++++++++++++++++++
 Documentation/networking/xdp-rx-metadata.rst |  7 +++++++
 include/net/xdp.h                            |  5 ++++-
 include/uapi/linux/netdev.h                  | 16 ++++++++++++++++
 kernel/bpf/offload.c                         |  2 +-
 net/core/netdev-genl.c                       | 12 +++++++++++-
 net/core/xdp.c                               |  4 ++--
 tools/include/uapi/linux/netdev.h            | 16 ++++++++++++++++
 8 files changed, 78 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 1c7284fd535b..c46fcc78fc04 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -42,6 +42,19 @@ definitions:
         doc:
           This feature informs if netdev implements non-linear XDP buffer
           support in ndo_xdp_xmit callback.
+  -
+    type: flags
+    name: xdp-rx-metadata
+    render-max: true
+    entries:
+      -
+        name: timestamp
+        doc:
+          Device is capable of exposing receive HW timestamp via bpf_xdp_metadata_rx_timestamp().
+      -
+        name: hash
+        doc:
+          Device is capable of exposing receive packet hash via bpf_xdp_metadata_rx_hash().
 
 attribute-sets:
   -
@@ -68,6 +81,13 @@ attribute-sets:
         type: u32
         checks:
           min: 1
+      -
+        name: xdp-rx-metadata-features
+        doc: Bitmask of supported XDP receive metadata features.
+             See Documentation/networking/xdp-rx-metadata.rst for more details.
+        type: u64
+        enum: xdp-rx-metadata
+        enum-as-flags: true
 
 operations:
   list:
@@ -84,6 +104,7 @@ operations:
             - ifindex
             - xdp-features
             - xdp-zc-max-segs
+            - xdp-rx-metadata-features
       dump:
         reply: *dev-all
     -
diff --git a/Documentation/networking/xdp-rx-metadata.rst b/Documentation/networking/xdp-rx-metadata.rst
index 25ce72af81c2..205696780b78 100644
--- a/Documentation/networking/xdp-rx-metadata.rst
+++ b/Documentation/networking/xdp-rx-metadata.rst
@@ -105,6 +105,13 @@ bpf_tail_call
 Adding programs that access metadata kfuncs to the ``BPF_MAP_TYPE_PROG_ARRAY``
 is currently not supported.
 
+Supported Devices
+=================
+
+It is possible to query which kfunc the particular netdev implements via
+netlink. See ``xdp-rx-metadata-features`` attribute set in
+``Documentation/netlink/specs/netdev.yaml``.
+
 Example
 =======
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index d59e12f8f311..349c36fb5fd8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -386,19 +386,22 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
 /* Define the relationship between xdp-rx-metadata kfunc and
  * various other entities:
  * - xdp_rx_metadata enum
+ * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml)
  * - kfunc name
  * - xdp_metadata_ops field
  */
 #define XDP_METADATA_KFUNC_xxx	\
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
+			   NETDEV_XDP_RX_METADATA_TIMESTAMP, \
 			   bpf_xdp_metadata_rx_timestamp, \
 			   xmo_rx_timestamp) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
+			   NETDEV_XDP_RX_METADATA_HASH, \
 			   bpf_xdp_metadata_rx_hash, \
 			   xmo_rx_hash) \
 
 enum xdp_rx_metadata {
-#define XDP_METADATA_KFUNC(name, _, __) name,
+#define XDP_METADATA_KFUNC(name, _, __, ___) name,
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 MAX_XDP_METADATA_KFUNC,
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index c1634b95c223..2943a151d4f1 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -38,11 +38,27 @@ enum netdev_xdp_act {
 	NETDEV_XDP_ACT_MASK = 127,
 };
 
+/**
+ * enum netdev_xdp_rx_metadata
+ * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW
+ *   timestamp via bpf_xdp_metadata_rx_timestamp().
+ * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet
+ *   hash via bpf_xdp_metadata_rx_hash().
+ */
+enum netdev_xdp_rx_metadata {
+	NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
+	NETDEV_XDP_RX_METADATA_HASH = 2,
+
+	/* private: */
+	NETDEV_XDP_RX_METADATA_MASK = 3,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
 	NETDEV_A_DEV_XDP_FEATURES,
 	NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+	NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
 
 	__NETDEV_A_DEV_MAX,
 	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 6aa6de8d715d..e7a1752b5a09 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -845,7 +845,7 @@ void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 func_id)
 	if (!ops)
 		goto out;
 
-#define XDP_METADATA_KFUNC(name, _, xmo) \
+#define XDP_METADATA_KFUNC(name, _, __, xmo) \
 	if (func_id == bpf_xdp_metadata_kfunc_id(name)) p = ops->xmo;
 	XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index c1aea8b756b6..fe61f85bcf33 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -5,6 +5,7 @@
 #include <linux/rtnetlink.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 
 #include "netdev-genl-gen.h"
 
@@ -12,15 +13,24 @@ static int
 netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
 		   const struct genl_info *info)
 {
+	u64 xdp_rx_meta = 0;
 	void *hdr;
 
 	hdr = genlmsg_iput(rsp, info);
 	if (!hdr)
 		return -EMSGSIZE;
 
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+	if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+		xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
 	if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
 	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
-			      netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+			      netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+	    nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+			      xdp_rx_meta, NETDEV_A_DEV_PAD)) {
 		genlmsg_cancel(rsp, hdr);
 		return -EINVAL;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bab563b2f812..df4789ab512d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -741,7 +741,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
 __diag_pop();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, name, __) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 BTF_SET8_END(xdp_metadata_kfunc_ids)
@@ -752,7 +752,7 @@ static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 };
 
 BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
-#define XDP_METADATA_KFUNC(name, str, _) BTF_ID(func, str)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index c1634b95c223..2943a151d4f1 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -38,11 +38,27 @@ enum netdev_xdp_act {
 	NETDEV_XDP_ACT_MASK = 127,
 };
 
+/**
+ * enum netdev_xdp_rx_metadata
+ * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW
+ *   timestamp via bpf_xdp_metadata_rx_timestamp().
+ * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet
+ *   hash via bpf_xdp_metadata_rx_hash().
+ */
+enum netdev_xdp_rx_metadata {
+	NETDEV_XDP_RX_METADATA_TIMESTAMP = 1,
+	NETDEV_XDP_RX_METADATA_HASH = 2,
+
+	/* private: */
+	NETDEV_XDP_RX_METADATA_MASK = 3,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
 	NETDEV_A_DEV_XDP_FEATURES,
 	NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+	NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
 
 	__NETDEV_A_DEV_MAX,
 	NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1)
-- 
cgit v1.2.3


From 9b2b86332a9b9932d9022a0c004251d5d6437020 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Fri, 15 Sep 2023 10:39:10 +0200
Subject: bpf: Allow to use kfunc XDP hints and frags together

There is no fundamental reason, why multi-buffer XDP and XDP kfunc RX hints
cannot coexist in a single program.

Allow those features to be used together by modifying the flags condition
for dev-bound-only programs, segments are still prohibited for fully
offloaded programs, hence additional check.

Suggested-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/CAKH8qBuzgtJj=OKMdsxEkyML36VsAuZpcrsXcyqjdKXSJCBq=Q@mail.gmail.com/
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20230915083914.65538-1-larysa.zaremba@intel.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/offload.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index e7a1752b5a09..92c9df46134a 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -232,7 +232,14 @@ int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr)
 	    attr->prog_type != BPF_PROG_TYPE_XDP)
 		return -EINVAL;
 
-	if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY)
+	if (attr->prog_flags & ~(BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_XDP_HAS_FRAGS))
+		return -EINVAL;
+
+	/* Frags are allowed only if program is dev-bound-only, but not
+	 * if it is requesting bpf offload.
+	 */
+	if (attr->prog_flags & BPF_F_XDP_HAS_FRAGS &&
+	    !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY))
 		return -EINVAL;
 
 	if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS &&
-- 
cgit v1.2.3


From fd5d27b70188379bb441d404c29a0afb111e1753 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:31:59 +0200
Subject: arch/x86: Implement arch_bpf_stack_walk

The plumbing for offline unwinding when we throw an exception in
programs would require walking the stack, hence introduce a new
arch_bpf_stack_walk function. This is provided when the JIT supports
exceptions, i.e. bpf_jit_supports_exceptions is true. The arch-specific
code is really minimal, hence it should be straightforward to extend
this support to other architectures as well, as it reuses the logic of
arch_stack_walk, but allowing access to unwind_state data.

Once the stack pointer and frame pointer are known for the main subprog
during the unwinding, we know the stack layout and location of any
callee-saved registers which must be restored before we return back to
the kernel. This handling will be added in the subsequent patches.

Note that while we primarily unwind through BPF frames, which are
effectively CONFIG_UNWINDER_FRAME_POINTER, we still need one of this or
CONFIG_UNWINDER_ORC to be able to unwind through the bpf_throw frame
from which we begin walking the stack. We also require both sp and bp
(stack and frame pointers) from the unwind_state structure, which are
only available when one of these two options are enabled.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
 include/linux/filter.h      |  2 ++
 kernel/bpf/core.c           |  9 +++++++++
 3 files changed, 39 insertions(+)

(limited to 'kernel')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a0d03503b3cb..d0c24b5a6abb 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -16,6 +16,7 @@
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
+#include <asm/unwind.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -2933,3 +2934,30 @@ void bpf_jit_free(struct bpf_prog *prog)
 
 	bpf_prog_unlock_free(prog);
 }
+
+bool bpf_jit_supports_exceptions(void)
+{
+	/* We unwind through both kernel frames (starting from within bpf_throw
+	 * call) and BPF frames. Therefore we require one of ORC or FP unwinder
+	 * to be enabled to walk kernel frames and reach BPF frames in the stack
+	 * trace.
+	 */
+	return IS_ENABLED(CONFIG_UNWINDER_ORC) || IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER);
+}
+
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+	struct unwind_state state;
+	unsigned long addr;
+
+	for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+	     unwind_next_frame(&state)) {
+		addr = unwind_get_return_address(&state);
+		if (!addr || !consume_fn(cookie, (u64)addr, (u64)state.sp, (u64)state.bp))
+			break;
+	}
+	return;
+#endif
+	WARN(1, "verification of programs using bpf_throw should have failed\n");
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0138832ad571..88874de974cb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -954,6 +954,8 @@ bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
+bool bpf_jit_supports_exceptions(void);
+void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 95599df82ee4..c4ac084f2767 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2914,6 +2914,15 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
 	return -ENOTSUPP;
 }
 
+bool __weak bpf_jit_supports_exceptions(void)
+{
+	return false;
+}
+
+void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
+{
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
 {
-- 
cgit v1.2.3


From 335d1c5b545284d75ef96ee42e461eacefe865bb Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:00 +0200
Subject: bpf: Implement support for adding hidden subprogs

Introduce support in the verifier for generating a subprogram and
include it as part of a BPF program dynamically after the do_check phase
is complete. The first user will be the next patch which generates
default exception callbacks if none are set for the program. The phase
of invocation will be do_misc_fixups. Note that this is an internal
verifier function, and should be used with instruction blocks which
uphold the invariants stated in check_subprogs.

Since these subprogs are always appended to the end of the instruction
sequence of the program, it becomes relatively inexpensive to do the
related adjustments to the subprog_info of the program. Only the fake
exit subprogram is shifted forward, making room for our new subprog.

This is useful to insert a new subprogram, get it JITed, and obtain its
function pointer. The next patch will use this functionality to insert a
default exception callback which will be invoked after unwinding the
stack.

Note that these added subprograms are invisible to userspace, and never
reported in BPF_OBJ_GET_INFO_BY_ID etc. For now, only a single
subprogram is supported, but more can be easily supported in the future.

To this end, two function counts are introduced now, the existing
func_cnt, and real_func_cnt, the latter including hidden programs. This
allows us to conver the JIT code to use the real_func_cnt for management
of resources while syscall path continues working with existing
func_cnt.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  3 ++-
 kernel/bpf/core.c            | 12 ++++++------
 kernel/bpf/syscall.c         |  2 +-
 kernel/bpf/verifier.c        | 36 +++++++++++++++++++++++++++++++++---
 5 files changed, 43 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9171b0b6a590..c3667e95af59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1389,6 +1389,7 @@ struct bpf_prog_aux {
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
+	u32 real_func_cnt; /* includes hidden progs, only used for JIT and freeing progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	u32 ctx_arg_info_size;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a3236651ec64..3c2a8636ab29 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -588,6 +588,7 @@ struct bpf_verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
+	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -598,7 +599,7 @@ struct bpf_verifier_env {
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
-	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */
 	union {
 		struct bpf_idmap idmap_scratch;
 		struct bpf_idset idset_scratch;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c4ac084f2767..840ba952702d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -212,7 +212,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
 	const struct bpf_line_info *linfo;
 	void **jited_linfo;
 
-	if (!prog->aux->jited_linfo)
+	if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
 		/* Userspace did not provide linfo */
 		return;
 
@@ -539,7 +539,7 @@ static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
 {
 	int i;
 
-	for (i = 0; i < fp->aux->func_cnt; i++)
+	for (i = 0; i < fp->aux->real_func_cnt; i++)
 		bpf_prog_kallsyms_del(fp->aux->func[i]);
 }
 
@@ -589,7 +589,7 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog)
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
 
 	/* prog->aux->name will be ignored if full btf name is available */
-	if (prog->aux->func_info_cnt) {
+	if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
 		type = btf_type_by_id(prog->aux->btf,
 				      prog->aux->func_info[prog->aux->func_idx].type_id);
 		func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
@@ -1208,7 +1208,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 		if (!extra_pass)
 			addr = NULL;
 		else if (prog->aux->func &&
-			 off >= 0 && off < prog->aux->func_cnt)
+			 off >= 0 && off < prog->aux->real_func_cnt)
 			addr = (u8 *)prog->aux->func[off]->bpf_func;
 		else
 			return -EINVAL;
@@ -2721,7 +2721,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	if (aux->dst_trampoline)
 		bpf_trampoline_put(aux->dst_trampoline);
-	for (i = 0; i < aux->func_cnt; i++) {
+	for (i = 0; i < aux->real_func_cnt; i++) {
 		/* We can just unlink the subprog poke descriptor table as
 		 * it was originally linked to the main program and is also
 		 * released along with it.
@@ -2729,7 +2729,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 		aux->func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(aux->func[i]);
 	}
-	if (aux->func_cnt) {
+	if (aux->real_func_cnt) {
 		kfree(aux->func);
 		bpf_prog_unlock_free(aux->prog);
 	} else {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6a692f3bea15..85c1d908f70f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2749,7 +2749,7 @@ free_used_maps:
 	 * period before we can tear down JIT memory since symbols
 	 * are already exposed under kallsyms.
 	 */
-	__bpf_prog_put_noref(prog, prog->aux->func_cnt);
+	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
 free_prog_sec:
 	free_uid(prog->aux->user);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 18e673c0ac15..39548e326d53 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15210,7 +15210,8 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
 	if (!aux->func_info)
 		return;
 
-	for (i = 0; i < env->subprog_cnt; i++)
+	/* func_info is not available for hidden subprogs */
+	for (i = 0; i < env->subprog_cnt - env->hidden_subprog_cnt; i++)
 		aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
@@ -18151,7 +18152,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		 * the call instruction, as an index for this list
 		 */
 		func[i]->aux->func = func;
-		func[i]->aux->func_cnt = env->subprog_cnt;
+		func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+		func[i]->aux->real_func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
@@ -18197,7 +18199,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->extable = func[0]->aux->extable;
 	prog->aux->num_exentries = func[0]->aux->num_exentries;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt;
+	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
+	prog->aux->real_func_cnt = env->subprog_cnt;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -18433,6 +18436,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	return 0;
 }
 
+/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
+static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+{
+	struct bpf_subprog_info *info = env->subprog_info;
+	int cnt = env->subprog_cnt;
+	struct bpf_prog *prog;
+
+	/* We only reserve one slot for hidden subprogs in subprog_info. */
+	if (env->hidden_subprog_cnt) {
+		verbose(env, "verifier internal error: only one hidden subprog supported\n");
+		return -EFAULT;
+	}
+	/* We're not patching any existing instruction, just appending the new
+	 * ones for the hidden subprog. Hence all of the adjustment operations
+	 * in bpf_patch_insn_data are no-ops.
+	 */
+	prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
+	if (!prog)
+		return -ENOMEM;
+	env->prog = prog;
+	info[cnt + 1].start = info[cnt].start;
+	info[cnt].start = prog->len - len + 1;
+	env->subprog_cnt++;
+	env->hidden_subprog_cnt++;
+	return 0;
+}
+
 /* Do various post-verification rewrites in a single program pass.
  * These rewrites simplify JIT and interpreter implementations.
  */
-- 
cgit v1.2.3


From f18b03fabaa9b7c80e80b72a621f481f0d706ae0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:01 +0200
Subject: bpf: Implement BPF exceptions

This patch implements BPF exceptions, and introduces a bpf_throw kfunc
to allow programs to throw exceptions during their execution at runtime.
A bpf_throw invocation is treated as an immediate termination of the
program, returning back to its caller within the kernel, unwinding all
stack frames.

This allows the program to simplify its implementation, by testing for
runtime conditions which the verifier has no visibility into, and assert
that they are true. In case they are not, the program can simply throw
an exception from the other branch.

BPF exceptions are explicitly *NOT* an unlikely slowpath error handling
primitive, and this objective has guided design choices of the
implementation of the them within the kernel (with the bulk of the cost
for unwinding the stack offloaded to the bpf_throw kfunc).

The implementation of this mechanism requires use of add_hidden_subprog
mechanism introduced in the previous patch, which generates a couple of
instructions to move R1 to R0 and exit. The JIT then rewrites the
prologue of this subprog to take the stack pointer and frame pointer as
inputs and reset the stack frame, popping all callee-saved registers
saved by the main subprog. The bpf_throw function then walks the stack
at runtime, and invokes this exception subprog with the stack and frame
pointers as parameters.

Reviewers must take note that currently the main program is made to save
all callee-saved registers on x86_64 during entry into the program. This
is because we must do an equivalent of a lightweight context switch when
unwinding the stack, therefore we need the callee-saved registers of the
caller of the BPF program to be able to return with a sane state.

Note that we have to additionally handle r12, even though it is not used
by the program, because when throwing the exception the program makes an
entry into the kernel which could clobber r12 after saving it on the
stack. To be able to preserve the value we received on program entry, we
push r12 and restore it from the generated subprogram when unwinding the
stack.

For now, bpf_throw invocation fails when lingering resources or locks
exist in that path of the program. In a future followup, bpf_throw will
be extended to perform frame-by-frame unwinding to release lingering
resources for each stack frame, removing this limitation.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c                    |  89 ++++++++++++++++---
 include/linux/bpf.h                            |   3 +
 include/linux/bpf_verifier.h                   |   4 +
 include/linux/filter.h                         |   6 ++
 kernel/bpf/core.c                              |   2 +-
 kernel/bpf/helpers.c                           |  38 ++++++++
 kernel/bpf/verifier.c                          | 116 ++++++++++++++++++++++---
 tools/testing/selftests/bpf/bpf_experimental.h |  16 ++++
 8 files changed, 247 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d0c24b5a6abb..84005f2114e0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -18,6 +18,8 @@
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
 
+static bool all_callee_regs_used[4] = {true, true, true, true};
+
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
 	if (len == 1)
@@ -256,6 +258,14 @@ struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(11 + ENDBR_INSN_SIZE)
 
+static void push_r12(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x54);   /* push r12 */
+	*pprog = prog;
+}
+
 static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
 {
 	u8 *prog = *pprog;
@@ -271,6 +281,14 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+static void pop_r12(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x5C);   /* pop r12 */
+	*pprog = prog;
+}
+
 static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 {
 	u8 *prog = *pprog;
@@ -292,7 +310,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
  * while jumping to another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
-			  bool tail_call_reachable, bool is_subprog)
+			  bool tail_call_reachable, bool is_subprog,
+			  bool is_exception_cb)
 {
 	u8 *prog = *pprog;
 
@@ -312,8 +331,22 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 			/* Keep the same instruction layout. */
 			EMIT2(0x66, 0x90); /* nop2 */
 	}
-	EMIT1(0x55);             /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	/* Exception callback receives FP as third parameter */
+	if (is_exception_cb) {
+		EMIT3(0x48, 0x89, 0xF4); /* mov rsp, rsi */
+		EMIT3(0x48, 0x89, 0xD5); /* mov rbp, rdx */
+		/* The main frame must have exception_boundary as true, so we
+		 * first restore those callee-saved regs from stack, before
+		 * reusing the stack frame.
+		 */
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+		/* Reset the stack frame. */
+		EMIT3(0x48, 0x89, 0xEC); /* mov rsp, rbp */
+	} else {
+		EMIT1(0x55);             /* push rbp */
+		EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	}
 
 	/* X86_TAIL_CALL_OFFSET is here */
 	EMIT_ENDBR();
@@ -472,7 +505,8 @@ static void emit_return(u8 **pprog, u8 *ip)
  *   goto *(prog->bpf_func + prologue_size);
  * out:
  */
-static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
+static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
+					u8 **pprog, bool *callee_regs_used,
 					u32 stack_depth, u8 *ip,
 					struct jit_context *ctx)
 {
@@ -522,7 +556,12 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	offset = ctx->tail_call_indirect_label - (prog + 2 - start);
 	EMIT2(X86_JE, offset);                    /* je out */
 
-	pop_callee_regs(&prog, callee_regs_used);
+	if (bpf_prog->aux->exception_boundary) {
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+	} else {
+		pop_callee_regs(&prog, callee_regs_used);
+	}
 
 	EMIT1(0x58);                              /* pop rax */
 	if (stack_depth)
@@ -546,7 +585,8 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
 	*pprog = prog;
 }
 
-static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
+				      struct bpf_jit_poke_descriptor *poke,
 				      u8 **pprog, u8 *ip,
 				      bool *callee_regs_used, u32 stack_depth,
 				      struct jit_context *ctx)
@@ -575,7 +615,13 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
 	emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
 		  poke->tailcall_bypass);
 
-	pop_callee_regs(&prog, callee_regs_used);
+	if (bpf_prog->aux->exception_boundary) {
+		pop_callee_regs(&prog, all_callee_regs_used);
+		pop_r12(&prog);
+	} else {
+		pop_callee_regs(&prog, callee_regs_used);
+	}
+
 	EMIT1(0x58);                                  /* pop rax */
 	if (stack_depth)
 		EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -1050,8 +1096,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog), tail_call_reachable,
-		      bpf_is_subprog(bpf_prog));
-	push_callee_regs(&prog, callee_regs_used);
+		      bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+	/* Exception callback will clobber callee regs for its own use, and
+	 * restore the original callee regs from main prog's stack frame.
+	 */
+	if (bpf_prog->aux->exception_boundary) {
+		/* We also need to save r12, which is not mapped to any BPF
+		 * register, as we throw after entry into the kernel, which may
+		 * overwrite r12.
+		 */
+		push_r12(&prog);
+		push_callee_regs(&prog, all_callee_regs_used);
+	} else {
+		push_callee_regs(&prog, callee_regs_used);
+	}
 
 	ilen = prog - temp;
 	if (rw_image)
@@ -1648,13 +1706,15 @@ st:			if (is_imm8(insn->off))
 
 		case BPF_JMP | BPF_TAIL_CALL:
 			if (imm32)
-				emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+				emit_bpf_tail_call_direct(bpf_prog,
+							  &bpf_prog->aux->poke_tab[imm32 - 1],
 							  &prog, image + addrs[i - 1],
 							  callee_regs_used,
 							  bpf_prog->aux->stack_depth,
 							  ctx);
 			else
-				emit_bpf_tail_call_indirect(&prog,
+				emit_bpf_tail_call_indirect(bpf_prog,
+							    &prog,
 							    callee_regs_used,
 							    bpf_prog->aux->stack_depth,
 							    image + addrs[i - 1],
@@ -1907,7 +1967,12 @@ emit_jmp:
 			seen_exit = true;
 			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
-			pop_callee_regs(&prog, callee_regs_used);
+			if (bpf_prog->aux->exception_boundary) {
+				pop_callee_regs(&prog, all_callee_regs_used);
+				pop_r12(&prog);
+			} else {
+				pop_callee_regs(&prog, callee_regs_used);
+			}
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3667e95af59..16740ee82082 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1410,6 +1410,8 @@ struct bpf_prog_aux {
 	bool sleepable;
 	bool tail_call_reachable;
 	bool xdp_has_frags;
+	bool exception_cb;
+	bool exception_boundary;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
@@ -1432,6 +1434,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
+	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3c2a8636ab29..da21a3ec5027 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -541,7 +541,9 @@ struct bpf_subprog_info {
 	bool has_tail_call;
 	bool tail_call_reachable;
 	bool has_ld_abs;
+	bool is_cb;
 	bool is_async_cb;
+	bool is_exception_cb;
 };
 
 struct bpf_verifier_env;
@@ -589,6 +591,7 @@ struct bpf_verifier_env {
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
 	u32 hidden_subprog_cnt;		/* number of hidden subprogs */
+	int exception_callback_subprog;
 	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
@@ -596,6 +599,7 @@ struct bpf_verifier_env {
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
 	bool seen_direct_write;
+	bool seen_exception;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 88874de974cb..27406aee2d40 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1171,6 +1171,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
 bool is_bpf_text_address(unsigned long addr);
 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		    char *sym);
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);
 
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
@@ -1238,6 +1239,11 @@ static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
 	return -ERANGE;
 }
 
+static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+	return NULL;
+}
+
 static inline const char *
 bpf_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 840ba952702d..7849b9cca749 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -733,7 +733,7 @@ bool is_bpf_text_address(unsigned long addr)
 	return ret;
 }
 
-static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
 {
 	struct bpf_ksym *ksym = bpf_ksym_find(addr);
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b0a9834f1051..78e8f4de6750 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2449,6 +2449,43 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
 	rcu_read_unlock();
 }
 
+struct bpf_throw_ctx {
+	struct bpf_prog_aux *aux;
+	u64 sp;
+	u64 bp;
+	int cnt;
+};
+
+static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct bpf_throw_ctx *ctx = cookie;
+	struct bpf_prog *prog;
+
+	if (!is_bpf_text_address(ip))
+		return !ctx->cnt;
+	prog = bpf_prog_ksym_find(ip);
+	ctx->cnt++;
+	if (bpf_is_subprog(prog))
+		return true;
+	ctx->aux = prog->aux;
+	ctx->sp = sp;
+	ctx->bp = bp;
+	return false;
+}
+
+__bpf_kfunc void bpf_throw(u64 cookie)
+{
+	struct bpf_throw_ctx ctx = {};
+
+	arch_bpf_stack_walk(bpf_stack_walker, &ctx);
+	WARN_ON_ONCE(!ctx.aux);
+	if (ctx.aux)
+		WARN_ON_ONCE(!ctx.aux->exception_boundary);
+	WARN_ON_ONCE(!ctx.bp);
+	WARN_ON_ONCE(!ctx.cnt);
+	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+}
+
 __diag_pop();
 
 BTF_SET8_START(generic_btf_ids)
@@ -2478,6 +2515,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_throw)
 BTF_SET8_END(generic_btf_ids)
 
 static const struct btf_kfunc_id_set generic_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39548e326d53..9baa6f187b38 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -543,6 +543,7 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 }
 
 static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
 static bool is_callback_calling_function(enum bpf_func_id func_id)
 {
@@ -1748,7 +1749,9 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 		return -ENOMEM;
 	dst_state->jmp_history_cnt = src->jmp_history_cnt;
 
-	/* if dst has more stack frames then src frame, free them */
+	/* if dst has more stack frames then src frame, free them, this is also
+	 * necessary in case of exceptional exits using bpf_throw.
+	 */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
 		dst_state->frame[i] = NULL;
@@ -2868,7 +2871,7 @@ next:
 		if (i == subprog_end - 1) {
 			/* to avoid fall-through from one subprog into another
 			 * the last insn of the subprog should be either exit
-			 * or unconditional jump back
+			 * or unconditional jump back or bpf_throw call
 			 */
 			if (code != (BPF_JMP | BPF_EXIT) &&
 			    code != (BPF_JMP32 | BPF_JA) &&
@@ -5661,6 +5664,27 @@ continue_func:
 	for (; i < subprog_end; i++) {
 		int next_insn, sidx;
 
+		if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) {
+			bool err = false;
+
+			if (!is_bpf_throw_kfunc(insn + i))
+				continue;
+			if (subprog[idx].is_cb)
+				err = true;
+			for (int c = 0; c < frame && !err; c++) {
+				if (subprog[ret_prog[c]].is_cb) {
+					err = true;
+					break;
+				}
+			}
+			if (!err)
+				continue;
+			verbose(env,
+				"bpf_throw kfunc (insn %d) cannot be called from callback subprog %d\n",
+				i, idx);
+			return -EINVAL;
+		}
+
 		if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
 			continue;
 		/* remember insn and function to return to */
@@ -8919,6 +8943,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	 * callbacks
 	 */
 	if (set_callee_state_cb != set_callee_state) {
+		env->subprog_info[subprog].is_cb = true;
 		if (bpf_pseudo_kfunc_call(insn) &&
 		    !is_callback_calling_kfunc(insn->imm)) {
 			verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
@@ -9308,7 +9333,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		verbose(env, "to caller at %d:\n", *insn_idx);
 		print_verifier_state(env, caller, true);
 	}
-	/* clear everything in the callee */
+	/* clear everything in the callee. In case of exceptional exits using
+	 * bpf_throw, this will be done by copy_verifier_state for extra frames. */
 	free_func_state(callee);
 	state->frame[state->curframe--] = NULL;
 	return 0;
@@ -9432,17 +9458,17 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	return 0;
 }
 
-static int check_reference_leak(struct bpf_verifier_env *env)
+static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
 {
 	struct bpf_func_state *state = cur_func(env);
 	bool refs_lingering = false;
 	int i;
 
-	if (state->frameno && !state->in_callback_fn)
+	if (!exception_exit && state->frameno && !state->in_callback_fn)
 		return 0;
 
 	for (i = 0; i < state->acquired_refs; i++) {
-		if (state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
+		if (!exception_exit && state->in_callback_fn && state->refs[i].callback_ref != state->frameno)
 			continue;
 		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
 			state->refs[i].id, state->refs[i].insn_idx);
@@ -9697,7 +9723,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	switch (func_id) {
 	case BPF_FUNC_tail_call:
-		err = check_reference_leak(env);
+		err = check_reference_leak(env, false);
 		if (err) {
 			verbose(env, "tail_call would lead to reference leak\n");
 			return err;
@@ -10332,6 +10358,7 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_clone,
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
+	KF_bpf_throw,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10354,6 +10381,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10378,6 +10406,7 @@ BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
+BTF_ID(func, bpf_throw)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -10695,6 +10724,12 @@ static bool is_callback_calling_kfunc(u32 btf_id)
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
 
+static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
+{
+	return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
+	       insn->imm == special_kfunc_list[KF_bpf_throw];
+}
+
 static bool is_rbtree_lock_required_kfunc(u32 btf_id)
 {
 	return is_bpf_rbtree_api_kfunc(btf_id);
@@ -11480,6 +11515,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
+		if (!bpf_jit_supports_exceptions()) {
+			verbose(env, "JIT does not support calling kfunc %s#%d\n",
+				func_name, meta.func_id);
+			return -ENOTSUPP;
+		}
+		env->seen_exception = true;
+	}
+
 	for (i = 0; i < CALLER_SAVED_REGS; i++)
 		mark_reg_not_init(env, regs, caller_saved[i]);
 
@@ -14525,7 +14569,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * gen_ld_abs() may terminate the program at runtime, leading to
 	 * reference leak.
 	 */
-	err = check_reference_leak(env);
+	err = check_reference_leak(env, false);
 	if (err) {
 		verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n");
 		return err;
@@ -16539,6 +16583,7 @@ static int do_check(struct bpf_verifier_env *env)
 	int prev_insn_idx = -1;
 
 	for (;;) {
+		bool exception_exit = false;
 		struct bpf_insn *insn;
 		u8 class;
 		int err;
@@ -16753,12 +16798,17 @@ static int do_check(struct bpf_verifier_env *env)
 						return -EINVAL;
 					}
 				}
-				if (insn->src_reg == BPF_PSEUDO_CALL)
+				if (insn->src_reg == BPF_PSEUDO_CALL) {
 					err = check_func_call(env, insn, &env->insn_idx);
-				else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+				} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 					err = check_kfunc_call(env, insn, &env->insn_idx);
-				else
+					if (!err && is_bpf_throw_kfunc(insn)) {
+						exception_exit = true;
+						goto process_bpf_exit_full;
+					}
+				} else {
 					err = check_helper_call(env, insn, &env->insn_idx);
+				}
 				if (err)
 					return err;
 
@@ -16788,7 +16838,7 @@ static int do_check(struct bpf_verifier_env *env)
 					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
-
+process_bpf_exit_full:
 				if (env->cur_state->active_lock.ptr &&
 				    !in_rbtree_lock_required_cb(env)) {
 					verbose(env, "bpf_spin_unlock is missing\n");
@@ -16807,10 +16857,23 @@ static int do_check(struct bpf_verifier_env *env)
 				 * function, for which reference_state must
 				 * match caller reference state when it exits.
 				 */
-				err = check_reference_leak(env);
+				err = check_reference_leak(env, exception_exit);
 				if (err)
 					return err;
 
+				/* The side effect of the prepare_func_exit
+				 * which is being skipped is that it frees
+				 * bpf_func_state. Typically, process_bpf_exit
+				 * will only be hit with outermost exit.
+				 * copy_verifier_state in pop_stack will handle
+				 * freeing of any extra bpf_func_state left over
+				 * from not processing all nested function
+				 * exits. We also skip return code checks as
+				 * they are not needed for exceptional exits.
+				 */
+				if (exception_exit)
+					goto process_bpf_exit;
+
 				if (state->curframe) {
 					/* exit from nested function */
 					err = prepare_func_exit(env, &env->insn_idx);
@@ -18113,6 +18176,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		}
 		func[i]->aux->num_exentries = num_exentries;
 		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
+		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
+		if (!i)
+			func[i]->aux->exception_boundary = env->seen_exception;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
 			err = -ENOTSUPP;
@@ -18201,6 +18267,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->aux->func = func;
 	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
 	prog->aux->real_func_cnt = env->subprog_cnt;
+	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
+	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
@@ -18437,7 +18505,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 }
 
 /* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */
-static __maybe_unused int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
+static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
 {
 	struct bpf_subprog_info *info = env->subprog_info;
 	int cnt = env->subprog_cnt;
@@ -18481,6 +18549,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	struct bpf_map *map_ptr;
 	int i, ret, cnt, delta = 0;
 
+	if (env->seen_exception && !env->exception_callback_subprog) {
+		struct bpf_insn patch[] = {
+			env->prog->insnsi[insn_cnt - 1],
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		};
+
+		ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+		if (ret < 0)
+			return ret;
+		prog = env->prog;
+		insn = prog->insnsi;
+
+		env->exception_callback_subprog = env->subprog_cnt - 1;
+		/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
+		env->subprog_info[env->exception_callback_subprog].is_cb = true;
+		env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
+		env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+	}
+
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		/* Make divide-by-zero exceptions impossible. */
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 4494eaa9937e..333b54a86e3a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -162,4 +162,20 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
 /* Convenience macro to wrap over bpf_obj_drop_impl */
 #define bpf_percpu_obj_drop(kptr) bpf_percpu_obj_drop_impl(kptr, NULL)
 
+/* Description
+ *	Throw a BPF exception from the program, immediately terminating its
+ *	execution and unwinding the stack. The supplied 'cookie' parameter
+ *	will be the return value of the program when an exception is thrown.
+ *
+ *	Note that throwing an exception with lingering resources (locks,
+ *	references, etc.) will lead to a verification error.
+ *
+ *	Note that callbacks *cannot* call this helper.
+ * Returns
+ *	Never.
+ * Throws
+ *	An exception with the specified 'cookie' value.
+ */
+extern void bpf_throw(u64 cookie) __ksym;
+
 #endif
-- 
cgit v1.2.3


From aaa619ebccb2b78b3c6d2c0cd72d206ee8fc0025 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:02 +0200
Subject: bpf: Refactor check_btf_func and split into two phases

This patch splits the check_btf_info's check_btf_func check into two
separate phases.  The first phase sets up the BTF and prepares
func_info, but does not perform any validation of required invariants
for subprogs just yet. This is left to the second phase, which happens
where check_btf_info executes currently, and performs the line_info and
CO-RE relocation.

The reason to perform this split is to obtain the userspace supplied
func_info information before we perform the add_subprog call, where we
would now require finding and adding subprogs that may not have a
bpf_pseudo_call or bpf_pseudo_func instruction in the program.

We require this as we want to enable userspace to supply exception
callbacks that can override the default hidden subprogram generated by
the verifier (which performs a hardcoded action). In such a case, the
exception callback may never be referenced in an instruction, but will
still be suitably annotated (by way of BTF declaration tags). For
finding this exception callback, we would require the program's BTF
information, and the supplied func_info information which maps BTF type
IDs to subprograms.

Since the exception callback won't actually be referenced through
instructions, later checks in check_cfg and do_check_subprogs will not
verify the subprog. This means that add_subprog needs to add them in the
add_subprog_and_kfunc phase before we move forward, which is why the BTF
and func_info are required at that point.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-6-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 128 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 100 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9baa6f187b38..ec767ae08c2b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15115,20 +15115,18 @@ static int check_abnormal_return(struct bpf_verifier_env *env)
 #define MIN_BPF_FUNCINFO_SIZE	8
 #define MAX_FUNCINFO_REC_SIZE	252
 
-static int check_btf_func(struct bpf_verifier_env *env,
-			  const union bpf_attr *attr,
-			  bpfptr_t uattr)
+static int check_btf_func_early(struct bpf_verifier_env *env,
+				const union bpf_attr *attr,
+				bpfptr_t uattr)
 {
-	const struct btf_type *type, *func_proto, *ret_type;
-	u32 i, nfuncs, urec_size, min_size;
 	u32 krec_size = sizeof(struct bpf_func_info);
+	const struct btf_type *type, *func_proto;
+	u32 i, nfuncs, urec_size, min_size;
 	struct bpf_func_info *krecord;
-	struct bpf_func_info_aux *info_aux = NULL;
 	struct bpf_prog *prog;
 	const struct btf *btf;
-	bpfptr_t urecord;
 	u32 prev_offset = 0;
-	bool scalar_return;
+	bpfptr_t urecord;
 	int ret = -ENOMEM;
 
 	nfuncs = attr->func_info_cnt;
@@ -15138,11 +15136,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
 		return 0;
 	}
 
-	if (nfuncs != env->subprog_cnt) {
-		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
-		return -EINVAL;
-	}
-
 	urec_size = attr->func_info_rec_size;
 	if (urec_size < MIN_BPF_FUNCINFO_SIZE ||
 	    urec_size > MAX_FUNCINFO_REC_SIZE ||
@@ -15160,9 +15153,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
 	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!krecord)
 		return -ENOMEM;
-	info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
-	if (!info_aux)
-		goto err_free;
 
 	for (i = 0; i < nfuncs; i++) {
 		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -15201,11 +15191,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			goto err_free;
 		}
 
-		if (env->subprog_info[i].start != krecord[i].insn_off) {
-			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
-			goto err_free;
-		}
-
 		/* check type_id */
 		type = btf_type_by_id(btf, krecord[i].type_id);
 		if (!type || !btf_type_is_func(type)) {
@@ -15213,12 +15198,80 @@ static int check_btf_func(struct bpf_verifier_env *env,
 				krecord[i].type_id);
 			goto err_free;
 		}
-		info_aux[i].linkage = BTF_INFO_VLEN(type->info);
 
 		func_proto = btf_type_by_id(btf, type->type);
 		if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto)))
 			/* btf_func_check() already verified it during BTF load */
 			goto err_free;
+
+		prev_offset = krecord[i].insn_off;
+		bpfptr_add(&urecord, urec_size);
+	}
+
+	prog->aux->func_info = krecord;
+	prog->aux->func_info_cnt = nfuncs;
+	return 0;
+
+err_free:
+	kvfree(krecord);
+	return ret;
+}
+
+static int check_btf_func(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  bpfptr_t uattr)
+{
+	const struct btf_type *type, *func_proto, *ret_type;
+	u32 i, nfuncs, urec_size, min_size;
+	u32 krec_size = sizeof(struct bpf_func_info);
+	struct bpf_func_info *krecord;
+	struct bpf_func_info_aux *info_aux = NULL;
+	struct bpf_prog *prog;
+	const struct btf *btf;
+	bpfptr_t urecord;
+	u32 prev_offset = 0;
+	bool scalar_return;
+	int ret = -ENOMEM;
+
+	nfuncs = attr->func_info_cnt;
+	if (!nfuncs) {
+		if (check_abnormal_return(env))
+			return -EINVAL;
+		return 0;
+	}
+	if (nfuncs != env->subprog_cnt) {
+		verbose(env, "number of funcs in func_info doesn't match number of subprogs\n");
+		return -EINVAL;
+	}
+
+	urec_size = attr->func_info_rec_size;
+
+	prog = env->prog;
+	btf = prog->aux->btf;
+
+	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
+	min_size = min_t(u32, krec_size, urec_size);
+
+	krecord = prog->aux->func_info;
+	info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+	if (!info_aux)
+		return -ENOMEM;
+
+	for (i = 0; i < nfuncs; i++) {
+		/* check insn_off */
+		ret = -EINVAL;
+
+		if (env->subprog_info[i].start != krecord[i].insn_off) {
+			verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n");
+			goto err_free;
+		}
+
+		/* Already checked type_id */
+		type = btf_type_by_id(btf, krecord[i].type_id);
+		info_aux[i].linkage = BTF_INFO_VLEN(type->info);
+		/* Already checked func_proto */
+		func_proto = btf_type_by_id(btf, type->type);
+
 		ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
 		scalar_return =
 			btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
@@ -15235,13 +15288,10 @@ static int check_btf_func(struct bpf_verifier_env *env,
 		bpfptr_add(&urecord, urec_size);
 	}
 
-	prog->aux->func_info = krecord;
-	prog->aux->func_info_cnt = nfuncs;
 	prog->aux->func_info_aux = info_aux;
 	return 0;
 
 err_free:
-	kvfree(krecord);
 	kfree(info_aux);
 	return ret;
 }
@@ -15459,9 +15509,9 @@ static int check_core_relo(struct bpf_verifier_env *env,
 	return err;
 }
 
-static int check_btf_info(struct bpf_verifier_env *env,
-			  const union bpf_attr *attr,
-			  bpfptr_t uattr)
+static int check_btf_info_early(struct bpf_verifier_env *env,
+				const union bpf_attr *attr,
+				bpfptr_t uattr)
 {
 	struct btf *btf;
 	int err;
@@ -15481,6 +15531,24 @@ static int check_btf_info(struct bpf_verifier_env *env,
 	}
 	env->prog->aux->btf = btf;
 
+	err = check_btf_func_early(env, attr, uattr);
+	if (err)
+		return err;
+	return 0;
+}
+
+static int check_btf_info(struct bpf_verifier_env *env,
+			  const union bpf_attr *attr,
+			  bpfptr_t uattr)
+{
+	int err;
+
+	if (!attr->func_info_cnt && !attr->line_info_cnt) {
+		if (check_abnormal_return(env))
+			return -EINVAL;
+		return 0;
+	}
+
 	err = check_btf_func(env, attr, uattr);
 	if (err)
 		return err;
@@ -19990,6 +20058,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	ret = check_btf_info_early(env, attr, uattr);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = add_subprog_and_kfunc(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
cgit v1.2.3


From b9ae0c9dd0aca79bffc17be51c2dc148d1f72708 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:03 +0200
Subject: bpf: Add support for custom exception callbacks

By default, the subprog generated by the verifier to handle a thrown
exception hardcodes a return value of 0. To allow user-defined logic
and modification of the return value when an exception is thrown,
introduce the 'exception_callback:' declaration tag, which marks a
callback as the default exception handler for the program.

The format of the declaration tag is 'exception_callback:<value>', where
<value> is the name of the exception callback. Each main program can be
tagged using this BTF declaratiion tag to associate it with an exception
callback. In case the tag is absent, the default callback is used.

As such, the exception callback cannot be modified at runtime, only set
during verification.

Allowing modification of the callback for the current program execution
at runtime leads to issues when the programs begin to nest, as any
per-CPU state maintaing this information will have to be saved and
restored. We don't want it to stay in bpf_prog_aux as this takes a
global effect for all programs. An alternative solution is spilling
the callback pointer at a known location on the program stack on entry,
and then passing this location to bpf_throw as a parameter.

However, since exceptions are geared more towards a use case where they
are ideally never invoked, optimizing for this use case and adding to
the complexity has diminishing returns.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                            |   4 +-
 include/linux/bpf_verifier.h                   |   1 +
 kernel/bpf/btf.c                               |  29 +++++--
 kernel/bpf/verifier.c                          | 113 +++++++++++++++++++++++--
 tools/testing/selftests/bpf/bpf_experimental.h |  31 ++++++-
 5 files changed, 160 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 16740ee82082..30063a760b5a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2422,9 +2422,11 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg);
+			  struct bpf_reg_state *reg, bool is_ex_cb);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key);
 
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index da21a3ec5027..94ec766432f5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -300,6 +300,7 @@ struct bpf_func_state {
 	bool in_callback_fn;
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
+	bool in_exception_callback_fn;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 187b57276fec..f93e835d90af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3310,10 +3310,10 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
 	return BTF_FIELD_FOUND;
 }
 
-static const char *btf_find_decl_tag_value(const struct btf *btf,
-					   const struct btf_type *pt,
-					   int comp_idx, const char *tag_key)
+const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
+				    int comp_idx, const char *tag_key)
 {
+	const char *value = NULL;
 	int i;
 
 	for (i = 1; i < btf_nr_types(btf); i++) {
@@ -3327,9 +3327,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,
 			continue;
 		if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
 			continue;
-		return __btf_name_by_offset(btf, t->name_off) + len;
+		/* Prevent duplicate entries for same type */
+		if (value)
+			return ERR_PTR(-EEXIST);
+		value = __btf_name_by_offset(btf, t->name_off) + len;
 	}
-	return NULL;
+	if (!value)
+		return ERR_PTR(-ENOENT);
+	return value;
 }
 
 static int
@@ -3347,7 +3352,7 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
 	if (t->size != sz)
 		return BTF_FIELD_IGNORE;
 	value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
-	if (!value_type)
+	if (IS_ERR(value_type))
 		return -EINVAL;
 	node_field_name = strstr(value_type, ":");
 	if (!node_field_name)
@@ -6954,7 +6959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  * (either PTR_TO_CTX or SCALAR_VALUE).
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs)
+			  struct bpf_reg_state *regs, bool is_ex_cb)
 {
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -7011,7 +7016,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
-	/* check that function returns int */
+	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
@@ -7060,6 +7065,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
+	/* We have already ensured that the callback returns an integer, just
+	 * like all global subprogs. We need to determine it only has a single
+	 * scalar argument.
+	 */
+	if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
+		bpf_log(log, "exception cb only supports single integer argument\n");
+		return -EINVAL;
+	}
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ec767ae08c2b..ec3f22312516 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2457,6 +2457,68 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	return env->subprog_cnt - 1;
 }
 
+static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env)
+{
+	struct bpf_prog_aux *aux = env->prog->aux;
+	struct btf *btf = aux->btf;
+	const struct btf_type *t;
+	u32 main_btf_id, id;
+	const char *name;
+	int ret, i;
+
+	/* Non-zero func_info_cnt implies valid btf */
+	if (!aux->func_info_cnt)
+		return 0;
+	main_btf_id = aux->func_info[0].type_id;
+
+	t = btf_type_by_id(btf, main_btf_id);
+	if (!t) {
+		verbose(env, "invalid btf id for main subprog in func_info\n");
+		return -EINVAL;
+	}
+
+	name = btf_find_decl_tag_value(btf, t, -1, "exception_callback:");
+	if (IS_ERR(name)) {
+		ret = PTR_ERR(name);
+		/* If there is no tag present, there is no exception callback */
+		if (ret == -ENOENT)
+			ret = 0;
+		else if (ret == -EEXIST)
+			verbose(env, "multiple exception callback tags for main subprog\n");
+		return ret;
+	}
+
+	ret = btf_find_by_name_kind(btf, name, BTF_KIND_FUNC);
+	if (ret < 0) {
+		verbose(env, "exception callback '%s' could not be found in BTF\n", name);
+		return ret;
+	}
+	id = ret;
+	t = btf_type_by_id(btf, id);
+	if (btf_func_linkage(t) != BTF_FUNC_GLOBAL) {
+		verbose(env, "exception callback '%s' must have global linkage\n", name);
+		return -EINVAL;
+	}
+	ret = 0;
+	for (i = 0; i < aux->func_info_cnt; i++) {
+		if (aux->func_info[i].type_id != id)
+			continue;
+		ret = aux->func_info[i].insn_off;
+		/* Further func_info and subprog checks will also happen
+		 * later, so assume this is the right insn_off for now.
+		 */
+		if (!ret) {
+			verbose(env, "invalid exception callback insn_off in func_info: 0\n");
+			ret = -EINVAL;
+		}
+	}
+	if (!ret) {
+		verbose(env, "exception callback type id not found in func_info\n");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
 #define MAX_KFUNC_DESCS 256
 #define MAX_KFUNC_BTFS	256
 
@@ -2796,8 +2858,8 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 {
 	struct bpf_subprog_info *subprog = env->subprog_info;
+	int i, ret, insn_cnt = env->prog->len, ex_cb_insn;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int i, ret, insn_cnt = env->prog->len;
 
 	/* Add entry function. */
 	ret = add_subprog(env, 0);
@@ -2823,6 +2885,26 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 			return ret;
 	}
 
+	ret = bpf_find_exception_callback_insn_off(env);
+	if (ret < 0)
+		return ret;
+	ex_cb_insn = ret;
+
+	/* If ex_cb_insn > 0, this means that the main program has a subprog
+	 * marked using BTF decl tag to serve as the exception callback.
+	 */
+	if (ex_cb_insn) {
+		ret = add_subprog(env, ex_cb_insn);
+		if (ret < 0)
+			return ret;
+		for (i = 1; i < env->subprog_cnt; i++) {
+			if (env->subprog_info[i].start != ex_cb_insn)
+				continue;
+			env->exception_callback_subprog = i;
+			break;
+		}
+	}
+
 	/* Add a fake 'exit' subprog which could simplify subprog iteration
 	 * logic. 'subprog_cnt' should not be increased.
 	 */
@@ -5707,6 +5789,10 @@ continue_func:
 			/* async callbacks don't increase bpf prog stack size unless called directly */
 			if (!bpf_pseudo_call(insn + i))
 				continue;
+			if (subprog[sidx].is_exception_cb) {
+				verbose(env, "insn %d cannot call exception cb directly\n", i);
+				return -EINVAL;
+			}
 		}
 		i = next_insn;
 		idx = sidx;
@@ -5728,8 +5814,13 @@ continue_func:
 	 * tail call counter throughout bpf2bpf calls combined with tailcalls
 	 */
 	if (tail_call_reachable)
-		for (j = 0; j < frame; j++)
+		for (j = 0; j < frame; j++) {
+			if (subprog[ret_prog[j]].is_exception_cb) {
+				verbose(env, "cannot tail call within exception cb\n");
+				return -EINVAL;
+			}
 			subprog[ret_prog[j]].tail_call_reachable = true;
+		}
 	if (subprog[0].tail_call_reachable)
 		env->prog->aux->tail_call_reachable = true;
 
@@ -14630,7 +14721,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	const bool is_subprog = frame->subprogno;
 
 	/* LSM and struct_ops func-ptr's return type could be "void" */
-	if (!is_subprog) {
+	if (!is_subprog || frame->in_exception_callback_fn) {
 		switch (prog_type) {
 		case BPF_PROG_TYPE_LSM:
 			if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -14678,7 +14769,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 		return 0;
 	}
 
-	if (is_subprog) {
+	if (is_subprog && !frame->in_exception_callback_fn) {
 		if (reg->type != SCALAR_VALUE) {
 			verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
 				reg_type_str(env, reg->type));
@@ -19334,7 +19425,7 @@ static void free_states(struct bpf_verifier_env *env)
 	}
 }
 
-static int do_check_common(struct bpf_verifier_env *env, int subprog)
+static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state;
@@ -19365,7 +19456,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		ret = btf_prepare_func_args(env, subprog, regs);
+		ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
 		if (ret)
 			goto out;
 		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
@@ -19381,6 +19472,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				regs[i].id = ++env->id_gen;
 			}
 		}
+		if (is_ex_cb) {
+			state->frame[0]->in_exception_callback_fn = true;
+			env->subprog_info[subprog].is_cb = true;
+			env->subprog_info[subprog].is_async_cb = true;
+			env->subprog_info[subprog].is_exception_cb = true;
+		}
 	} else {
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -19445,7 +19542,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 			continue;
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
-		ret = do_check_common(env, i);
+		ret = do_check_common(env, i, env->exception_callback_subprog == i);
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -19462,7 +19559,7 @@ static int do_check_main(struct bpf_verifier_env *env)
 	int ret;
 
 	env->insn_idx = 0;
-	ret = do_check_common(env, 0);
+	ret = do_check_common(env, 0, false);
 	if (!ret)
 		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return ret;
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 333b54a86e3a..9a87170524ce 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -165,7 +165,16 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
 /* Description
  *	Throw a BPF exception from the program, immediately terminating its
  *	execution and unwinding the stack. The supplied 'cookie' parameter
- *	will be the return value of the program when an exception is thrown.
+ *	will be the return value of the program when an exception is thrown,
+ *	and the default exception callback is used. Otherwise, if an exception
+ *	callback is set using the '__exception_cb(callback)' declaration tag
+ *	on the main program, the 'cookie' parameter will be the callback's only
+ *	input argument.
+ *
+ *	Thus, in case of default exception callback, 'cookie' is subjected to
+ *	constraints on the program's return value (as with R0 on exit).
+ *	Otherwise, the return value of the marked exception callback will be
+ *	subjected to the same checks.
  *
  *	Note that throwing an exception with lingering resources (locks,
  *	references, etc.) will lead to a verification error.
@@ -178,4 +187,24 @@ extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
  */
 extern void bpf_throw(u64 cookie) __ksym;
 
+/* This macro must be used to mark the exception callback corresponding to the
+ * main program. For example:
+ *
+ * int exception_cb(u64 cookie) {
+ *	return cookie;
+ * }
+ *
+ * SEC("tc")
+ * __exception_cb(exception_cb)
+ * int main_prog(struct __sk_buff *ctx) {
+ *	...
+ *	return TC_ACT_OK;
+ * }
+ *
+ * Here, exception callback for the main program will be 'exception_cb'. Note
+ * that this attribute can only be used once, and multiple exception callbacks
+ * specified for the main program will lead to verification error.
+ */
+#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name)))
+
 #endif
-- 
cgit v1.2.3


From b62bf8a5e9110922f58f6ea8fe747e1759f49e61 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:04 +0200
Subject: bpf: Perform CFG walk for exception callback

Since exception callbacks are not referenced using bpf_pseudo_func and
bpf_pseudo_call instructions, check_cfg traversal will never explore
instructions of the exception callback. Even after adding the subprog,
the program will then fail with a 'unreachable insn' error.

We thus need to begin walking from the start of the exception callback
again in check_cfg after a complete CFG traversal finishes, so as to
explore the CFG rooted at the exception callback.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-8-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ec3f22312516..863e4e6c4616 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15126,8 +15126,8 @@ static int check_cfg(struct bpf_verifier_env *env)
 {
 	int insn_cnt = env->prog->len;
 	int *insn_stack, *insn_state;
-	int ret = 0;
-	int i;
+	int ex_insn_beg, i, ret = 0;
+	bool ex_done = false;
 
 	insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
 	if (!insn_state)
@@ -15143,6 +15143,7 @@ static int check_cfg(struct bpf_verifier_env *env)
 	insn_stack[0] = 0; /* 0 is the first instruction */
 	env->cfg.cur_stack = 1;
 
+walk_cfg:
 	while (env->cfg.cur_stack > 0) {
 		int t = insn_stack[env->cfg.cur_stack - 1];
 
@@ -15169,6 +15170,16 @@ static int check_cfg(struct bpf_verifier_env *env)
 		goto err_free;
 	}
 
+	if (env->exception_callback_subprog && !ex_done) {
+		ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
+
+		insn_state[ex_insn_beg] = DISCOVERED;
+		insn_stack[0] = ex_insn_beg;
+		env->cfg.cur_stack = 1;
+		ex_done = true;
+		goto walk_cfg;
+	}
+
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn_state[i] != EXPLORED) {
 			verbose(env, "unreachable insn %d\n", i);
-- 
cgit v1.2.3


From a923819fb2c5be029a69c0ca53239865c9bc05dd Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:05 +0200
Subject: bpf: Treat first argument as return value for bpf_throw

In case of the default exception callback, change the behavior of
bpf_throw, where the passed cookie value is no longer ignored, but
is instead the return value of the default exception callback. As
such, we need to place restrictions on the value being passed into
bpf_throw in such a case, only allowing those permitted by the
check_return_code function.

Thus, bpf_throw can now control the return value of the program from
each call site without having the user install a custom exception
callback just to override the return value when an exception is thrown.

We also modify the hidden subprog instructions to now move BPF_REG_1 to
BPF_REG_0, so as to set the return value before exit in the default
callback.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-9-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 863e4e6c4616..0ba32b626320 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11485,6 +11485,8 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int check_return_code(struct bpf_verifier_env *env, int regno);
+
 static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    int *insn_idx_p)
 {
@@ -11613,6 +11615,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			return -ENOTSUPP;
 		}
 		env->seen_exception = true;
+
+		/* In the case of the default callback, the cookie value passed
+		 * to bpf_throw becomes the return value of the program.
+		 */
+		if (!env->exception_callback_subprog) {
+			err = check_return_code(env, BPF_REG_1);
+			if (err < 0)
+				return err;
+		}
 	}
 
 	for (i = 0; i < CALLER_SAVED_REGS; i++)
@@ -14709,7 +14720,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
-static int check_return_code(struct bpf_verifier_env *env)
+static int check_return_code(struct bpf_verifier_env *env, int regno)
 {
 	struct tnum enforce_attach_type_range = tnum_unknown;
 	const struct bpf_prog *prog = env->prog;
@@ -14743,22 +14754,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 	 * of bpf_exit, which means that program wrote
 	 * something into it earlier
 	 */
-	err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+	err = check_reg_arg(env, regno, SRC_OP);
 	if (err)
 		return err;
 
-	if (is_pointer_value(env, BPF_REG_0)) {
-		verbose(env, "R0 leaks addr as return value\n");
+	if (is_pointer_value(env, regno)) {
+		verbose(env, "R%d leaks addr as return value\n", regno);
 		return -EACCES;
 	}
 
-	reg = cur_regs(env) + BPF_REG_0;
+	reg = cur_regs(env) + regno;
 
 	if (frame->in_async_callback_fn) {
 		/* enforce return zero from async callbacks like timer */
 		if (reg->type != SCALAR_VALUE) {
-			verbose(env, "In async callback the register R0 is not a known value (%s)\n",
-				reg_type_str(env, reg->type));
+			verbose(env, "In async callback the register R%d is not a known value (%s)\n",
+				regno, reg_type_str(env, reg->type));
 			return -EINVAL;
 		}
 
@@ -14771,8 +14782,8 @@ static int check_return_code(struct bpf_verifier_env *env)
 
 	if (is_subprog && !frame->in_exception_callback_fn) {
 		if (reg->type != SCALAR_VALUE) {
-			verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
-				reg_type_str(env, reg->type));
+			verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n",
+				regno, reg_type_str(env, reg->type));
 			return -EINVAL;
 		}
 		return 0;
@@ -14854,8 +14865,8 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	if (reg->type != SCALAR_VALUE) {
-		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
-			reg_type_str(env, reg->type));
+		verbose(env, "At program exit the register R%d is not a known value (%s)\n",
+			regno, reg_type_str(env, reg->type));
 		return -EINVAL;
 	}
 
@@ -17053,7 +17064,7 @@ process_bpf_exit_full:
 					continue;
 				}
 
-				err = check_return_code(env);
+				err = check_return_code(env, BPF_REG_0);
 				if (err)
 					return err;
 process_bpf_exit:
@@ -18722,7 +18733,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	if (env->seen_exception && !env->exception_callback_subprog) {
 		struct bpf_insn patch[] = {
 			env->prog->insnsi[insn_cnt - 1],
-			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
 			BPF_EXIT_INSN(),
 		};
 
-- 
cgit v1.2.3


From ec5290a178b787b2f8b21581fdadc919bd004e12 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:07 +0200
Subject: bpf: Prevent KASAN false positive with bpf_throw

The KASAN stack instrumentation when CONFIG_KASAN_STACK is true poisons
the stack of a function when it is entered and unpoisons it when
leaving. However, in the case of bpf_throw, we will never return as we
switch our stack frame to the BPF exception callback. Later, this
discrepancy will lead to confusing KASAN splats when kernel resumes
execution on return from the BPF program.

Fix this by unpoisoning everything below the stack pointer of the BPF
program, which should cover the range that would not be unpoisoned. An
example splat is below:

BUG: KASAN: stack-out-of-bounds in stack_trace_consume_entry+0x14e/0x170
Write of size 8 at addr ffffc900013af958 by task test_progs/227

CPU: 0 PID: 227 Comm: test_progs Not tainted 6.5.0-rc2-g43f1c6c9052a-dirty #26
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-2.fc39 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x4a/0x80
 print_report+0xcf/0x670
 ? arch_stack_walk+0x79/0x100
 kasan_report+0xda/0x110
 ? stack_trace_consume_entry+0x14e/0x170
 ? stack_trace_consume_entry+0x14e/0x170
 ? __pfx_stack_trace_consume_entry+0x10/0x10
 stack_trace_consume_entry+0x14e/0x170
 ? __sys_bpf+0xf2e/0x41b0
 arch_stack_walk+0x8b/0x100
 ? __sys_bpf+0xf2e/0x41b0
 ? bpf_prog_test_run_skb+0x341/0x1c70
 ? bpf_prog_test_run_skb+0x341/0x1c70
 stack_trace_save+0x9b/0xd0
 ? __pfx_stack_trace_save+0x10/0x10
 ? __kasan_slab_free+0x109/0x180
 ? bpf_prog_test_run_skb+0x341/0x1c70
 ? __sys_bpf+0xf2e/0x41b0
 ? __x64_sys_bpf+0x78/0xc0
 ? do_syscall_64+0x3c/0x90
 ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8
 kasan_save_stack+0x33/0x60
 ? kasan_save_stack+0x33/0x60
 ? kasan_set_track+0x25/0x30
 ? kasan_save_free_info+0x2b/0x50
 ? __kasan_slab_free+0x109/0x180
 ? kmem_cache_free+0x191/0x460
 ? bpf_prog_test_run_skb+0x341/0x1c70
 kasan_set_track+0x25/0x30
 kasan_save_free_info+0x2b/0x50
 __kasan_slab_free+0x109/0x180
 kmem_cache_free+0x191/0x460
 bpf_prog_test_run_skb+0x341/0x1c70
 ? __pfx_bpf_prog_test_run_skb+0x10/0x10
 ? __fget_light+0x51/0x220
 __sys_bpf+0xf2e/0x41b0
 ? __might_fault+0xa2/0x170
 ? __pfx___sys_bpf+0x10/0x10
 ? lock_release+0x1de/0x620
 ? __might_fault+0xcd/0x170
 ? __pfx_lock_release+0x10/0x10
 ? __pfx_blkcg_maybe_throttle_current+0x10/0x10
 __x64_sys_bpf+0x78/0xc0
 ? syscall_enter_from_user_mode+0x20/0x50
 do_syscall_64+0x3c/0x90
 entry_SYSCALL_64_after_hwframe+0x6e/0xd8
RIP: 0033:0x7f0fbb38880d
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d
89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f3 45 12 00 f7 d8 64
89 01 48
RSP: 002b:00007ffe13907de8 EFLAGS: 00000206 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00007ffe13908708 RCX: 00007f0fbb38880d
RDX: 0000000000000050 RSI: 00007ffe13907e20 RDI: 000000000000000a
RBP: 00007ffe13907e00 R08: 0000000000000000 R09: 00007ffe13907e20
R10: 0000000000000064 R11: 0000000000000206 R12: 0000000000000003
R13: 0000000000000000 R14: 00007f0fbb532000 R15: 0000000000cfbd90
 </TASK>

The buggy address belongs to stack of task test_progs/227
KASAN internal error: frame info validation failed; invalid marker: 0

The buggy address belongs to the virtual mapping at
 [ffffc900013a8000, ffffc900013b1000) created by:
 kernel_clone+0xcd/0x600

The buggy address belongs to the physical page:
page:00000000b70f4332 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11418f
flags: 0x2fffe0000000000(node=0|zone=2|lastcpupid=0x7fff)
page_type: 0xffffffff()
raw: 02fffe0000000000 0000000000000000 dead000000000122 0000000000000000
raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffffc900013af800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffffc900013af880: 00 00 00 f1 f1 f1 f1 00 00 00 f3 f3 f3 f3 f3 00
>ffffc900013af900: 00 00 00 00 00 00 00 00 00 00 00 f1 00 00 00 00
                                                    ^
 ffffc900013af980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffffc900013afa00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
Disabling lock debugging due to kernel taint

Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Andrey Konovalov <andreyknvl@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-11-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 78e8f4de6750..2c8e1ee97b71 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -22,6 +22,7 @@
 #include <linux/security.h>
 #include <linux/btf_ids.h>
 #include <linux/bpf_mem_alloc.h>
+#include <linux/kasan.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -2483,6 +2484,11 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 		WARN_ON_ONCE(!ctx.aux->exception_boundary);
 	WARN_ON_ONCE(!ctx.bp);
 	WARN_ON_ONCE(!ctx.cnt);
+	/* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning
+	 * deeper stack depths than ctx.sp as we do not return from bpf_throw,
+	 * which skips compiler generated instrumentation to do the same.
+	 */
+	kasan_unpoison_task_stack_below((void *)ctx.sp);
 	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
 }
 
-- 
cgit v1.2.3


From 66d9111f3517f85ef2af0337ece02683ce0faf21 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:08 +0200
Subject: bpf: Detect IP == ksym.end as part of BPF program

Now that bpf_throw kfunc is the first such call instruction that has
noreturn semantics within the verifier, this also kicks in dead code
elimination in unprecedented ways. For one, any instruction following
a bpf_throw call will never be marked as seen. Moreover, if a callchain
ends up throwing, any instructions after the call instruction to the
eventually throwing subprog in callers will also never be marked as
seen.

The tempting way to fix this would be to emit extra 'int3' instructions
which bump the jited_len of a program, and ensure that during runtime
when a program throws, we can discover its boundaries even if the call
instruction to bpf_throw (or to subprogs that always throw) is emitted
as the final instruction in the program.

An example of such a program would be this:

do_something():
	...
	r0 = 0
	exit

foo():
	r1 = 0
	call bpf_throw
	r0 = 0
	exit

bar(cond):
	if r1 != 0 goto pc+2
	call do_something
	exit
	call foo
	r0 = 0  // Never seen by verifier
	exit	//

main(ctx):
	r1 = ...
	call bar
	r0 = 0
	exit

Here, if we do end up throwing, the stacktrace would be the following:

bpf_throw
foo
bar
main

In bar, the final instruction emitted will be the call to foo, as such,
the return address will be the subsequent instruction (which the JIT
emits as int3 on x86). This will end up lying outside the jited_len of
the program, thus, when unwinding, we will fail to discover the return
address as belonging to any program and end up in a panic due to the
unreliable stack unwinding of BPF programs that we never expect.

To remedy this case, make bpf_prog_ksym_find treat IP == ksym.end as
part of the BPF program, so that is_bpf_text_address returns true when
such a case occurs, and we are able to unwind reliably when the final
instruction ends up being a call instruction.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-12-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7849b9cca749..8f921b6d6981 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -623,7 +623,11 @@ static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
 
 	if (val < ksym->start)
 		return -1;
-	if (val >= ksym->end)
+	/* Ensure that we detect return addresses as part of the program, when
+	 * the final instruction is a call for a program part of the stack
+	 * trace. Therefore, do val > ksym->end instead of val >= ksym->end.
+	 */
+	if (val > ksym->end)
 		return  1;
 
 	return 0;
-- 
cgit v1.2.3


From fd548e1a46185000191a89cae4be560e076ed6c7 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:09 +0200
Subject: bpf: Disallow fentry/fexit/freplace for exception callbacks

During testing, it was discovered that extensions to exception callbacks
had no checks, upon running a testcase, the kernel ended up running off
the end of a program having final call as bpf_throw, and hitting int3
instructions.

The reason is that while the default exception callback would have reset
the stack frame to return back to the main program's caller, the
replacing extension program will simply return back to bpf_throw, which
will instead return back to the program and the program will continue
execution, now in an undefined state where anything could happen.

The way to support extensions to an exception callback would be to mark
the BPF_PROG_TYPE_EXT main subprog as an exception_cb, and prevent it
from calling bpf_throw. This would make the JIT produce a prologue that
restores saved registers and reset the stack frame. But let's not do
that until there is a concrete use case for this, and simply disallow
this for now.

Similar issues will exist for fentry and fexit cases, where trampoline
saves data on the stack when invoking exception callback, which however
will then end up resetting the stack frame, and on return, the fexit
program will never will invoked as the return address points to the main
program's caller in the kernel. Instead of additional complexity and
back and forth between the two stacks to enable such a use case, simply
forbid it.

One key point here to note is that currently X86_TAIL_CALL_OFFSET didn't
require any modifications, even though we emit instructions before the
corresponding endbr64 instruction. This is because we ensure that a main
subprog never serves as an exception callback, and therefore the
exception callback (which will be a global subprog) can never serve as
the tail call target, eliminating any discrepancies. However, once we
support a BPF_PROG_TYPE_EXT to also act as an exception callback, it
will end up requiring change to the tail call offset to account for the
extra instructions. For simplicitly, tail calls could be disabled for
such targets.

Noting the above, it appears better to wait for a concrete use case
before choosing to permit extension programs to replace exception
callbacks.

As a precaution, we disable fentry and fexit for exception callbacks as
well.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-13-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c  | 1 +
 kernel/bpf/verifier.c | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2c8e1ee97b71..7ff2a42f1996 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2490,6 +2490,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	 */
 	kasan_unpoison_task_stack_below((void *)ctx.sp);
 	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
 __diag_pop();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0ba32b626320..5ccb50fd74e5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19750,6 +19750,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			bpf_log(log, "Subprog %s doesn't exist\n", tname);
 			return -EINVAL;
 		}
+		if (aux->func && aux->func[subprog]->aux->exception_cb) {
+			bpf_log(log,
+				"%s programs cannot attach to exception callback\n",
+				prog_extension ? "Extension" : "FENTRY/FEXIT");
+			return -EINVAL;
+		}
 		conservative = aux->func_info_aux[subprog].unreliable;
 		if (prog_extension) {
 			if (conservative) {
-- 
cgit v1.2.3


From 06d686f771ddc27a8554cd8f5b22e071040dc90e Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 13 Sep 2023 01:32:10 +0200
Subject: bpf: Fix kfunc callback register type handling

The kfunc code to handle KF_ARG_PTR_TO_CALLBACK does not check the reg
type before using reg->subprogno. This can accidently permit invalid
pointers from being passed into callback helpers (e.g. silently from
different paths). Likewise, reg->subprogno from the per-register type
union may not be meaningful either. We need to reject any other type
except PTR_TO_FUNC.

Acked-by: Dave Marchevsky <davemarchevsky@fb.com>
Fixes: 5d92ddc3de1b ("bpf: Add callback validation to kfunc verifier logic")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230912233214.1518551-14-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5ccb50fd74e5..a7178ecf676d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11407,6 +11407,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		}
 		case KF_ARG_PTR_TO_CALLBACK:
+			if (reg->type != PTR_TO_FUNC) {
+				verbose(env, "arg%d expected pointer to func\n", i);
+				return -EINVAL;
+			}
 			meta->subprogno = reg->subprogno;
 			break;
 		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
-- 
cgit v1.2.3


From fbaa6a181a4b1886cbf4214abdf9a2df68471510 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Fri, 8 Sep 2023 15:49:15 -0700
Subject: sched/core: Remove ifdeffery for saved_state

In preparation for freezer to also use saved_state, remove the
CONFIG_PREEMPT_RT compilation guard around saved_state.

On the arm64 platform I tested which did not have CONFIG_PREEMPT_RT,
there was no statistically significant deviation by applying this patch.

Test methodology:

perf bench sched message -g 40 -l 40

Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 kernel/sched/core.c   | 8 ++------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..dc37ae787e33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -750,10 +750,8 @@ struct task_struct {
 #endif
 	unsigned int			__state;
 
-#ifdef CONFIG_PREEMPT_RT
 	/* saved state for "spinlock sleepers" */
 	unsigned int			saved_state;
-#endif
 
 	/*
 	 * This begins the randomizable portion of task_struct. Only
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f39482d6a6e6..49541e3c1295 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2232,23 +2232,20 @@ int __task_state_match(struct task_struct *p, unsigned int state)
 	if (READ_ONCE(p->__state) & state)
 		return 1;
 
-#ifdef CONFIG_PREEMPT_RT
 	if (READ_ONCE(p->saved_state) & state)
 		return -1;
-#endif
+
 	return 0;
 }
 
 static __always_inline
 int task_state_match(struct task_struct *p, unsigned int state)
 {
-#ifdef CONFIG_PREEMPT_RT
 	/*
 	 * Serialize against current_save_and_set_rtlock_wait_state() and
 	 * current_restore_rtlock_saved_state().
 	 */
 	guard(raw_spinlock_irq)(&p->pi_lock);
-#endif
 	return __task_state_match(p, state);
 }
 
@@ -4038,7 +4035,6 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 
 	*success = !!(match = __task_state_match(p, state));
 
-#ifdef CONFIG_PREEMPT_RT
 	/*
 	 * Saved state preserves the task state across blocking on
 	 * an RT lock.  If the state matches, set p::saved_state to
@@ -4054,7 +4050,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 	 */
 	if (match < 0)
 		p->saved_state = TASK_RUNNING;
-#endif
+
 	return match > 0;
 }
 
-- 
cgit v1.2.3


From 8f0eed4a78a81668bc78923ea09f51a7a663c2b0 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Fri, 8 Sep 2023 15:49:16 -0700
Subject: freezer,sched: Use saved_state to reduce some spurious wakeups

After commit f5d39b020809 ("freezer,sched: Rewrite core freezer logic"),
tasks that transition directly from TASK_FREEZABLE to TASK_FROZEN  are
always woken up on the thaw path. Prior to that commit, tasks could ask
freezer to consider them "frozen enough" via freezer_do_not_count(). The
commit replaced freezer_do_not_count() with a TASK_FREEZABLE state which
allows freezer to immediately mark the task as TASK_FROZEN without
waking up the task.  This is efficient for the suspend path, but on the
thaw path, the task is always woken up even if the task didn't need to
wake up and goes back to its TASK_(UN)INTERRUPTIBLE state. Although
these tasks are capable of handling of the wakeup, we can observe a
power/perf impact from the extra wakeup.

We observed on Android many tasks wait in the TASK_FREEZABLE state
(particularly due to many of them being binder clients). We observed
nearly 4x the number of tasks and a corresponding linear increase in
latency and power consumption when thawing the system. The latency
increased from ~15ms to ~50ms.

Avoid the spurious wakeups by saving the state of TASK_FREEZABLE tasks.
If the task was running before entering TASK_FROZEN state
(__refrigerator()) or if the task received a wake up for the saved
state, then the task is woken on thaw. saved_state from PREEMPT_RT locks
can be re-used because freezer would not stomp on the rtlock wait flow:
TASK_RTLOCK_WAIT isn't considered freezable.

Reported-by: Prakash Viswalingam <quic_prakashv@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/freezer.c    | 41 +++++++++++++++++++----------------------
 kernel/sched/core.c | 23 ++++++++++++++---------
 2 files changed, 33 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index 4fad0e6fca64..c450fa8b8b5e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop)
 	for (;;) {
 		bool freeze;
 
+		raw_spin_lock_irq(&current->pi_lock);
 		set_current_state(TASK_FROZEN);
+		/* unstale saved_state so that __thaw_task() will wake us up */
+		current->saved_state = TASK_RUNNING;
+		raw_spin_unlock_irq(&current->pi_lock);
 
 		spin_lock_irq(&freezer_lock);
 		freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
@@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
 		WARN_ON_ONCE(debug_locks && p->lockdep_depth);
 #endif
 
+	p->saved_state = p->__state;
 	WRITE_ONCE(p->__state, TASK_FROZEN);
 	return TASK_FROZEN;
 }
@@ -170,42 +175,34 @@ bool freeze_task(struct task_struct *p)
 }
 
 /*
- * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
- * state in p->jobctl. If either of them got a wakeup that was missed because
- * TASK_FROZEN, then their canonical state reflects that and the below will
- * refuse to restore the special state and instead issue the wakeup.
+ * Restore the saved_state before the task entered freezer. For typical task
+ * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
  */
-static int __set_task_special(struct task_struct *p, void *arg)
+static int __restore_freezer_state(struct task_struct *p, void *arg)
 {
-	unsigned int state = 0;
+	unsigned int state = p->saved_state;
 
-	if (p->jobctl & JOBCTL_TRACED)
-		state = TASK_TRACED;
-
-	else if (p->jobctl & JOBCTL_STOPPED)
-		state = TASK_STOPPED;
-
-	if (state)
+	if (state != TASK_RUNNING) {
 		WRITE_ONCE(p->__state, state);
+		return 1;
+	}
 
-	return state;
+	return 0;
 }
 
 void __thaw_task(struct task_struct *p)
 {
-	unsigned long flags, flags2;
+	unsigned long flags;
 
 	spin_lock_irqsave(&freezer_lock, flags);
 	if (WARN_ON_ONCE(freezing(p)))
 		goto unlock;
 
-	if (lock_task_sighand(p, &flags2)) {
-		/* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
-		bool ret = task_call_func(p, __set_task_special, NULL);
-		unlock_task_sighand(p, &flags2);
-		if (ret)
-			goto unlock;
-	}
+	if (task_call_func(p, __restore_freezer_state, NULL))
+		goto unlock;
 
 	wake_up_state(p, TASK_FROZEN);
 unlock:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 49541e3c1295..5a50c4e41be9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2242,8 +2242,8 @@ static __always_inline
 int task_state_match(struct task_struct *p, unsigned int state)
 {
 	/*
-	 * Serialize against current_save_and_set_rtlock_wait_state() and
-	 * current_restore_rtlock_saved_state().
+	 * Serialize against current_save_and_set_rtlock_wait_state(),
+	 * current_restore_rtlock_saved_state(), and __refrigerator().
 	 */
 	guard(raw_spinlock_irq)(&p->pi_lock);
 	return __task_state_match(p, state);
@@ -4015,13 +4015,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * The caller holds p::pi_lock if p != current or has preemption
  * disabled when p == current.
  *
- * The rules of PREEMPT_RT saved_state:
+ * The rules of saved_state:
  *
  *   The related locking code always holds p::pi_lock when updating
  *   p::saved_state, which means the code is fully serialized in both cases.
  *
- *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- *   bits set. This allows to distinguish all wakeup scenarios.
+ *   For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ *   No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ *   For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ *   allows us to prevent early wakeup of tasks before they can be run on
+ *   asymmetric ISA architectures (eg ARMv9).
  */
 static __always_inline
 bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
@@ -4037,10 +4041,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
 
 	/*
 	 * Saved state preserves the task state across blocking on
-	 * an RT lock.  If the state matches, set p::saved_state to
-	 * TASK_RUNNING, but do not wake the task because it waits
-	 * for a lock wakeup. Also indicate success because from
-	 * the regular waker's point of view this has succeeded.
+	 * an RT lock or TASK_FREEZABLE tasks.  If the state matches,
+	 * set p::saved_state to TASK_RUNNING, but do not wake the task
+	 * because it waits for a lock wakeup or __thaw_task(). Also
+	 * indicate success because from the regular waker's point of
+	 * view this has succeeded.
 	 *
 	 * After acquiring the lock the task will restore p::__state
 	 * from p::saved_state which ensures that the regular
-- 
cgit v1.2.3


From 1528c661c24b407e92194426b0adbb43de859ce0 Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Tue, 12 Sep 2023 14:58:08 +0800
Subject: sched/fair: Ratelimit update to tg->load_avg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using sysbench to benchmark Postgres in a single docker instance
with sysbench's nr_threads set to nr_cpu, it is observed there are times
update_cfs_group() and update_load_avg() shows noticeable overhead on
a 2sockets/112core/224cpu Intel Sapphire Rapids(SPR):

    13.75%    13.74%  [kernel.vmlinux]           [k] update_cfs_group
    10.63%    10.04%  [kernel.vmlinux]           [k] update_load_avg

Annotate shows the cycles are mostly spent on accessing tg->load_avg
with update_load_avg() being the write side and update_cfs_group() being
the read side. tg->load_avg is per task group and when different tasks
of the same taskgroup running on different CPUs frequently access
tg->load_avg, it can be heavily contended.

E.g. when running postgres_sysbench on a 2sockets/112cores/224cpus Intel
Sappire Rapids, during a 5s window, the wakeup number is 14millions and
migration number is 11millions and with each migration, the task's load
will transfer from src cfs_rq to target cfs_rq and each change involves
an update to tg->load_avg. Since the workload can trigger as many wakeups
and migrations, the access(both read and write) to tg->load_avg can be
unbound. As a result, the two mentioned functions showed noticeable
overhead. With netperf/nr_client=nr_cpu/UDP_RR, the problem is worse:
during a 5s window, wakeup number is 21millions and migration number is
14millions; update_cfs_group() costs ~25% and update_load_avg() costs ~16%.

Reduce the overhead by limiting updates to tg->load_avg to at most once
per ms. The update frequency is a tradeoff between tracking accuracy and
overhead. 1ms is chosen because PELT window is roughly 1ms and it
delivered good results for the tests that I've done. After this change,
the cost of accessing tg->load_avg is greatly reduced and performance
improved. Detailed test results below.

  ==============================
  postgres_sysbench on SPR:
  25%
  base:   42382±19.8%
  patch:  50174±9.5%  (noise)

  50%
  base:   67626±1.3%
  patch:  67365±3.1%  (noise)

  75%
  base:   100216±1.2%
  patch:  112470±0.1% +12.2%

  100%
  base:    93671±0.4%
  patch:  113563±0.2% +21.2%

  ==============================
  hackbench on ICL:
  group=1
  base:    114912±5.2%
  patch:   117857±2.5%  (noise)

  group=4
  base:    359902±1.6%
  patch:   361685±2.7%  (noise)

  group=8
  base:    461070±0.8%
  patch:   491713±0.3% +6.6%

  group=16
  base:    309032±5.0%
  patch:   378337±1.3% +22.4%

  =============================
  hackbench on SPR:
  group=1
  base:    100768±2.9%
  patch:   103134±2.9%  (noise)

  group=4
  base:    413830±12.5%
  patch:   378660±16.6% (noise)

  group=8
  base:    436124±0.6%
  patch:   490787±3.2% +12.5%

  group=16
  base:    457730±3.2%
  patch:   680452±1.3% +48.8%

  ============================
  netperf/udp_rr on ICL
  25%
  base:    114413±0.1%
  patch:   115111±0.0% +0.6%

  50%
  base:    86803±0.5%
  patch:   86611±0.0%  (noise)

  75%
  base:    35959±5.3%
  patch:   49801±0.6% +38.5%

  100%
  base:    61951±6.4%
  patch:   70224±0.8% +13.4%

  ===========================
  netperf/udp_rr on SPR
  25%
  base:   104954±1.3%
  patch:  107312±2.8%  (noise)

  50%
  base:    55394±4.6%
  patch:   54940±7.4%  (noise)

  75%
  base:    13779±3.1%
  patch:   36105±1.1% +162%

  100%
  base:     9703±3.7%
  patch:   28011±0.2% +189%

  ==============================================
  netperf/tcp_stream on ICL (all in noise range)
  25%
  base:    43092±0.1%
  patch:   42891±0.5%

  50%
  base:    19278±14.9%
  patch:   22369±7.2%

  75%
  base:    16822±3.0%
  patch:   17086±2.3%

  100%
  base:    18216±0.6%
  patch:   18078±2.9%

  ===============================================
  netperf/tcp_stream on SPR (all in noise range)
  25%
  base:    34491±0.3%
  patch:   34886±0.5%

  50%
  base:    19278±14.9%
  patch:   22369±7.2%

  75%
  base:    16822±3.0%
  patch:   17086±2.3%

  100%
  base:    18216±0.6%
  patch:   18078±2.9%

Reported-by: Nitin Tekchandani <nitin.tekchandani@intel.com>
Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: David Vernet <void@manifault.com>
Tested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Tested-by: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
Link: https://lkml.kernel.org/r/20230912065808.2530-2-aaron.lu@intel.com
---
 kernel/sched/fair.c  | 13 ++++++++++++-
 kernel/sched/sched.h |  1 +
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c893721ff5b1..d0877878bcdb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3876,7 +3876,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+	long delta;
+	u64 now;
 
 	/*
 	 * No need to update load_avg for root_task_group as it is not used.
@@ -3884,9 +3885,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/*
+	 * For migration heavy workloads, access to tg->load_avg can be
+	 * unbound. Limit the update rate to at most once per ms.
+	 */
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+		return;
+
+	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+		cfs_rq->last_update_tg_load_avg = now;
 	}
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68768f47ccb7..887468c48ff6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -594,6 +594,7 @@ struct cfs_rq {
 	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u64			last_update_tg_load_avg;
 	unsigned long		tg_load_avg_contrib;
 	long			propagate;
 	long			prop_runnable_sum;
-- 
cgit v1.2.3


From 7ad0354d18ae05e9c8885251e234cbcf141f8972 Mon Sep 17 00:00:00 2001
From: GUO Zihua <guozihua@huawei.com>
Date: Fri, 18 Aug 2023 09:56:33 +0800
Subject: sched/headers: Remove duplicated includes in kernel/sched/sched.h

Remove duplicated includes of linux/cgroup.h and linux/psi.h. Both of
these includes are included regardless of the config and they are all
protected by ifndef, so no point including them again.

Signed-off-by: GUO Zihua <guozihua@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230818015633.18370-1-guozihua@huawei.com
---
 kernel/sched/sched.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 887468c48ff6..5f217b1e8f1c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,15 +74,6 @@
 
 #include "../workqueue_internal.h"
 
-#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-#endif
-
-#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
-#endif
-
 #ifdef CONFIG_PARAVIRT
 # include <asm/paravirt.h>
 # include <asm/paravirt_api_clock.h>
-- 
cgit v1.2.3


From 6b93bb41f6eaa1cc5c5f30ec3b687b380f116cd0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:00 +0206
Subject: printk: Add non-BKL (nbcon) console basic infrastructure

The current console/printk subsystem is protected by a Big Kernel Lock,
(aka console_lock) which has ill defined semantics and is more or less
stateless. This puts severe limitations on the console subsystem and
makes forced takeover and output in emergency and panic situations a
fragile endeavour that is based on try and pray.

The goal of non-BKL (nbcon) consoles is to break out of the console lock
jail and to provide a new infrastructure that avoids the pitfalls and
also allows console drivers to be gradually converted over.

The proposed infrastructure aims for the following properties:

  - Per console locking instead of global locking
  - Per console state that allows to make informed decisions
  - Stateful handover and takeover

As a first step, state is added to struct console. The per console state
is an atomic_t using a 32bit bit field.

Reserve state bits, which will be populated later in the series. Wire
it up into the console register/unregister functionality.

It was decided to use a bitfield because using a plain u32 with
mask/shift operations resulted in uncomprehensible code.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-2-john.ogness@linutronix.de
---
 include/linux/console.h  | 31 +++++++++++++++++++++
 kernel/printk/Makefile   |  2 +-
 kernel/printk/internal.h |  8 ++++++
 kernel/printk/nbcon.c    | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c   | 13 ++++++---
 5 files changed, 120 insertions(+), 4 deletions(-)
 create mode 100644 kernel/printk/nbcon.c

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 7de11c763eb3..a2d37a7a98a8 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -156,6 +156,8 @@ static inline int con_debug_leave(void)
  *			/dev/kmesg which requires a larger output buffer.
  * @CON_SUSPENDED:	Indicates if a console is suspended. If true, the
  *			printing callbacks must not be called.
+ * @CON_NBCON:		Console can operate outside of the legacy style console_lock
+ *			constraints.
  */
 enum cons_flags {
 	CON_PRINTBUFFER		= BIT(0),
@@ -166,8 +168,32 @@ enum cons_flags {
 	CON_BRL			= BIT(5),
 	CON_EXTENDED		= BIT(6),
 	CON_SUSPENDED		= BIT(7),
+	CON_NBCON		= BIT(8),
 };
 
+/**
+ * struct nbcon_state - console state for nbcon consoles
+ * @atom:	Compound of the state fields for atomic operations
+ *
+ * To be used for reading and preparing of the value stored in the nbcon
+ * state variable @console::nbcon_state.
+ */
+struct nbcon_state {
+	union {
+		unsigned int	atom;
+		struct {
+		};
+	};
+};
+
+/*
+ * The nbcon_state struct is used to easily create and interpret values that
+ * are stored in the @console::nbcon_state variable. Ensure this struct stays
+ * within the size boundaries of the atomic variable's underlying type in
+ * order to avoid any accidental truncation.
+ */
+static_assert(sizeof(struct nbcon_state) <= sizeof(int));
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@@ -187,6 +213,8 @@ enum cons_flags {
  * @dropped:		Number of unreported dropped ringbuffer records
  * @data:		Driver private data
  * @node:		hlist node for the console list
+ *
+ * @nbcon_state:	State for nbcon consoles
  */
 struct console {
 	char			name[16];
@@ -206,6 +234,9 @@ struct console {
 	unsigned long		dropped;
 	void			*data;
 	struct hlist_node	node;
+
+	/* nbcon console specific members */
+	atomic_t		__private nbcon_state;
 };
 
 #ifdef CONFIG_LOCKDEP
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index f5b388e810b9..39a2b61c7232 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y	= printk.o
-obj-$(CONFIG_PRINTK)	+= printk_safe.o
+obj-$(CONFIG_PRINTK)	+= printk_safe.o nbcon.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
 obj-$(CONFIG_PRINTK_INDEX)	+= index.o
 
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7d4979d5c3ce..2ca0ab78802c 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -3,6 +3,7 @@
  * internal.h - printk internal definitions
  */
 #include <linux/percpu.h>
+#include <linux/console.h>
 
 #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
 void __init printk_sysctl_init(void);
@@ -61,6 +62,10 @@ void defer_console_output(void);
 
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
+
+void nbcon_init(struct console *con);
+void nbcon_cleanup(struct console *con);
+
 #else
 
 #define PRINTK_PREFIX_MAX	0
@@ -76,6 +81,9 @@ u16 printk_parse_prefix(const char *text, int *level,
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline void nbcon_init(struct console *con) { }
+static inline void nbcon_cleanup(struct console *con) { }
+
 #endif /* CONFIG_PRINTK */
 
 /**
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..63d24ca62ac5
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/kernel.h>
+#include <linux/console.h>
+#include "internal.h"
+/*
+ * Printk console printing implementation for consoles which does not depend
+ * on the legacy style console_lock mechanism.
+ */
+
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con:	Console to update
+ * @new:	The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+	atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con:	Console to read
+ * @state:	The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+	state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con:	Console to update
+ * @cur:	Old/expected state
+ * @new:	New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+					   struct nbcon_state *new)
+{
+	return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+/**
+ * nbcon_init - Initialize the nbcon console specific data
+ * @con:	Console to initialize
+ */
+void nbcon_init(struct console *con)
+{
+	struct nbcon_state state = { };
+
+	nbcon_state_set(con, &state);
+}
+
+/**
+ * nbcon_cleanup - Cleanup the nbcon console specific data
+ * @con:	Console to cleanup
+ */
+void nbcon_cleanup(struct console *con)
+{
+	struct nbcon_state state = { };
+
+	nbcon_state_set(con, &state);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7e0b4dd02398..5f372eaceb29 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3326,9 +3326,10 @@ static void try_enable_default_console(struct console *newcon)
 		newcon->flags |= CON_CONSDEV;
 }
 
-#define con_printk(lvl, con, fmt, ...)			\
-	printk(lvl pr_fmt("%sconsole [%s%d] " fmt),	\
-	       (con->flags & CON_BOOT) ? "boot" : "",	\
+#define con_printk(lvl, con, fmt, ...)				\
+	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
+	       (con->flags & CON_NBCON) ? "" : "legacy ",	\
+	       (con->flags & CON_BOOT) ? "boot" : "",		\
 	       con->name, con->index, ##__VA_ARGS__)
 
 static void console_init_seq(struct console *newcon, bool bootcon_registered)
@@ -3488,6 +3489,9 @@ void register_console(struct console *newcon)
 	newcon->dropped = 0;
 	console_init_seq(newcon, bootcon_registered);
 
+	if (newcon->flags & CON_NBCON)
+		nbcon_init(newcon);
+
 	/*
 	 * Put this console in the list - keep the
 	 * preferred driver at the head of the list.
@@ -3579,6 +3583,9 @@ static int unregister_console_locked(struct console *console)
 	 */
 	synchronize_srcu(&console_srcu);
 
+	if (console->flags & CON_NBCON)
+		nbcon_cleanup(console);
+
 	console_sysfs_notify();
 
 	if (console->exit)
-- 
cgit v1.2.3


From 3a5bb25162b880da749f7cdf281c78dbade4164b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:01 +0206
Subject: printk: nbcon: Add acquire/release logic

Add per console acquire/release functionality.

The state of the console is maintained in the "nbcon_state" atomic
variable.

The console is locked when:

  - The 'prio' field contains the priority of the context that owns the
    console. Only higher priority contexts are allowed to take over the
    lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.

  - The 'cpu' field denotes on which CPU the console is locked. It is used
    to prevent busy waiting on the same CPU. Also it informs the lock owner
    that it has lost the lock in a more complex scenario when the lock was
    taken over by a higher priority context, released, and taken on another
    CPU with the same priority as the interrupted owner.

The acquire mechanism uses a few more fields:

  - The 'req_prio' field is used by the handover approach to make the
    current owner aware that there is a context with a higher priority
    waiting for the friendly handover.

  - The 'unsafe' field allows to take over the console in a safe way in the
    middle of emitting a message. The field is set only when accessing some
    shared resources or when the console device is manipulated. It can be
    cleared, for example, after emitting one character when the console
    device is in a consistent state.

  - The 'unsafe_takeover' field is set when a hostile takeover took the
    console in an unsafe state. The console will stay in the unsafe state
    until re-initialized.

The acquire mechanism uses three approaches:

  1) Direct acquire when the console is not owned or is owned by a lower
     priority context and is in a safe state.

  2) Friendly handover mechanism uses a request/grant handshake. It is used
     when the current owner has lower priority and the console is in an
     unsafe state.

     The requesting context:

       a) Sets its priority into the 'req_prio' field.

       b) Waits (with a timeout) for the owning context to unlock the
          console.

       c) Takes the lock and clears the 'req_prio' field.

     The owning context:

       a) Observes the 'req_prio' field set on exit from the unsafe
          console state.

       b) Gives up console ownership by clearing the 'prio' field.

  3) Unsafe hostile takeover allows to take over the lock even when the
     console is an unsafe state. It is used only in panic() by the final
     attempt to flush consoles in a try and hope mode.

     Note that separate record buffers are used in panic(). As a result,
     the messages can be read and formatted without any risk even after
     using the hostile takeover in unsafe state.

The release function simply clears the 'prio' field.

All operations on @console::nbcon_state are atomic cmpxchg based to
handle concurrency.

The acquire/release functions implement only minimal policies:

  - Preference for higher priority contexts.
  - Protection of the panic CPU.

All other policy decisions must be made at the call sites:

  - What is marked as an unsafe section.
  - Whether to spin-wait if there is already an owner and the console is
    in an unsafe state.
  - Whether to attempt an unsafe hostile takeover.

The design allows to implement the well known:

    acquire()
    output_one_printk_record()
    release()

The output of one printk record might be interrupted with a higher priority
context. The new owner is supposed to reprint the entire interrupted record
from scratch.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-3-john.ogness@linutronix.de
---
 include/linux/console.h |  56 ++++++
 kernel/printk/nbcon.c   | 497 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 553 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index a2d37a7a98a8..98210fd01f18 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -175,13 +175,29 @@ enum cons_flags {
  * struct nbcon_state - console state for nbcon consoles
  * @atom:	Compound of the state fields for atomic operations
  *
+ * @req_prio:		The priority of a handover request
+ * @prio:		The priority of the current owner
+ * @unsafe:		Console is busy in a non takeover region
+ * @unsafe_takeover:	A hostile takeover in an unsafe state happened in the
+ *			past. The console cannot be safe until re-initialized.
+ * @cpu:		The CPU on which the owner runs
+ *
  * To be used for reading and preparing of the value stored in the nbcon
  * state variable @console::nbcon_state.
+ *
+ * The @prio and @req_prio fields are particularly important to allow
+ * spin-waiting to timeout and give up without the risk of a waiter being
+ * assigned the lock after giving up.
  */
 struct nbcon_state {
 	union {
 		unsigned int	atom;
 		struct {
+			unsigned int prio		:  2;
+			unsigned int req_prio		:  2;
+			unsigned int unsafe		:  1;
+			unsigned int unsafe_takeover	:  1;
+			unsigned int cpu		: 24;
 		};
 	};
 };
@@ -194,6 +210,46 @@ struct nbcon_state {
  */
 static_assert(sizeof(struct nbcon_state) <= sizeof(int));
 
+/**
+ * nbcon_prio - console owner priority for nbcon consoles
+ * @NBCON_PRIO_NONE:		Unused
+ * @NBCON_PRIO_NORMAL:		Normal (non-emergency) usage
+ * @NBCON_PRIO_EMERGENCY:	Emergency output (WARN/OOPS...)
+ * @NBCON_PRIO_PANIC:		Panic output
+ * @NBCON_PRIO_MAX:		The number of priority levels
+ *
+ * A higher priority context can takeover the console when it is
+ * in the safe state. The final attempt to flush consoles in panic()
+ * can be allowed to do so even in an unsafe state (Hope and pray).
+ */
+enum nbcon_prio {
+	NBCON_PRIO_NONE = 0,
+	NBCON_PRIO_NORMAL,
+	NBCON_PRIO_EMERGENCY,
+	NBCON_PRIO_PANIC,
+	NBCON_PRIO_MAX,
+};
+
+struct console;
+
+/**
+ * struct nbcon_context - Context for console acquire/release
+ * @console:			The associated console
+ * @spinwait_max_us:		Limit for spin-wait acquire
+ * @prio:			Priority of the context
+ * @allow_unsafe_takeover:	Allow performing takeover even if unsafe. Can
+ *				be used only with NBCON_PRIO_PANIC @prio. It
+ *				might cause a system freeze when the console
+ *				is used later.
+ */
+struct nbcon_context {
+	/* members set by caller */
+	struct console		*console;
+	unsigned int		spinwait_max_us;
+	enum nbcon_prio		prio;
+	unsigned int		allow_unsafe_takeover	: 1;
+};
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 63d24ca62ac5..a2a354f859f9 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -4,10 +4,98 @@
 
 #include <linux/kernel.h>
 #include <linux/console.h>
+#include <linux/delay.h>
 #include "internal.h"
 /*
  * Printk console printing implementation for consoles which does not depend
  * on the legacy style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ *   - The 'prio' field contains the priority of the context that owns the
+ *     console. Only higher priority contexts are allowed to take over the
+ *     lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ *   - The 'cpu' field denotes on which CPU the console is locked. It is used
+ *     to prevent busy waiting on the same CPU. Also it informs the lock owner
+ *     that it has lost the lock in a more complex scenario when the lock was
+ *     taken over by a higher priority context, released, and taken on another
+ *     CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ *   - The 'req_prio' field is used by the handover approach to make the
+ *     current owner aware that there is a context with a higher priority
+ *     waiting for the friendly handover.
+ *
+ *   - The 'unsafe' field allows to take over the console in a safe way in the
+ *     middle of emitting a message. The field is set only when accessing some
+ *     shared resources or when the console device is manipulated. It can be
+ *     cleared, for example, after emitting one character when the console
+ *     device is in a consistent state.
+ *
+ *   - The 'unsafe_takeover' field is set when a hostile takeover took the
+ *     console in an unsafe state. The console will stay in the unsafe state
+ *     until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ *   1) Direct acquire when the console is not owned or is owned by a lower
+ *      priority context and is in a safe state.
+ *
+ *   2) Friendly handover mechanism uses a request/grant handshake. It is used
+ *      when the current owner has lower priority and the console is in an
+ *      unsafe state.
+ *
+ *      The requesting context:
+ *
+ *        a) Sets its priority into the 'req_prio' field.
+ *
+ *        b) Waits (with a timeout) for the owning context to unlock the
+ *           console.
+ *
+ *        c) Takes the lock and clears the 'req_prio' field.
+ *
+ *      The owning context:
+ *
+ *        a) Observes the 'req_prio' field set on exit from the unsafe
+ *           console state.
+ *
+ *        b) Gives up console ownership by clearing the 'prio' field.
+ *
+ *   3) Unsafe hostile takeover allows to take over the lock even when the
+ *      console is an unsafe state. It is used only in panic() by the final
+ *      attempt to flush consoles in a try and hope mode.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ *   - Preference for higher priority contexts.
+ *   - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ *   - What is marked as an unsafe section.
+ *   - Whether to spin-wait if there is already an owner and the console is
+ *     in an unsafe state.
+ *   - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows to implement the well known:
+ *
+ *     acquire()
+ *     output_one_printk_record()
+ *     release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
  */
 
 /**
@@ -47,6 +135,415 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta
 	return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
 }
 
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return:	0 on success. Otherwise, an error code on failure. Also @cur
+ *		is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ *	-EPERM:		A panic is in progress and this is not the panic CPU.
+ *			Or the current owner or waiter has the same or higher
+ *			priority. No acquire method can be successful in
+ *			this case.
+ *
+ *	-EBUSY:		The current owner has a lower priority but the console
+ *			in an unsafe state. The caller should try using
+ *			the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+					    struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+
+	do {
+		if (other_cpu_in_panic())
+			return -EPERM;
+
+		if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+			return -EPERM;
+
+		if (cur->unsafe)
+			return -EBUSY;
+
+		/*
+		 * The console should never be safe for a direct acquire
+		 * if an unsafe hostile takeover has ever happened.
+		 */
+		WARN_ON_ONCE(cur->unsafe_takeover);
+
+		new.atom = cur->atom;
+		new.prio	= ctxt->prio;
+		new.req_prio	= NBCON_PRIO_NONE;
+		new.unsafe	= cur->unsafe_takeover;
+		new.cpu		= cpu;
+
+	} while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+	return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+	/*
+	 * The request context is well defined by the @req_prio because:
+	 *
+	 * - Only a context with a higher priority can take over the request.
+	 * - There are only three priorities.
+	 * - Only one CPU is allowed to request PANIC priority.
+	 * - Lower priorities are ignored during panic() until reboot.
+	 *
+	 * As a result, the following scenario is *not* possible:
+	 *
+	 * 1. Another context with a higher priority directly takes ownership.
+	 * 2. The higher priority context releases the ownership.
+	 * 3. A lower priority context takes the ownership.
+	 * 4. Another context with the same priority as this context
+	 *    creates a request and starts waiting.
+	 */
+
+	return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ *					 requested a handover
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return:	0 on success and @cur is updated to the new console state.
+ *		Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ *	-EPERM:		A panic is in progress and this is not the panic CPU
+ *			or this context is no longer the waiter.
+ *
+ *	-EBUSY:		The console is still locked. The caller should
+ *			continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ *       except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+					       struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+
+	/* Note that the caller must still remove the request! */
+	if (other_cpu_in_panic())
+		return -EPERM;
+
+	/*
+	 * Note that the waiter will also change if there was an unsafe
+	 * hostile takeover.
+	 */
+	if (!nbcon_waiter_matches(cur, ctxt->prio))
+		return -EPERM;
+
+	/* If still locked, caller should continue waiting. */
+	if (cur->prio != NBCON_PRIO_NONE)
+		return -EBUSY;
+
+	/*
+	 * The previous owner should have never released ownership
+	 * in an unsafe region.
+	 */
+	WARN_ON_ONCE(cur->unsafe);
+
+	new.atom = cur->atom;
+	new.prio	= ctxt->prio;
+	new.req_prio	= NBCON_PRIO_NONE;
+	new.unsafe	= cur->unsafe_takeover;
+	new.cpu		= cpu;
+
+	if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+		/*
+		 * The acquire could fail only when it has been taken
+		 * over by a higher priority context.
+		 */
+		WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+		return -EPERM;
+	}
+
+	/* Handover success. This context now owns the console. */
+	return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher context takes over the request, or timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return:	0 on success. Otherwise, an error code on failure. Also @cur
+ *		is updated to the latest state when failed to modify it.
+ *
+ * Errors:
+ *
+ *	-EPERM:		A panic is in progress and this is not the panic CPU.
+ *			Or a higher priority context has taken over the
+ *			console or the handover request.
+ *
+ *	-EBUSY:		The current owner is on the same CPU so that the hand
+ *			shake could not work. Or the current owner is not
+ *			willing to wait (zero timeout). Or the console does
+ *			not enter the safe state before timeout passed. The
+ *			caller might still use the unsafe hostile takeover
+ *			when allowed.
+ *
+ *	-EAGAIN:	@cur has changed when creating the handover request.
+ *			The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+					      struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+	int timeout;
+	int request_err = -EBUSY;
+
+	/*
+	 * Check that the handover is called when the direct acquire failed
+	 * with -EBUSY.
+	 */
+	WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+	WARN_ON_ONCE(!cur->unsafe);
+
+	/* Handover is not possible on the same CPU. */
+	if (cur->cpu == cpu)
+		return -EBUSY;
+
+	/*
+	 * Console stays unsafe after an unsafe takeover until re-initialized.
+	 * Waiting is not going to help in this case.
+	 */
+	if (cur->unsafe_takeover)
+		return -EBUSY;
+
+	/* Is the caller willing to wait? */
+	if (ctxt->spinwait_max_us == 0)
+		return -EBUSY;
+
+	/*
+	 * Setup a request for the handover. The caller should try to acquire
+	 * the console directly when the current state has been modified.
+	 */
+	new.atom = cur->atom;
+	new.req_prio = ctxt->prio;
+	if (!nbcon_state_try_cmpxchg(con, cur, &new))
+		return -EAGAIN;
+
+	cur->atom = new.atom;
+
+	/* Wait until there is no owner and then acquire the console. */
+	for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+		/* On successful acquire, this request is cleared. */
+		request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+		if (!request_err)
+			return 0;
+
+		/*
+		 * If the acquire should be aborted, it must be ensured
+		 * that the request is removed before returning to caller.
+		 */
+		if (request_err == -EPERM)
+			break;
+
+		udelay(1);
+
+		/* Re-read the state because some time has passed. */
+		nbcon_state_read(con, cur);
+	}
+
+	/* Timed out or aborted. Carefully remove handover request. */
+	do {
+		/*
+		 * No need to remove request if there is a new waiter. This
+		 * can only happen if a higher priority context has taken over
+		 * the console or the handover request.
+		 */
+		if (!nbcon_waiter_matches(cur, ctxt->prio))
+			return -EPERM;
+
+		/* Unset request for handover. */
+		new.atom = cur->atom;
+		new.req_prio = NBCON_PRIO_NONE;
+		if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+			/*
+			 * Request successfully unset. Report failure of
+			 * acquiring via handover.
+			 */
+			cur->atom = new.atom;
+			return request_err;
+		}
+
+		/*
+		 * Unable to remove request. Try to acquire in case
+		 * the owner has released the lock.
+		 */
+	} while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+	/* Lucky timing. The acquire succeeded while removing the request. */
+	return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt:	The context of the caller
+ * @cur:	The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * Return:	0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+					     struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state new;
+
+	if (!ctxt->allow_unsafe_takeover)
+		return -EPERM;
+
+	/* Ensure caller is allowed to perform unsafe hostile takeovers. */
+	if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+		return -EPERM;
+
+	/*
+	 * Check that try_acquire_direct() and try_acquire_handover() returned
+	 * -EBUSY in the right situation.
+	 */
+	WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+	WARN_ON_ONCE(cur->unsafe != true);
+
+	do {
+		new.atom = cur->atom;
+		new.cpu			= cpu;
+		new.prio		= ctxt->prio;
+		new.unsafe		|= cur->unsafe_takeover;
+		new.unsafe_takeover	|= cur->unsafe;
+
+	} while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+	return 0;
+}
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt:	The context of the caller
+ *
+ * Return:	True if the console was acquired. False otherwise.
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+__maybe_unused
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+{
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+	int err;
+
+	nbcon_state_read(con, &cur);
+try_again:
+	err = nbcon_context_try_acquire_direct(ctxt, &cur);
+	if (err != -EBUSY)
+		goto out;
+
+	err = nbcon_context_try_acquire_handover(ctxt, &cur);
+	if (err == -EAGAIN)
+		goto try_again;
+	if (err != -EBUSY)
+		goto out;
+
+	err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+	return !err;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+				int expected_prio)
+{
+	/*
+	 * Since consoles can only be acquired by higher priorities,
+	 * owning contexts are uniquely identified by @prio. However,
+	 * since contexts can unexpectedly lose ownership, it is
+	 * possible that later another owner appears with the same
+	 * priority. For this reason @cpu is also needed.
+	 */
+
+	if (cur->prio != expected_prio)
+		return false;
+
+	if (cur->cpu != expected_cpu)
+		return false;
+
+	return true;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt:	The nbcon context from nbcon_context_try_acquire()
+ */
+__maybe_unused
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+	unsigned int cpu = smp_processor_id();
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+	struct nbcon_state new;
+
+	nbcon_state_read(con, &cur);
+
+	do {
+		if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+			return;
+
+		new.atom = cur.atom;
+		new.prio = NBCON_PRIO_NONE;
+
+		/*
+		 * If @unsafe_takeover is set, it is kept set so that
+		 * the state remains permanently unsafe.
+		 */
+		new.unsafe |= cur.unsafe_takeover;
+
+	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+}
+
 /**
  * nbcon_init - Initialize the nbcon console specific data
  * @con:	Console to initialize
-- 
cgit v1.2.3


From d818b56f77521ecc5e3eda71dc9b2beb3d6681e3 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:02 +0206
Subject: printk: Make static printk buffers available to nbcon

The nbcon boot consoles also need printk buffers that are available
very early. Since the nbcon boot consoles will also be serialized
by the console_lock, they can use the same static printk buffers
that the legacy consoles are using.

Make the legacy static printk buffers available outside of printk.c
so they can be used by nbcon.c.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-4-john.ogness@linutronix.de
---
 kernel/printk/internal.h |  2 ++
 kernel/printk/printk.c   | 13 +++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2ca0ab78802c..7199d60bfc25 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -86,6 +86,8 @@ static inline void nbcon_cleanup(struct console *con) { }
 
 #endif /* CONFIG_PRINTK */
 
+extern struct printk_buffers printk_shared_pbufs;
+
 /**
  * struct printk_buffers - Buffers to read/format/output printk messages.
  * @outbuf:	After formatting, contains text to output.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5f372eaceb29..17def3791bc0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2846,6 +2846,13 @@ out:
 	return true;
 }
 
+/*
+ * Used as the printk buffers for non-panic, serialized console printing.
+ * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
+ * Its usage requires the console_lock held.
+ */
+struct printk_buffers printk_shared_pbufs;
+
 /*
  * Print one record for the given console. The record printed is whatever
  * record is the next available record for the given console.
@@ -2863,12 +2870,10 @@ out:
  */
 static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
 {
-	static struct printk_buffers pbufs;
-
 	bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
-	char *outbuf = &pbufs.outbuf[0];
+	char *outbuf = &printk_shared_pbufs.outbuf[0];
 	struct printk_message pmsg = {
-		.pbufs = &pbufs,
+		.pbufs = &printk_shared_pbufs,
 	};
 	unsigned long flags;
 
-- 
cgit v1.2.3


From 5634c90fd8553de7cbafecd048d0273690a2e84e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:03 +0206
Subject: printk: nbcon: Add buffer management

In case of hostile takeovers it must be ensured that the previous
owner cannot scribble over the output buffer of the emergency/panic
context. This is achieved by:

 - Adding a global output buffer instance for the panic context.
   This is the only situation where hostile takeovers can occur and
   there is always at most 1 panic context.

 - Allocating an output buffer per non-boot console upon console
   registration. This buffer is used by the console owner when not
   in panic context. (For boot consoles, the existing shared global
   legacy output buffer is used instead. Boot console printing will
   be synchronized with legacy console printing.)

 - Choosing the appropriate buffer is handled in the acquire/release
   functions.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-5-john.ogness@linutronix.de
---
 include/linux/console.h  |  7 +++++
 kernel/printk/internal.h | 12 ++++++--
 kernel/printk/nbcon.c    | 73 ++++++++++++++++++++++++++++++++++++++++++++----
 kernel/printk/printk.c   | 22 +++++++++------
 4 files changed, 99 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 98210fd01f18..ca1ef8700e55 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -231,6 +231,7 @@ enum nbcon_prio {
 };
 
 struct console;
+struct printk_buffers;
 
 /**
  * struct nbcon_context - Context for console acquire/release
@@ -241,6 +242,7 @@ struct console;
  *				be used only with NBCON_PRIO_PANIC @prio. It
  *				might cause a system freeze when the console
  *				is used later.
+ * @pbufs:			Pointer to the text buffer for this context
  */
 struct nbcon_context {
 	/* members set by caller */
@@ -248,6 +250,9 @@ struct nbcon_context {
 	unsigned int		spinwait_max_us;
 	enum nbcon_prio		prio;
 	unsigned int		allow_unsafe_takeover	: 1;
+
+	/* members set by acquire */
+	struct printk_buffers	*pbufs;
 };
 
 /**
@@ -271,6 +276,7 @@ struct nbcon_context {
  * @node:		hlist node for the console list
  *
  * @nbcon_state:	State for nbcon consoles
+ * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
 	char			name[16];
@@ -293,6 +299,7 @@ struct console {
 
 	/* nbcon console specific members */
 	atomic_t		__private nbcon_state;
+	struct printk_buffers	*pbufs;
 };
 
 #ifdef CONFIG_LOCKDEP
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7199d60bfc25..f6161cd75d7d 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -13,6 +13,12 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 #define printk_sysctl_init() do { } while (0)
 #endif
 
+#define con_printk(lvl, con, fmt, ...)				\
+	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
+		(con->flags & CON_NBCON) ? "" : "legacy ",	\
+		(con->flags & CON_BOOT) ? "boot" : "",		\
+		con->name, con->index, ##__VA_ARGS__)
+
 #ifdef CONFIG_PRINTK
 
 #ifdef CONFIG_PRINTK_CALLER
@@ -63,8 +69,9 @@ void defer_console_output(void);
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
 
+bool nbcon_alloc(struct console *con);
 void nbcon_init(struct console *con);
-void nbcon_cleanup(struct console *con);
+void nbcon_free(struct console *con);
 
 #else
 
@@ -81,8 +88,9 @@ void nbcon_cleanup(struct console *con);
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline bool nbcon_alloc(struct console *con) { return false; }
 static inline void nbcon_init(struct console *con) { }
-static inline void nbcon_cleanup(struct console *con) { }
+static inline void nbcon_free(struct console *con) { }
 
 #endif /* CONFIG_PRINTK */
 
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index a2a354f859f9..ba1febf15db6 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/console.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include "internal.h"
 /*
  * Printk console printing implementation for consoles which does not depend
@@ -70,6 +71,10 @@
  *      console is an unsafe state. It is used only in panic() by the final
  *      attempt to flush consoles in a try and hope mode.
  *
+ *      Note that separate record buffers are used in panic(). As a result,
+ *      the messages can be read and formatted without any risk even after
+ *      using the hostile takeover in unsafe state.
+ *
  * The release function simply clears the 'prio' field.
  *
  * All operations on @console::nbcon_state are atomic cmpxchg based to
@@ -459,6 +464,8 @@ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
 	return 0;
 }
 
+static struct printk_buffers panic_nbcon_pbufs;
+
 /**
  * nbcon_context_try_acquire - Try to acquire nbcon console
  * @ctxt:	The context of the caller
@@ -473,6 +480,7 @@ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
 __maybe_unused
 static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
 {
+	unsigned int cpu = smp_processor_id();
 	struct console *con = ctxt->console;
 	struct nbcon_state cur;
 	int err;
@@ -491,7 +499,18 @@ try_again:
 
 	err = nbcon_context_try_acquire_hostile(ctxt, &cur);
 out:
-	return !err;
+	if (err)
+		return false;
+
+	/* Acquire succeeded. */
+
+	/* Assign the appropriate buffer for this context. */
+	if (atomic_read(&panic_cpu) == cpu)
+		ctxt->pbufs = &panic_nbcon_pbufs;
+	else
+		ctxt->pbufs = con->pbufs;
+
+	return true;
 }
 
 static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
@@ -530,7 +549,7 @@ static void nbcon_context_release(struct nbcon_context *ctxt)
 
 	do {
 		if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
-			return;
+			break;
 
 		new.atom = cur.atom;
 		new.prio = NBCON_PRIO_NONE;
@@ -542,26 +561,70 @@ static void nbcon_context_release(struct nbcon_context *ctxt)
 		new.unsafe |= cur.unsafe_takeover;
 
 	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+	ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_alloc - Allocate buffers needed by the nbcon console
+ * @con:	Console to allocate buffers for
+ *
+ * Return:	True on success. False otherwise and the console cannot
+ *		be used.
+ *
+ * This is not part of nbcon_init() because buffer allocation must
+ * be performed earlier in the console registration process.
+ */
+bool nbcon_alloc(struct console *con)
+{
+	if (con->flags & CON_BOOT) {
+		/*
+		 * Boot console printing is synchronized with legacy console
+		 * printing, so boot consoles can share the same global printk
+		 * buffers.
+		 */
+		con->pbufs = &printk_shared_pbufs;
+	} else {
+		con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+		if (!con->pbufs) {
+			con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+			return false;
+		}
+	}
+
+	return true;
 }
 
 /**
  * nbcon_init - Initialize the nbcon console specific data
  * @con:	Console to initialize
+ *
+ * nbcon_alloc() *must* be called and succeed before this function
+ * is called.
  */
 void nbcon_init(struct console *con)
 {
 	struct nbcon_state state = { };
 
+	/* nbcon_alloc() must have been called and successful! */
+	BUG_ON(!con->pbufs);
+
 	nbcon_state_set(con, &state);
 }
 
 /**
- * nbcon_cleanup - Cleanup the nbcon console specific data
- * @con:	Console to cleanup
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con:	Console to free/cleanup nbcon data
  */
-void nbcon_cleanup(struct console *con)
+void nbcon_free(struct console *con)
 {
 	struct nbcon_state state = { };
 
 	nbcon_state_set(con, &state);
+
+	/* Boot consoles share global printk buffers. */
+	if (!(con->flags & CON_BOOT))
+		kfree(con->pbufs);
+
+	con->pbufs = NULL;
 }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 17def3791bc0..1c9720acd960 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3331,12 +3331,6 @@ static void try_enable_default_console(struct console *newcon)
 		newcon->flags |= CON_CONSDEV;
 }
 
-#define con_printk(lvl, con, fmt, ...)				\
-	printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),		\
-	       (con->flags & CON_NBCON) ? "" : "legacy ",	\
-	       (con->flags & CON_BOOT) ? "boot" : "",		\
-	       con->name, con->index, ##__VA_ARGS__)
-
 static void console_init_seq(struct console *newcon, bool bootcon_registered)
 {
 	struct console *con;
@@ -3450,6 +3444,15 @@ void register_console(struct console *newcon)
 		goto unlock;
 	}
 
+	if (newcon->flags & CON_NBCON) {
+		/*
+		 * Ensure the nbcon console buffers can be allocated
+		 * before modifying any global data.
+		 */
+		if (!nbcon_alloc(newcon))
+			goto unlock;
+	}
+
 	/*
 	 * See if we want to enable this console driver by default.
 	 *
@@ -3477,8 +3480,11 @@ void register_console(struct console *newcon)
 		err = try_enable_preferred_console(newcon, false);
 
 	/* printk() messages are not printed to the Braille console. */
-	if (err || newcon->flags & CON_BRL)
+	if (err || newcon->flags & CON_BRL) {
+		if (newcon->flags & CON_NBCON)
+			nbcon_free(newcon);
 		goto unlock;
+	}
 
 	/*
 	 * If we have a bootconsole, and are switching to a real console,
@@ -3589,7 +3595,7 @@ static int unregister_console_locked(struct console *console)
 	synchronize_srcu(&console_srcu);
 
 	if (console->flags & CON_NBCON)
-		nbcon_cleanup(console);
+		nbcon_free(console);
 
 	console_sysfs_notify();
 
-- 
cgit v1.2.3


From 4b08d9e24f50163a8bda0a702b45b06bf841d792 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:04 +0206
Subject: printk: nbcon: Add ownership state functions

Provide functions that are related to the safe handover mechanism
and allow console drivers to dynamically specify unsafe regions:

 - nbcon_context_can_proceed()

   Invoked by a console owner to check whether a handover request
   is pending or whether the console has been taken over by another
   context. If a handover request is pending, this function will
   also perform the handover, thus cancelling its own ownership.

 - nbcon_context_enter_unsafe()/nbcon_context_exit_unsafe()

   Invoked by a console owner to denote that the driver is about
   to enter or leave a critical region where a take over is unsafe.
   The exit variant is the point where the current owner releases
   the lock for a higher priority context which asked for the
   friendly handover.

   The unsafe state is stored in the console state and allows a
   new context to make informed decisions whether to attempt a
   takeover of such a console. The unsafe state is also available
   to the driver so that it can make informed decisions about the
   required actions and possibly take a special emergency path.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-6-john.ogness@linutronix.de
---
 kernel/printk/nbcon.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index ba1febf15db6..98e4be5429f0 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -537,7 +537,6 @@ static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
  * nbcon_context_release - Release the console
  * @ctxt:	The nbcon context from nbcon_context_try_acquire()
  */
-__maybe_unused
 static void nbcon_context_release(struct nbcon_context *ctxt)
 {
 	unsigned int cpu = smp_processor_id();
@@ -565,6 +564,128 @@ static void nbcon_context_release(struct nbcon_context *ctxt)
 	ctxt->pbufs = NULL;
 }
 
+/**
+ * nbcon_context_can_proceed - Check whether ownership can proceed
+ * @ctxt:	The nbcon context from nbcon_context_try_acquire()
+ * @cur:	The current console state
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * Must be invoked when entering the unsafe state to make sure that it still
+ * owns the lock. Also must be invoked when exiting the unsafe context
+ * to eventually free the lock for a higher priority context which asked
+ * for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe
+ * state.
+ *
+ * Also it can be called in the safe context before doing an expensive
+ * safe operation. It does not make sense to do the operation when
+ * a higher priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
+{
+	unsigned int cpu = smp_processor_id();
+
+	/* Make sure this context still owns the console. */
+	if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
+		return false;
+
+	/* The console owner can proceed if there is no waiter. */
+	if (cur->req_prio == NBCON_PRIO_NONE)
+		return true;
+
+	/*
+	 * A console owner within an unsafe region is always allowed to
+	 * proceed, even if there are waiters. It can perform a handover
+	 * when exiting the unsafe region. Otherwise the waiter will
+	 * need to perform an unsafe hostile takeover.
+	 */
+	if (cur->unsafe)
+		return true;
+
+	/* Waiters always have higher priorities than owners. */
+	WARN_ON_ONCE(cur->req_prio <= cur->prio);
+
+	/*
+	 * Having a safe point for take over and eventually a few
+	 * duplicated characters or a full line is way better than a
+	 * hostile takeover. Post processing can take care of the garbage.
+	 * Release and hand over.
+	 */
+	nbcon_context_release(ctxt);
+
+	/*
+	 * It is not clear whether the waiter really took over ownership. The
+	 * outermost callsite must make the final decision whether console
+	 * ownership is needed for it to proceed. If yes, it must reacquire
+	 * ownership (possibly hostile) before carefully proceeding.
+	 *
+	 * The calling context no longer owns the console so go back all the
+	 * way instead of trying to implement reacquire heuristics in tons of
+	 * places.
+	 */
+	return false;
+}
+
+#define nbcon_context_enter_unsafe(c)	__nbcon_context_update_unsafe(c, true)
+#define nbcon_context_exit_unsafe(c)	__nbcon_context_update_unsafe(c, false)
+
+/**
+ * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
+ * @ctxt:	The nbcon context from nbcon_context_try_acquire()
+ * @unsafe:	The new value for the unsafe bit
+ *
+ * Return:	True if the unsafe state was updated and this context still
+ *		owns the console. Otherwise false if ownership was handed
+ *		over or taken.
+ *
+ * This function allows console owners to modify the unsafe status of the
+ * console.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ *
+ * Internal helper to avoid duplicated code.
+ */
+__maybe_unused
+static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
+{
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+	struct nbcon_state new;
+
+	nbcon_state_read(con, &cur);
+
+	do {
+		/*
+		 * The unsafe bit must not be cleared if an
+		 * unsafe hostile takeover has occurred.
+		 */
+		if (!unsafe && cur.unsafe_takeover)
+			goto out;
+
+		if (!nbcon_context_can_proceed(ctxt, &cur))
+			return false;
+
+		new.atom = cur.atom;
+		new.unsafe = unsafe;
+	} while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+	cur.atom = new.atom;
+out:
+	return nbcon_context_can_proceed(ctxt, &cur);
+}
+
 /**
  * nbcon_alloc - Allocate buffers needed by the nbcon console
  * @con:	Console to allocate buffers for
-- 
cgit v1.2.3


From ad56ebd1d79b216dc147474fac89a11daf6b10df Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:05 +0206
Subject: printk: nbcon: Add sequence handling

Add an atomic_long_t field @nbcon_seq to the console struct to
store the sequence number for nbcon consoles. For nbcon consoles
this will be used instead of the non-atomic @seq field. The new
field allows for safe atomic sequence number updates without
requiring any locking.

On 64bit systems the new field stores the full sequence number.
On 32bit systems the new field stores the lower 32 bits of the
sequence number, which are expanded to 64bit as needed by
folding the values based on the sequence numbers available in
the ringbuffer.

For 32bit systems, having a 32bit representation in the console
is sufficient. If a console ever gets more than 2^31 records
behind the ringbuffer then this is the least of the problems.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-7-john.ogness@linutronix.de
---
 include/linux/console.h  |   4 ++
 kernel/printk/internal.h |   7 ++++
 kernel/printk/nbcon.c    | 101 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c   |  31 +++++++++++----
 4 files changed, 136 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index ca1ef8700e55..20cd486b76ad 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -243,6 +243,7 @@ struct printk_buffers;
  *				might cause a system freeze when the console
  *				is used later.
  * @pbufs:			Pointer to the text buffer for this context
+ * @seq:			The sequence number to print for this context
  */
 struct nbcon_context {
 	/* members set by caller */
@@ -253,6 +254,7 @@ struct nbcon_context {
 
 	/* members set by acquire */
 	struct printk_buffers	*pbufs;
+	u64			seq;
 };
 
 /**
@@ -276,6 +278,7 @@ struct nbcon_context {
  * @node:		hlist node for the console list
  *
  * @nbcon_state:	State for nbcon consoles
+ * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
  */
 struct console {
@@ -299,6 +302,7 @@ struct console {
 
 	/* nbcon console specific members */
 	atomic_t		__private nbcon_state;
+	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
 };
 
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index f6161cd75d7d..6473f5ae4a18 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -4,6 +4,7 @@
  */
 #include <linux/percpu.h>
 #include <linux/console.h>
+#include "printk_ringbuffer.h"
 
 #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
 void __init printk_sysctl_init(void);
@@ -42,6 +43,8 @@ enum printk_info_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
+extern struct printk_ringbuffer *prb;
+
 __printf(4, 0)
 int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
@@ -69,6 +72,8 @@ void defer_console_output(void);
 u16 printk_parse_prefix(const char *text, int *level,
 			enum printk_info_flags *flags);
 
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
 bool nbcon_alloc(struct console *con);
 void nbcon_init(struct console *con);
 void nbcon_free(struct console *con);
@@ -88,6 +93,8 @@ void nbcon_free(struct console *con);
 #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
 
 static inline bool printk_percpu_data_ready(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
 static inline bool nbcon_alloc(struct console *con) { return false; }
 static inline void nbcon_init(struct console *con) { }
 static inline void nbcon_free(struct console *con) { }
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 98e4be5429f0..e076096b31c0 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -140,6 +140,101 @@ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_sta
 	return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
 }
 
+#ifdef CONFIG_64BIT
+
+#define __seq_to_nbcon_seq(seq) (seq)
+#define __nbcon_seq_to_seq(seq) (seq)
+
+#else /* CONFIG_64BIT */
+
+#define __seq_to_nbcon_seq(seq) ((u32)seq)
+
+static inline u64 __nbcon_seq_to_seq(u32 nbcon_seq)
+{
+	u64 seq;
+	u64 rb_next_seq;
+
+	/*
+	 * The provided sequence is only the lower 32 bits of the ringbuffer
+	 * sequence. It needs to be expanded to 64bit. Get the next sequence
+	 * number from the ringbuffer and fold it.
+	 *
+	 * Having a 32bit representation in the console is sufficient.
+	 * If a console ever gets more than 2^31 records behind
+	 * the ringbuffer then this is the least of the problems.
+	 *
+	 * Also the access to the ring buffer is always safe.
+	 */
+	rb_next_seq = prb_next_seq(prb);
+	seq = rb_next_seq - ((u32)rb_next_seq - nbcon_seq);
+
+	return seq;
+}
+
+#endif /* CONFIG_64BIT */
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con:	Console to read the sequence of
+ *
+ * Return:	Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+	unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+	return __nbcon_seq_to_seq(nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con:	Console to work on
+ * @seq:	Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+	/*
+	 * If the specified record no longer exists, the oldest available record
+	 * is chosen. This is especially important on 32bit systems because only
+	 * the lower 32 bits of the sequence number are stored. The upper 32 bits
+	 * are derived from the sequence numbers available in the ringbuffer.
+	 */
+	u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+	atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __seq_to_nbcon_seq(valid_seq));
+
+	/* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */
+	con->seq = 0;
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt:	Pointer to an acquire context that contains
+ *		all information about the acquire mode
+ * @new_seq:	The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the later case, it will stop printing anyway.
+ */
+__maybe_unused
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+	unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
+	struct console *con = ctxt->console;
+
+	if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+				    __seq_to_nbcon_seq(new_seq))) {
+		ctxt->seq = new_seq;
+	} else {
+		ctxt->seq = nbcon_seq_read(con);
+	}
+}
+
 /**
  * nbcon_context_try_acquire_direct - Try to acquire directly
  * @ctxt:	The context of the caller
@@ -510,6 +605,9 @@ out:
 	else
 		ctxt->pbufs = con->pbufs;
 
+	/* Set the record sequence for this context to print. */
+	ctxt->seq = nbcon_seq_read(ctxt->console);
+
 	return true;
 }
 
@@ -722,6 +820,8 @@ bool nbcon_alloc(struct console *con)
  *
  * nbcon_alloc() *must* be called and succeed before this function
  * is called.
+ *
+ * This function expects that the legacy @con->seq has been set.
  */
 void nbcon_init(struct console *con)
 {
@@ -730,6 +830,7 @@ void nbcon_init(struct console *con)
 	/* nbcon_alloc() must have been called and successful! */
 	BUG_ON(!con->pbufs);
 
+	nbcon_seq_force(con, con->seq);
 	nbcon_state_set(con, &state);
 }
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1c9720acd960..77857d2118ca 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -494,7 +494,7 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
 
 static struct printk_ringbuffer printk_rb_dynamic;
 
-static struct printk_ringbuffer *prb = &printk_rb_static;
+struct printk_ringbuffer *prb = &printk_rb_static;
 
 /*
  * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
@@ -3168,6 +3168,7 @@ void console_flush_on_panic(enum con_flush_mode mode)
 
 	if (mode == CONSOLE_REPLAY_ALL) {
 		struct console *c;
+		short flags;
 		int cookie;
 		u64 seq;
 
@@ -3175,11 +3176,17 @@ void console_flush_on_panic(enum con_flush_mode mode)
 
 		cookie = console_srcu_read_lock();
 		for_each_console_srcu(c) {
-			/*
-			 * This is an unsynchronized assignment, but the
-			 * kernel is in "hope and pray" mode anyway.
-			 */
-			c->seq = seq;
+			flags = console_srcu_read_flags(c);
+
+			if (flags & CON_NBCON) {
+				nbcon_seq_force(c, seq);
+			} else {
+				/*
+				 * This is an unsynchronized assignment. On
+				 * panic legacy consoles are only best effort.
+				 */
+				c->seq = seq;
+			}
 		}
 		console_srcu_read_unlock(cookie);
 	}
@@ -3750,6 +3757,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 	struct console *c;
 	u64 last_diff = 0;
 	u64 printk_seq;
+	short flags;
 	int cookie;
 	u64 diff;
 	u64 seq;
@@ -3771,6 +3779,9 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		for_each_console_srcu(c) {
 			if (con && con != c)
 				continue;
+
+			flags = console_srcu_read_flags(c);
+
 			/*
 			 * If consoles are not usable, it cannot be expected
 			 * that they make forward progress, so only increment
@@ -3778,7 +3789,13 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 			 */
 			if (!console_is_usable(c))
 				continue;
-			printk_seq = c->seq;
+
+			if (flags & CON_NBCON) {
+				printk_seq = nbcon_seq_read(c);
+			} else {
+				printk_seq = c->seq;
+			}
+
 			if (printk_seq < seq)
 				diff += seq - printk_seq;
 		}
-- 
cgit v1.2.3


From 06653d57ff283be627a2c769139d73ecc487810f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:06 +0206
Subject: printk: nbcon: Add emit function and callback function for atomic
 printing

Implement an emit function for nbcon consoles to output printk
messages. It utilizes the lockless printk_get_next_message() and
console_prepend_dropped() functions to retrieve/build the output
message. The emit function includes the required safety points to
check for handover/takeover and calls a new write_atomic callback
of the console driver to output the message. It also includes
proper handling for updating the nbcon console sequence number.

A new nbcon_write_context struct is introduced. This is provided
to the write_atomic callback and includes only the information
necessary for performing atomic writes.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-8-john.ogness@linutronix.de
---
 include/linux/console.h  |  21 ++++++++++
 kernel/printk/internal.h |   6 +++
 kernel/printk/nbcon.c    | 106 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/printk/printk.c   |   9 ++--
 4 files changed, 134 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 20cd486b76ad..14563dcb34b1 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -242,6 +242,7 @@ struct printk_buffers;
  *				be used only with NBCON_PRIO_PANIC @prio. It
  *				might cause a system freeze when the console
  *				is used later.
+ * @backlog:			Ringbuffer has pending records
  * @pbufs:			Pointer to the text buffer for this context
  * @seq:			The sequence number to print for this context
  */
@@ -252,11 +253,28 @@ struct nbcon_context {
 	enum nbcon_prio		prio;
 	unsigned int		allow_unsafe_takeover	: 1;
 
+	/* members set by emit */
+	unsigned int		backlog			: 1;
+
 	/* members set by acquire */
 	struct printk_buffers	*pbufs;
 	u64			seq;
 };
 
+/**
+ * struct nbcon_write_context - Context handed to the nbcon write callbacks
+ * @ctxt:		The core console context
+ * @outbuf:		Pointer to the text buffer for output
+ * @len:		Length to write
+ * @unsafe_takeover:	If a hostile takeover in an unsafe state has occurred
+ */
+struct nbcon_write_context {
+	struct nbcon_context	__private ctxt;
+	char			*outbuf;
+	unsigned int		len;
+	bool			unsafe_takeover;
+};
+
 /**
  * struct console - The console descriptor structure
  * @name:		The name of the console driver
@@ -277,6 +295,7 @@ struct nbcon_context {
  * @data:		Driver private data
  * @node:		hlist node for the console list
  *
+ * @write_atomic:	Write callback for atomic context
  * @nbcon_state:	State for nbcon consoles
  * @nbcon_seq:		Sequence number of the next record for nbcon to print
  * @pbufs:		Pointer to nbcon private buffer
@@ -301,6 +320,8 @@ struct console {
 	struct hlist_node	node;
 
 	/* nbcon console specific members */
+	bool			(*write_atomic)(struct console *con,
+						struct nbcon_write_context *wctxt);
 	atomic_t		__private nbcon_state;
 	atomic_long_t		__private nbcon_seq;
 	struct printk_buffers	*pbufs;
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 6473f5ae4a18..6c2afee5ef62 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -130,3 +130,9 @@ struct printk_message {
 };
 
 bool other_cpu_in_panic(void);
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+			     bool is_extended, bool may_supress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+#endif
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index e076096b31c0..6e05d263fd22 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -221,7 +221,6 @@ void nbcon_seq_force(struct console *con, u64 seq)
  * nbcon_seq_force() was used or the current context no longer owns the
  * console. In the later case, it will stop printing anyway.
  */
-__maybe_unused
 static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
 {
 	unsigned long nbcon_seq = __seq_to_nbcon_seq(ctxt->seq);
@@ -755,7 +754,6 @@ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_s
  *
  * Internal helper to avoid duplicated code.
  */
-__maybe_unused
 static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
 {
 	struct console *con = ctxt->console;
@@ -784,6 +782,110 @@ out:
 	return nbcon_context_can_proceed(ctxt, &cur);
 }
 
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt:	The write context that will be handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer,
+ */
+__maybe_unused
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+	struct printk_message pmsg = {
+		.pbufs = ctxt->pbufs,
+	};
+	unsigned long con_dropped;
+	struct nbcon_state cur;
+	unsigned long dropped;
+	bool done;
+
+	/*
+	 * The printk buffers are filled within an unsafe section. This
+	 * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+	 * clobbering each other.
+	 */
+
+	if (!nbcon_context_enter_unsafe(ctxt))
+		return false;
+
+	ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+	if (!ctxt->backlog)
+		return nbcon_context_exit_unsafe(ctxt);
+
+	/*
+	 * @con->dropped is not protected in case of an unsafe hostile
+	 * takeover. In that situation the update can be racy so
+	 * annotate it accordingly.
+	 */
+	con_dropped = data_race(READ_ONCE(con->dropped));
+
+	dropped = con_dropped + pmsg.dropped;
+	if (dropped && !is_extended)
+		console_prepend_dropped(&pmsg, dropped);
+
+	if (!nbcon_context_exit_unsafe(ctxt))
+		return false;
+
+	/* For skipped records just update seq/dropped in @con. */
+	if (pmsg.outbuf_len == 0)
+		goto update_con;
+
+	/* Initialize the write context for driver callbacks. */
+	wctxt->outbuf = &pmsg.pbufs->outbuf[0];
+	wctxt->len = pmsg.outbuf_len;
+	nbcon_state_read(con, &cur);
+	wctxt->unsafe_takeover = cur.unsafe_takeover;
+
+	if (con->write_atomic) {
+		done = con->write_atomic(con, wctxt);
+	} else {
+		nbcon_context_release(ctxt);
+		WARN_ON_ONCE(1);
+		done = false;
+	}
+
+	/* If not done, the emit was aborted. */
+	if (!done)
+		return false;
+
+	/*
+	 * Since any dropped message was successfully output, reset the
+	 * dropped count for the console.
+	 */
+	dropped = 0;
+update_con:
+	/*
+	 * The dropped count and the sequence number are updated within an
+	 * unsafe section. This limits update races to the panic context and
+	 * allows the panic context to win.
+	 */
+
+	if (!nbcon_context_enter_unsafe(ctxt))
+		return false;
+
+	if (dropped != con_dropped) {
+		/* Counterpart to the READ_ONCE() above. */
+		WRITE_ONCE(con->dropped, dropped);
+	}
+
+	nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+	return nbcon_context_exit_unsafe(ctxt);
+}
+
 /**
  * nbcon_alloc - Allocate buffers needed by the nbcon console
  * @con:	Console to allocate buffers for
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 77857d2118ca..778359b21761 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -698,9 +698,6 @@ out:
 	return len;
 }
 
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
-				    bool is_extended, bool may_supress);
-
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
 	atomic64_t seq;
@@ -2733,7 +2730,7 @@ static void __console_unlock(void)
  * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
  */
 #ifdef CONFIG_PRINTK
-static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
 {
 	struct printk_buffers *pbufs = pmsg->pbufs;
 	const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
@@ -2787,8 +2784,8 @@ static void console_prepend_dropped(struct printk_message *pmsg, unsigned long d
  * of @pmsg are valid. (See the documentation of struct printk_message
  * for information about the @pmsg fields.)
  */
-static bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
-				    bool is_extended, bool may_suppress)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+			     bool is_extended, bool may_suppress)
 {
 	static int panic_console_dropped;
 
-- 
cgit v1.2.3


From 9757acd0a700ba4a0d16dde4ba820eb052aba1a7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 16 Sep 2023 21:26:07 +0206
Subject: printk: nbcon: Allow drivers to mark unsafe regions and check state

For the write_atomic callback, the console driver may have unsafe
regions that need to be appropriately marked. Provide functions
that accept the nbcon_write_context struct to allow for the driver
to enter and exit unsafe regions.

Also provide a function for drivers to check if they are still the
owner of the console.

Co-developed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Thomas Gleixner (Intel) <tglx@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230916192007.608398-9-john.ogness@linutronix.de
---
 include/linux/console.h | 10 +++++++
 kernel/printk/nbcon.c   | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 14563dcb34b1..e4fc6f7c1496 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -451,6 +451,16 @@ static inline bool console_is_registered(const struct console *con)
 	lockdep_assert_console_list_lock_held();			\
 	hlist_for_each_entry(con, &console_list, node)
 
+#ifdef CONFIG_PRINTK
+extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
+extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
+extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
+#else
+static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
+static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
+static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
+#endif
+
 extern int console_set_on_cmdline;
 extern struct console *early_console;
 
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 6e05d263fd22..b96077152f49 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -732,6 +732,41 @@ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_s
 	return false;
 }
 
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * It is used in nbcon_enter_unsafe() to make sure that it still owns the
+ * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
+ * for a higher priority context which asked for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporary in safe state instead of exiting and entering the unsafe state.
+ *
+ * Also it can be called in the safe context before doing an expensive safe
+ * operation. It does not make sense to do the operation when a higher
+ * priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+
+	nbcon_state_read(con, &cur);
+
+	return nbcon_context_can_proceed(ctxt, &cur);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
 #define nbcon_context_enter_unsafe(c)	__nbcon_context_update_unsafe(c, true)
 #define nbcon_context_exit_unsafe(c)	__nbcon_context_update_unsafe(c, false)
 
@@ -782,6 +817,46 @@ out:
 	return nbcon_context_can_proceed(ctxt, &cur);
 }
 
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	return nbcon_context_enter_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt:	The write context that was handed to the write function
+ *
+ * Return:	True if this context still owns the console. False if
+ *		ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	return nbcon_context_exit_unsafe(ctxt);
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
 /**
  * nbcon_emit_next_record - Emit a record in the acquired context
  * @wctxt:	The write context that will be handed to the write function
-- 
cgit v1.2.3


From a6828214480e2f00a8a7e64c7a55fc42b0f54e1c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Sep 2023 17:49:35 -0400
Subject: workqueue: Removed double allocation of wq_update_pod_attrs_buf

First commit 2930155b2e272 ("workqueue: Initialize unbound CPU pods later in
the boot") added the initialization of wq_update_pod_attrs_buf to
workqueue_init_early(), and then latter on, commit 84193c07105c6
("workqueue: Generalize unbound CPU pods") added it as well. This appeared
in a kmemleak run where the second allocation made the first allocation
leak.

Fixes: 84193c07105c6 ("workqueue: Generalize unbound CPU pods")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c85825e17df8..129328b765fb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6535,9 +6535,6 @@ void __init workqueue_init_early(void)
 
 	BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
 
-	wq_update_pod_attrs_buf = alloc_workqueue_attrs();
-	BUG_ON(!wq_update_pod_attrs_buf);
-
 	pt->nr_pods = 1;
 	cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
 	pt->pod_node[0] = NUMA_NO_NODE;
-- 
cgit v1.2.3


From dd64c873ed11cdae340be06dcd2364870fd3e4fc Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Mon, 11 Sep 2023 16:27:22 +0800
Subject: workqueue: Fix missed pwq_release_worker creation in
 wq_cpu_intensive_thresh_init()

Currently, if the wq_cpu_intensive_thresh_us is set to specific
value, will cause the wq_cpu_intensive_thresh_init() early exit
and missed creation of pwq_release_worker. this commit therefore
create the pwq_release_worker in advance before checking the
wq_cpu_intensive_thresh_us.

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 967b494e2fd1 ("workqueue: Use a kthread_worker to release pool_workqueues")
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 129328b765fb..b9f053a5a5f0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6602,13 +6602,13 @@ static void __init wq_cpu_intensive_thresh_init(void)
 	unsigned long thresh;
 	unsigned long bogo;
 
+	pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+	BUG_ON(IS_ERR(pwq_release_worker));
+
 	/* if the user set it to a specific value, keep it */
 	if (wq_cpu_intensive_thresh_us != ULONG_MAX)
 		return;
 
-	pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
-	BUG_ON(IS_ERR(pwq_release_worker));
-
 	/*
 	 * The default of 10ms is derived from the fact that most modern (as of
 	 * 2023) processors can do a lot in 10ms and that it's just below what
-- 
cgit v1.2.3


From fd55c0adb46a44c9a0630dc32509e4733c290103 Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Date: Tue, 12 Sep 2023 12:34:34 +0530
Subject: cgroup: Check for ret during cgroup1_base_files cft addition

There is no check for possible failure while populating
cgroup1_base_files cft in css_populate_dir(), like its cgroup v2 counter
parts cgroup_{base,psi}_files.  In case of failure, the cgroup might not
be set up right.  Add ret value check to return on failure.

Signed-off-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..d40d58b963c8 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1731,8 +1731,10 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 					return ret;
 			}
 		} else {
-			cgroup_addrm_files(css, cgrp,
-					   cgroup1_base_files, true);
+			ret = cgroup_addrm_files(css, cgrp,
+						 cgroup1_base_files, true);
+			if (ret < 0)
+				return ret;
 		}
 	} else {
 		list_for_each_entry(cfts, &css->ss->cfts, node) {
-- 
cgit v1.2.3


From d24f05987ce8bf61e62d86fedbe47523dc5c3393 Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Date: Tue, 12 Sep 2023 12:34:35 +0530
Subject: cgroup: Avoid extra dereference in css_populate_dir()

Use css directly instead of dereferencing it from &cgroup->self, while
adding the cgroup v2 cft base and psi files in css_populate_dir(). Both
points to the same css, when css->ss is NULL, this avoids extra deferences
and makes code consistent in usage across the function.

Signed-off-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d40d58b963c8..833ac6dd15d9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1719,13 +1719,13 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 
 	if (!css->ss) {
 		if (cgroup_on_dfl(cgrp)) {
-			ret = cgroup_addrm_files(&cgrp->self, cgrp,
+			ret = cgroup_addrm_files(css, cgrp,
 						 cgroup_base_files, true);
 			if (ret < 0)
 				return ret;
 
 			if (cgroup_psi_enabled()) {
-				ret = cgroup_addrm_files(&cgrp->self, cgrp,
+				ret = cgroup_addrm_files(css, cgrp,
 							 cgroup_psi_files, true);
 				if (ret < 0)
 					return ret;
-- 
cgit v1.2.3


From 6fcdb0183bf024a70abccb0439321c25891c708d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Sep 2023 09:32:37 -0400
Subject: cgroup/cpuset: Fix load balance state in update_partition_sd_lb()

Commit a86ce68078b2 ("cgroup/cpuset: Extract out CS_CPU_EXCLUSIVE
& CS_SCHED_LOAD_BALANCE handling") adds a new helper function
update_partition_sd_lb() to update the load balance state of the
cpuset. However the new load balance is determined by just looking at
whether the cpuset is a valid isolated partition root or not.  That is
not enough if the cpuset is not a valid partition root but its parent
is in the isolated state (load balance off). Update the function to
set the new state to be the same as its parent in this case like what
has been done in commit c8c926200c55 ("cgroup/cpuset: Inherit parent's
load balance state in v2").

Fixes: a86ce68078b2 ("cgroup/cpuset: Extract out CS_CPU_EXCLUSIVE & CS_SCHED_LOAD_BALANCE handling")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58ec88efa4f8..4749e0c86c62 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1304,13 +1304,23 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs)
  *
  * Changing load balance flag will automatically call
  * rebuild_sched_domains_locked().
+ * This function is for cgroup v2 only.
  */
 static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
 {
 	int new_prs = cs->partition_root_state;
-	bool new_lb = (new_prs != PRS_ISOLATED);
 	bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
+	bool new_lb;
 
+	/*
+	 * If cs is not a valid partition root, the load balance state
+	 * will follow its parent.
+	 */
+	if (new_prs > 0) {
+		new_lb = (new_prs != PRS_ISOLATED);
+	} else {
+		new_lb = is_sched_load_balance(parent_cs(cs));
+	}
 	if (new_lb != !!is_sched_load_balance(cs)) {
 		rebuild_domains = true;
 		if (new_lb)
-- 
cgit v1.2.3


From 0c7f293efc87a06b51db9aa65256f8cb0a5a0a21 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Sep 2023 09:32:38 -0400
Subject: cgroup/cpuset: Add cpuset.cpus.exclusive.effective for v2

The creation of a cpuset partition means dedicating a set of exclusive
CPUs to be used by a particular partition only. These exclusive CPUs
will not be used by any cpusets outside of that partition.

To enable more flexibility in creating partitions, we need a way to
distribute exclusive CPUs that can be used in new partitions. Currently,
we have a subparts_cpus cpumask in struct cpuset that tracks only
the exclusive CPUs used by all the sub-partitions underneath a given
cpuset.

This patch reworks the way we do exclusive CPUs tracking. The
subparts_cpus is now renamed to effective_xcpus which tracks the
exclusive CPUs allocated to a partition root including those that are
further distributed down to sub-partitions underneath it. IOW, it also
includes the exclusive CPUs used by the current partition root. Note
that effective_xcpus can contain offline CPUs and it will always be a
subset of cpus_allowed.

The renamed effective_xcpus is now exposed via a new read-only
"cpuset.cpus.exclusive.effective" control file. The new effective_xcpus
cpumask should be set to cpus_allowed when a cpuset becomes a partition
root and be cleared if it is not a valid partition root.

In the next patch, we will enable write to another new control file to
enable further control of what can get into effective_xcpus.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 728 ++++++++++++++++++++++++++++---------------------
 1 file changed, 421 insertions(+), 307 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4749e0c86c62..b269c6b79e1a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -78,7 +78,7 @@ enum prs_errcode {
 };
 
 static const char * const perr_strings[] = {
-	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus",
+	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
 	[PERR_INVPARENT] = "Parent is an invalid partition root",
 	[PERR_NOTPART]   = "Parent is not a partition root",
 	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
@@ -121,14 +121,18 @@ struct cpuset {
 	nodemask_t effective_mems;
 
 	/*
-	 * CPUs allocated to child sub-partitions (default hierarchy only)
-	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
-	 * - effective_cpus and subparts_cpus are mutually exclusive.
+	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
 	 *
-	 * effective_cpus contains only onlined CPUs, but subparts_cpus
-	 * may have offlined ones.
+	 * This exclusive CPUs must be a subset of cpus_allowed. A parent
+	 * cgroup can only grant exclusive CPUs to one of its children.
+	 *
+	 * When the cgroup becomes a valid partition root, effective_xcpus
+	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
+	 * partition root comes solely from its effective_xcpus and some of the
+	 * effective_xcpus may be distributed to sub-partitions below & hence
+	 * excluded from its effective_cpus.
 	 */
-	cpumask_var_t subparts_cpus;
+	cpumask_var_t effective_xcpus;
 
 	/*
 	 * This is old Memory Nodes tasks took on.
@@ -156,8 +160,8 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of CPUs in subparts_cpus */
-	int nr_subparts_cpus;
+	/* number of valid sub-partitions */
+	int nr_subparts;
 
 	/* partition root state */
 	int partition_root_state;
@@ -185,6 +189,11 @@ struct cpuset {
 	struct cgroup_file partition_file;
 };
 
+/*
+ * Exclusive CPUs distributed out to sub-partitions of top_cpuset
+ */
+static cpumask_var_t	subpartitions_cpus;
+
 /*
  * Partition root states:
  *
@@ -312,7 +321,7 @@ static inline int is_partition_invalid(const struct cpuset *cs)
  */
 static inline void make_partition_invalid(struct cpuset *cs)
 {
-	if (is_partition_valid(cs))
+	if (cs->partition_root_state > 0)
 		cs->partition_root_state = -cs->partition_root_state;
 }
 
@@ -469,7 +478,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
 
 	if (cs->css.cgroup->nr_populated_csets)
 		return true;
-	if (!excluded_child && !cs->nr_subparts_cpus)
+	if (!excluded_child && !cs->nr_subparts)
 		return cgroup_is_populated(cs->css.cgroup);
 
 	rcu_read_lock();
@@ -601,7 +610,7 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (cs) {
 		pmask1 = &cs->cpus_allowed;
 		pmask2 = &cs->effective_cpus;
-		pmask3 = &cs->subparts_cpus;
+		pmask3 = &cs->effective_xcpus;
 	} else {
 		pmask1 = &tmp->new_cpus;
 		pmask2 = &tmp->addmask;
@@ -636,7 +645,7 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (cs) {
 		free_cpumask_var(cs->cpus_allowed);
 		free_cpumask_var(cs->effective_cpus);
-		free_cpumask_var(cs->subparts_cpus);
+		free_cpumask_var(cs->effective_xcpus);
 	}
 	if (tmp) {
 		free_cpumask_var(tmp->new_cpus);
@@ -664,6 +673,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
+	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
 	return trial;
 }
 
@@ -677,6 +687,25 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+/*
+ * cpu_exclusive_check() - check if two cpusets are exclusive
+ *
+ * Return 0 if exclusive, -EINVAL if not
+ */
+static inline bool cpu_exclusive_check(struct cpuset *cs1, struct cpuset *cs2)
+{
+	struct cpumask *cpus1, *cpus2;
+
+	cpus1 = cpumask_empty(cs1->effective_xcpus)
+		? cs1->cpus_allowed : cs1->effective_xcpus;
+	cpus2 = cpumask_empty(cs2->effective_xcpus)
+		? cs2->cpus_allowed : cs2->effective_xcpus;
+
+	if (cpumask_intersects(cpus1, cpus2))
+		return -EINVAL;
+	return 0;
+}
+
 /*
  * validate_change_legacy() - Validate conditions specific to legacy (v1)
  *                            behavior.
@@ -776,9 +805,10 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	ret = -EINVAL;
 	cpuset_for_each_child(c, css, par) {
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
-		    c != cur &&
-		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
-			goto out;
+		    c != cur) {
+			if (cpu_exclusive_check(trial, c))
+				goto out;
+		}
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 		    c != cur &&
 		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
@@ -908,7 +938,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
+	if (root_load_balance && !top_cpuset.nr_subparts) {
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
 		if (!doms)
@@ -1159,7 +1189,7 @@ static void rebuild_sched_domains_locked(void)
 	 * should be the same as the active CPUs, so checking only top_cpuset
 	 * is enough to detect racing CPU offlines.
 	 */
-	if (!top_cpuset.nr_subparts_cpus &&
+	if (cpumask_empty(subpartitions_cpus) &&
 	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
 		return;
 
@@ -1168,7 +1198,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs.  Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts_cpus) {
+	if (top_cpuset.nr_subparts) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -1232,7 +1262,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
 			 */
 			if (kthread_is_per_cpu(task))
 				continue;
-			cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
+			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
 		} else {
 			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
 		}
@@ -1247,32 +1277,22 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
  * @cs: the cpuset the need to recompute the new effective_cpus mask
  * @parent: the parent cpuset
  *
- * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask. Since offlined
- * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
- * to mask those out.
+ * The result is valid only if the given cpuset isn't a partition root.
  */
 static void compute_effective_cpumask(struct cpumask *new_cpus,
 				      struct cpuset *cs, struct cpuset *parent)
 {
-	if (parent->nr_subparts_cpus && is_partition_valid(cs)) {
-		cpumask_or(new_cpus, parent->effective_cpus,
-			   parent->subparts_cpus);
-		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
-		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
-	} else {
-		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
-	}
+	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
 }
 
 /*
- * Commands for update_parent_subparts_cpumask
+ * Commands for update_parent_effective_cpumask
  */
-enum subparts_cmd {
-	partcmd_enable,		/* Enable partition root	 */
-	partcmd_disable,	/* Disable partition root	 */
-	partcmd_update,		/* Update parent's subparts_cpus */
-	partcmd_invalidate,	/* Make partition invalid	 */
+enum partition_cmd {
+	partcmd_enable,		/* Enable partition root	  */
+	partcmd_disable,	/* Disable partition root	  */
+	partcmd_update,		/* Update parent's effective_cpus */
+	partcmd_invalidate,	/* Make partition invalid	  */
 };
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1333,8 +1353,23 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
 		rebuild_sched_domains_locked();
 }
 
+/*
+ * tasks_nocpu_error - Return true if tasks will have no effective_cpus
+ */
+static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
+			      struct cpumask *xcpus)
+{
+	/*
+	 * A populated partition (cs or parent) can't have empty effective_cpus
+	 */
+	return (cpumask_subset(parent->effective_cpus, xcpus) &&
+		partition_is_populated(parent, cs)) ||
+	       (!cpumask_intersects(xcpus, cpu_active_mask) &&
+		partition_is_populated(cs, NULL));
+}
+
 /**
- * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
  * @cs:      The cpuset that requests change in partition root state
  * @cmd:     Partition root state change command
  * @newmask: Optional new cpumask for partcmd_update
@@ -1342,21 +1377,20 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
  * Return:   0 or a partition root state error code
  *
  * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The cpus_allowed mask of the given cpuset will
- * be put into parent's subparts_cpus and taken away from parent's
+ * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus
+ * not set) mask of the given cpuset will be taken away from parent's
  * effective_cpus. The function will return 0 if all the CPUs listed in
- * cpus_allowed can be granted or an error code will be returned.
+ * effective_xcpus can be granted or an error code will be returned.
  *
  * For partcmd_disable, the cpuset is being transformed from a partition
- * root back to a non-partition root. Any CPUs in cpus_allowed that are in
- * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 will always be returned.
+ * root back to a non-partition root. Any CPUs in effective_xcpus will be
+ * given back to parent's effective_cpus. 0 will always be returned.
  *
  * For partcmd_update, if the optional newmask is specified, the cpu list is
- * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
  * assumed to remain the same. The cpuset should either be a valid or invalid
  * partition root. The partition root state may change from valid to invalid
- * or vice versa. An error code will only be returned if transitioning from
+ * or vice versa. An error code will be returned if transitioning from
  * invalid to valid violates the exclusivity rule.
  *
  * For partcmd_invalidate, the current partition will be made invalid.
@@ -1371,18 +1405,47 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
  * check for error and so partition_root_state and prs_error will be updated
  * directly.
  */
-static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
-					  struct cpumask *newmask,
-					  struct tmpmasks *tmp)
+static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
+					   struct cpumask *newmask,
+					   struct tmpmasks *tmp)
 {
 	struct cpuset *parent = parent_cs(cs);
-	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
-	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
+	int adding;	/* Adding cpus to parent's effective_cpus	*/
+	int deleting;	/* Deleting cpus from parent's effective_cpus	*/
 	int old_prs, new_prs;
 	int part_error = PERR_NONE;	/* Partition error? */
+	int subparts_delta = 0;
+	struct cpumask *xcpus;		/* cs effective_xcpus */
+	bool nocpu;
 
 	lockdep_assert_held(&cpuset_mutex);
 
+	/*
+	 * new_prs will only be changed for the partcmd_update and
+	 * partcmd_invalidate commands.
+	 */
+	adding = deleting = false;
+	old_prs = new_prs = cs->partition_root_state;
+	xcpus = !cpumask_empty(cs->effective_xcpus)
+		? cs->effective_xcpus : cs->cpus_allowed;
+
+	if (cmd == partcmd_invalidate) {
+		if (is_prs_invalid(old_prs))
+			return 0;
+
+		/*
+		 * Make the current partition invalid.
+		 */
+		if (is_partition_valid(parent))
+			adding = cpumask_and(tmp->addmask,
+					     xcpus, parent->effective_xcpus);
+		if (old_prs > 0) {
+			new_prs = -old_prs;
+			subparts_delta--;
+		}
+		goto write_error;
+	}
+
 	/*
 	 * The parent must be a partition root.
 	 * The new cpumask, if present, or the current cpus_allowed must
@@ -1395,124 +1458,124 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	if (!newmask && cpumask_empty(cs->cpus_allowed))
 		return PERR_CPUSEMPTY;
 
-	/*
-	 * new_prs will only be changed for the partcmd_update and
-	 * partcmd_invalidate commands.
-	 */
-	adding = deleting = false;
-	old_prs = new_prs = cs->partition_root_state;
+	nocpu = tasks_nocpu_error(parent, cs, xcpus);
+
 	if (cmd == partcmd_enable) {
 		/*
-		 * Enabling partition root is not allowed if cpus_allowed
-		 * doesn't overlap parent's cpus_allowed.
+		 * Enabling partition root is not allowed if its
+		 * effective_xcpus is empty or doesn't overlap with
+		 * parent's effective_xcpus.
 		 */
-		if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+		if (cpumask_empty(xcpus) ||
+		    !cpumask_intersects(xcpus, parent->effective_xcpus))
 			return PERR_INVCPUS;
 
 		/*
 		 * A parent can be left with no CPU as long as there is no
 		 * task directly associated with the parent partition.
 		 */
-		if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&
-		    partition_is_populated(parent, cs))
+		if (nocpu)
 			return PERR_NOCPUS;
 
-		cpumask_copy(tmp->addmask, cs->cpus_allowed);
-		adding = true;
+		cpumask_copy(tmp->delmask, xcpus);
+		deleting = true;
+		subparts_delta++;
 	} else if (cmd == partcmd_disable) {
 		/*
-		 * Need to remove cpus from parent's subparts_cpus for valid
-		 * partition root.
+		 n* May need to add cpus to parent's effective_cpus for
+		 * valid partition root.
 		 */
-		deleting = !is_prs_invalid(old_prs) &&
-			   cpumask_and(tmp->delmask, cs->cpus_allowed,
-				       parent->subparts_cpus);
-	} else if (cmd == partcmd_invalidate) {
-		if (is_prs_invalid(old_prs))
-			return 0;
-
+		adding = !is_prs_invalid(old_prs) &&
+			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
+		if (adding)
+			subparts_delta--;
+	} else if (newmask) {
 		/*
-		 * Make the current partition invalid. It is assumed that
-		 * invalidation is caused by violating cpu exclusivity rule.
+		 * Empty cpumask is not allowed
 		 */
-		deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
-				       parent->subparts_cpus);
-		if (old_prs > 0) {
-			new_prs = -old_prs;
-			part_error = PERR_NOTEXCL;
+		if (cpumask_empty(newmask)) {
+			part_error = PERR_CPUSEMPTY;
+			goto write_error;
 		}
-	} else if (newmask) {
+
 		/*
 		 * partcmd_update with newmask:
 		 *
-		 * Compute add/delete mask to/from subparts_cpus
+		 * Compute add/delete mask to/from effective_cpus
 		 *
-		 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
-		 * addmask = newmask & parent->cpus_allowed
-		 *		     & ~parent->subparts_cpus
+		 * addmask = effective_xcpus & ~newmask & parent->effective_xcpus
+		 * delmask = newmask & ~cs->effective_xcpus
+		 *		     & parent->effective_xcpus
 		 */
-		cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
-		deleting = cpumask_and(tmp->delmask, tmp->delmask,
-				       parent->subparts_cpus);
+		cpumask_andnot(tmp->addmask, xcpus, newmask);
+		adding = cpumask_and(tmp->addmask, tmp->addmask,
+				     parent->effective_xcpus);
 
-		cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
-		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
-					parent->subparts_cpus);
-		/*
-		 * Empty cpumask is not allowed
-		 */
-		if (cpumask_empty(newmask)) {
-			part_error = PERR_CPUSEMPTY;
+		cpumask_andnot(tmp->delmask, newmask, xcpus);
+		deleting = cpumask_and(tmp->delmask, tmp->delmask,
+				       parent->effective_xcpus);
 		/*
 		 * Make partition invalid if parent's effective_cpus could
 		 * become empty and there are tasks in the parent.
 		 */
-		} else if (adding &&
-		    cpumask_subset(parent->effective_cpus, tmp->addmask) &&
-		    !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
-		    partition_is_populated(parent, cs)) {
+		if (nocpu && (!adding ||
+		    !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
 			part_error = PERR_NOCPUS;
-			adding = false;
-			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
-					       parent->subparts_cpus);
+			deleting = false;
+			adding = cpumask_and(tmp->addmask,
+					     xcpus, parent->effective_xcpus);
 		}
 	} else {
 		/*
-		 * partcmd_update w/o newmask:
+		 * partcmd_update w/o newmask
 		 *
-		 * delmask = cpus_allowed & parent->subparts_cpus
-		 * addmask = cpus_allowed & parent->cpus_allowed
-		 *			  & ~parent->subparts_cpus
+		 * delmask = effective_xcpus & parent->effective_cpus
 		 *
-		 * This gets invoked either due to a hotplug event or from
-		 * update_cpumasks_hier(). This can cause the state of a
-		 * partition root to transition from valid to invalid or vice
-		 * versa. So we still need to compute the addmask and delmask.
-
-		 * A partition error happens when:
-		 * 1) Cpuset is valid partition, but parent does not distribute
-		 *    out any CPUs.
-		 * 2) Parent has tasks and all its effective CPUs will have
-		 *    to be distributed out.
+		 * This can be called from:
+		 * 1) update_cpumasks_hier()
+		 * 2) cpuset_hotplug_update_tasks()
+		 *
+		 * Check to see if it can be transitioned from valid to
+		 * invalid partition or vice versa.
+		 *
+		 * A partition error happens when parent has tasks and all
+		 * its effective CPUs will have to be distributed out.
 		 */
-		cpumask_and(tmp->addmask, cs->cpus_allowed,
-					  parent->cpus_allowed);
-		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
-					parent->subparts_cpus);
-
-		if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
-		    (adding &&
-		     cpumask_subset(parent->effective_cpus, tmp->addmask) &&
-		     partition_is_populated(parent, cs))) {
+		WARN_ON_ONCE(!is_partition_valid(parent));
+		if (nocpu) {
 			part_error = PERR_NOCPUS;
-			adding = false;
-		}
+			if (is_partition_valid(cs))
+				adding = cpumask_and(tmp->addmask,
+						xcpus, parent->effective_xcpus);
+		} else if (is_partition_invalid(cs) &&
+			   cpumask_subset(xcpus, parent->effective_xcpus)) {
+			struct cgroup_subsys_state *css;
+			struct cpuset *child;
+			bool exclusive = true;
 
-		if (part_error && is_partition_valid(cs) &&
-		    parent->nr_subparts_cpus)
-			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
-					       parent->subparts_cpus);
+			/*
+			 * Convert invalid partition to valid has to
+			 * pass the cpu exclusivity test.
+			 */
+			rcu_read_lock();
+			cpuset_for_each_child(child, css, parent) {
+				if (child == cs)
+					continue;
+				if (cpu_exclusive_check(cs, child)) {
+					exclusive = false;
+					break;
+				}
+			}
+			rcu_read_unlock();
+			if (exclusive)
+				deleting = cpumask_and(tmp->delmask,
+						xcpus, parent->effective_cpus);
+			else
+				part_error = PERR_NOTEXCL;
+		}
 	}
+
+write_error:
 	if (part_error)
 		WRITE_ONCE(cs->prs_err, part_error);
 
@@ -1524,13 +1587,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 		switch (cs->partition_root_state) {
 		case PRS_ROOT:
 		case PRS_ISOLATED:
-			if (part_error)
+			if (part_error) {
 				new_prs = -old_prs;
+				subparts_delta--;
+			}
 			break;
 		case PRS_INVALID_ROOT:
 		case PRS_INVALID_ISOLATED:
-			if (!part_error)
+			if (!part_error) {
 				new_prs = -old_prs;
+				subparts_delta++;
+			}
 			break;
 		}
 	}
@@ -1550,32 +1617,43 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	}
 
 	/*
-	 * Change the parent's subparts_cpus.
+	 * Change the parent's effective_cpus & effective_xcpus (top cpuset
+	 * only).
+	 *
 	 * Newly added CPUs will be removed from effective_cpus and
 	 * newly deleted ones will be added back to effective_cpus.
 	 */
 	spin_lock_irq(&callback_lock);
 	if (adding) {
-		cpumask_or(parent->subparts_cpus,
-			   parent->subparts_cpus, tmp->addmask);
-		cpumask_andnot(parent->effective_cpus,
-			       parent->effective_cpus, tmp->addmask);
-	}
-	if (deleting) {
-		cpumask_andnot(parent->subparts_cpus,
-			       parent->subparts_cpus, tmp->delmask);
+		if (parent == &top_cpuset)
+			cpumask_andnot(subpartitions_cpus,
+				       subpartitions_cpus, tmp->addmask);
 		/*
-		 * Some of the CPUs in subparts_cpus might have been offlined.
+		 * Some of the CPUs in effective_xcpus might have been offlined.
 		 */
-		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
 		cpumask_or(parent->effective_cpus,
-			   parent->effective_cpus, tmp->delmask);
+			   parent->effective_cpus, tmp->addmask);
+		cpumask_and(parent->effective_cpus,
+			    parent->effective_cpus, cpu_active_mask);
+	}
+	if (deleting) {
+		if (parent == &top_cpuset)
+			cpumask_or(subpartitions_cpus,
+				   subpartitions_cpus, tmp->delmask);
+		cpumask_andnot(parent->effective_cpus,
+			       parent->effective_cpus, tmp->delmask);
 	}
 
-	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+	if (is_partition_valid(parent)) {
+		parent->nr_subparts += subparts_delta;
+		WARN_ON_ONCE(parent->nr_subparts < 0);
+	}
 
-	if (old_prs != new_prs)
+	if (old_prs != new_prs) {
 		cs->partition_root_state = new_prs;
+		if (new_prs <= 0)
+			cs->nr_subparts = 0;
+	}
 
 	spin_unlock_irq(&callback_lock);
 
@@ -1600,6 +1678,71 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 	return 0;
 }
 
+/**
+ * compute_partition_effective_cpumask - compute effective_cpus for partition
+ * @cs: partition root cpuset
+ * @new_ecpus: previously computed effective_cpus to be updated
+ *
+ * Compute the effective_cpus of a partition root by scanning effective_xcpus
+ * of child partition roots and exclusing their effective_xcpus.
+ *
+ * This has the side effect of invalidating valid child partition roots,
+ * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
+ * or update_cpumasks_hier() where parent and children are modified
+ * successively, we don't need to call update_parent_effective_cpumask()
+ * and the child's effective_cpus will be updated in later iterations.
+ *
+ * Note that rcu_read_lock() is assumed to be held.
+ */
+static void compute_partition_effective_cpumask(struct cpuset *cs,
+						struct cpumask *new_ecpus)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *child;
+	bool populated = partition_is_populated(cs, NULL);
+
+	/*
+	 * Check child partition roots to see if they should be
+	 * invalidated when
+	 *  1) child effective_xcpus not a subset of new
+	 *     excluisve_cpus
+	 *  2) All the effective_cpus will be used up and cp
+	 *     has tasks
+	 */
+	cpumask_and(new_ecpus, cs->effective_xcpus, cpu_active_mask);
+	rcu_read_lock();
+	cpuset_for_each_child(child, css, cs) {
+		if (!is_partition_valid(child))
+			continue;
+
+		child->prs_err = 0;
+		if (!cpumask_subset(child->effective_xcpus,
+				    cs->effective_xcpus))
+			child->prs_err = PERR_INVCPUS;
+		else if (populated &&
+			 cpumask_subset(new_ecpus, child->effective_xcpus))
+			child->prs_err = PERR_NOCPUS;
+
+		if (child->prs_err) {
+			int old_prs = child->partition_root_state;
+
+			/*
+			 * Invalidate child partition
+			 */
+			spin_lock_irq(&callback_lock);
+			make_partition_invalid(child);
+			cs->nr_subparts--;
+			child->nr_subparts = 0;
+			spin_unlock_irq(&callback_lock);
+			notify_partition_change(child, old_prs);
+			continue;
+		}
+		cpumask_andnot(new_ecpus, new_ecpus,
+			       child->effective_xcpus);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * update_cpumasks_hier() flags
  */
@@ -1634,6 +1777,19 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 
 		compute_effective_cpumask(tmp->new_cpus, cp, parent);
 
+		if (is_partition_valid(parent) && is_partition_valid(cp))
+			compute_partition_effective_cpumask(cp, tmp->new_cpus);
+
+		/*
+		 * A partition with no effective_cpus is allowed as long as
+		 * there is no task associated with it. Call
+		 * update_parent_effective_cpumask() to check it.
+		 */
+		if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
+			update_parent = true;
+			goto update_parent_effective;
+		}
+
 		/*
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs unless
@@ -1641,10 +1797,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 * out all its CPUs.
 		 */
 		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
-			if (is_partition_valid(cp) &&
-			    cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
-				goto update_parent_subparts;
-
 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
 			if (!cp->use_parent_ecpus) {
 				cp->use_parent_ecpus = true;
@@ -1671,12 +1823,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 			continue;
 		}
 
-update_parent_subparts:
+update_parent_effective:
 		/*
-		 * update_parent_subparts_cpumask() should have been called
+		 * update_parent_effective_cpumask() should have been called
 		 * for cs already in update_cpumask(). We should also call
 		 * update_tasks_cpumask() again for tasks in the parent
-		 * cpuset if the parent's subparts_cpus changes.
+		 * cpuset if the parent's effective_cpus changes.
 		 */
 		old_prs = new_prs = cp->partition_root_state;
 		if ((cp != cs) && old_prs) {
@@ -1706,8 +1858,7 @@ update_parent_subparts:
 		rcu_read_unlock();
 
 		if (update_parent) {
-			update_parent_subparts_cpumask(cp, partcmd_update, NULL,
-						       tmp);
+			update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
 			/*
 			 * The cpuset partition_root_state may become
 			 * invalid. Capture it.
@@ -1716,30 +1867,18 @@ update_parent_subparts:
 		}
 
 		spin_lock_irq(&callback_lock);
-
-		if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
-			/*
-			 * Put all active subparts_cpus back to effective_cpus.
-			 */
-			cpumask_or(tmp->new_cpus, tmp->new_cpus,
-				   cp->subparts_cpus);
-			cpumask_and(tmp->new_cpus, tmp->new_cpus,
-				   cpu_active_mask);
-			cp->nr_subparts_cpus = 0;
-			cpumask_clear(cp->subparts_cpus);
-		}
-
 		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
-		if (cp->nr_subparts_cpus) {
-			/*
-			 * Make sure that effective_cpus & subparts_cpus
-			 * are mutually exclusive.
-			 */
-			cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
-				       cp->subparts_cpus);
-		}
-
 		cp->partition_root_state = new_prs;
+		if ((new_prs > 0) && cpumask_empty(cp->effective_xcpus))
+			cpumask_and(cp->effective_xcpus,
+				    cp->cpus_allowed, parent->effective_xcpus);
+		if (new_prs < 0) {
+			/* Reset partition data */
+			cp->nr_subparts = 0;
+			cpumask_clear(cp->effective_xcpus);
+			if (is_cpu_exclusive(cp))
+				clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+		}
 		spin_unlock_irq(&callback_lock);
 
 		notify_partition_change(cp, old_prs);
@@ -1836,6 +1975,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 {
 	int retval;
 	struct tmpmasks tmp;
+	struct cpuset *parent = parent_cs(cs);
 	bool invalidate = false;
 	int old_prs = cs->partition_root_state;
 
@@ -1851,6 +1991,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 */
 	if (!*buf) {
 		cpumask_clear(trialcs->cpus_allowed);
+		cpumask_clear(trialcs->effective_xcpus);
 	} else {
 		retval = cpulist_parse(buf, trialcs->cpus_allowed);
 		if (retval < 0)
@@ -1859,6 +2000,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (!cpumask_subset(trialcs->cpus_allowed,
 				    top_cpuset.cpus_allowed))
 			return -EINVAL;
+
+		/*
+		 * When effective_xcpus is set, make sure it is a subset of
+		 * cpus_allowed and parent's effective_xcpus.
+		 */
+		cpumask_and(trialcs->effective_xcpus,
+			    parent->effective_xcpus, trialcs->cpus_allowed);
 	}
 
 	/* Nothing to do if the cpus didn't change */
@@ -1868,11 +2016,21 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (alloc_cpumasks(NULL, &tmp))
 		return -ENOMEM;
 
+	if (is_partition_valid(cs)) {
+		if (cpumask_empty(trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_INVCPUS;
+		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_NOCPUS;
+		}
+	}
+
 	retval = validate_change(cs, trialcs);
 
 	if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
-		struct cpuset *cp, *parent;
 		struct cgroup_subsys_state *css;
+		struct cpuset *cp;
 
 		/*
 		 * The -EINVAL error code indicates that partition sibling
@@ -1883,69 +2041,44 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		 */
 		invalidate = true;
 		rcu_read_lock();
-		parent = parent_cs(cs);
 		cpuset_for_each_child(cp, css, parent)
 			if (is_partition_valid(cp) &&
-			    cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+			    cpumask_intersects(trialcs->effective_xcpus, cp->effective_xcpus)) {
 				rcu_read_unlock();
-				update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
 				rcu_read_lock();
 			}
 		rcu_read_unlock();
 		retval = 0;
 	}
+
 	if (retval < 0)
 		goto out_free;
 
 	if (cs->partition_root_state) {
 		if (invalidate)
-			update_parent_subparts_cpumask(cs, partcmd_invalidate,
-						       NULL, &tmp);
+			update_parent_effective_cpumask(cs, partcmd_invalidate,
+							NULL, &tmp);
 		else
-			update_parent_subparts_cpumask(cs, partcmd_update,
-						trialcs->cpus_allowed, &tmp);
+			update_parent_effective_cpumask(cs, partcmd_update,
+						trialcs->effective_xcpus, &tmp);
 	}
 
-	compute_effective_cpumask(trialcs->effective_cpus, trialcs,
-				  parent_cs(cs));
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+	if (!is_partition_valid(cs))
+		cpumask_clear(cs->effective_xcpus);
+	else
+		cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
 
-	/*
-	 * Make sure that subparts_cpus, if not empty, is a subset of
-	 * cpus_allowed. Clear subparts_cpus if partition not valid or
-	 * empty effective cpus with tasks.
-	 */
-	if (cs->nr_subparts_cpus) {
-		if (!is_partition_valid(cs) ||
-		   (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
-		    partition_is_populated(cs, NULL))) {
-			cs->nr_subparts_cpus = 0;
-			cpumask_clear(cs->subparts_cpus);
-		} else {
-			cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
-				    cs->cpus_allowed);
-			cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
-		}
-	}
 	spin_unlock_irq(&callback_lock);
 
 	/* effective_cpus will be updated here */
 	update_cpumasks_hier(cs, &tmp, 0);
 
-	if (cs->partition_root_state) {
-		struct cpuset *parent = parent_cs(cs);
-
-		/*
-		 * For partition root, update the cpumasks of sibling
-		 * cpusets if they use parent's effective_cpus.
-		 */
-		if (parent->child_ecpus_count)
-			update_sibling_cpumasks(parent, cs, &tmp);
-
-		/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */
+	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+	if (cs->partition_root_state)
 		update_partition_sd_lb(cs, old_prs);
-	}
 out_free:
 	free_cpumasks(NULL, &tmp);
 	return 0;
@@ -2323,7 +2456,6 @@ out:
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
 	int err = PERR_NONE, old_prs = cs->partition_root_state;
-	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
 
 	if (old_prs == new_prs)
@@ -2341,6 +2473,19 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	if (alloc_cpumasks(NULL, &tmpmask))
 		return -ENOMEM;
 
+	/*
+	 * Setup effective_xcpus if not set yet, it will be cleared later
+	 * if partition becomes invalid.
+	 */
+	if ((new_prs > 0) && cpumask_empty(cs->effective_xcpus)) {
+		struct cpuset *parent = parent_cs(cs);
+
+		spin_lock_irq(&callback_lock);
+		cpumask_and(cs->effective_xcpus,
+			    cs->cpus_allowed, parent->effective_xcpus);
+		spin_unlock_irq(&callback_lock);
+	}
+
 	err = update_partition_exclusive(cs, new_prs);
 	if (err)
 		goto out;
@@ -2354,8 +2499,8 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 			goto out;
 		}
 
-		err = update_parent_subparts_cpumask(cs, partcmd_enable,
-						     NULL, &tmpmask);
+		err = update_parent_effective_cpumask(cs, partcmd_enable,
+						      NULL, &tmpmask);
 	} else if (old_prs && new_prs) {
 		/*
 		 * A change in load balance state only, no change in cpumasks.
@@ -2366,19 +2511,13 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * Switching back to member is always allowed even if it
 		 * disables child partitions.
 		 */
-		update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
-					       &tmpmask);
+		update_parent_effective_cpumask(cs, partcmd_disable, NULL,
+						&tmpmask);
 
 		/*
-		 * If there are child partitions, they will all become invalid.
+		 * Invalidation of child partitions will be done in
+		 * update_cpumasks_hier().
 		 */
-		if (unlikely(cs->nr_subparts_cpus)) {
-			spin_lock_irq(&callback_lock);
-			cs->nr_subparts_cpus = 0;
-			cpumask_clear(cs->subparts_cpus);
-			compute_effective_cpumask(cs->effective_cpus, cs, parent);
-			spin_unlock_irq(&callback_lock);
-		}
 	}
 out:
 	/*
@@ -2393,14 +2532,12 @@ out:
 	spin_lock_irq(&callback_lock);
 	cs->partition_root_state = new_prs;
 	WRITE_ONCE(cs->prs_err, err);
+	if (!is_partition_valid(cs))
+		cpumask_clear(cs->effective_xcpus);
 	spin_unlock_irq(&callback_lock);
 
-	/*
-	 * Update child cpusets, if present.
-	 * Force update if switching back to member.
-	 */
-	if (!list_empty(&cs->css.children))
-		update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+	/* Force update if switching back to member */
+	update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
 
 	/* Update sched domains and load balance flag */
 	update_partition_sd_lb(cs, old_prs);
@@ -2649,7 +2786,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
 		guarantee_online_cpus(task, cpus_attach);
 	else
 		cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
-			       cs->subparts_cpus);
+			       subpartitions_cpus);
 	/*
 	 * can_attach beforehand should guarantee that this doesn't
 	 * fail.  TODO: have a better way to handle failure here
@@ -2752,6 +2889,7 @@ typedef enum {
 	FILE_EFFECTIVE_CPULIST,
 	FILE_EFFECTIVE_MEMLIST,
 	FILE_SUBPARTS_CPULIST,
+	FILE_EFFECTIVE_XCPULIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_MEM_HARDWALL,
@@ -2936,8 +3074,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_EFFECTIVE_MEMLIST:
 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
+	case FILE_EFFECTIVE_XCPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
+		break;
 	case FILE_SUBPARTS_CPULIST:
-		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
 		break;
 	default:
 		ret = -EINVAL;
@@ -3209,11 +3350,18 @@ static struct cftype dfl_files[] = {
 		.file_offset = offsetof(struct cpuset, partition_file),
 	},
 
+	{
+		.name = "cpus.exclusive.effective",
+		.seq_show = cpuset_common_seq_show,
+		.private = FILE_EFFECTIVE_XCPULIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
 	{
 		.name = "cpus.subpartitions",
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_SUBPARTS_CPULIST,
-		.flags = CFTYPE_DEBUG,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
 	},
 
 	{ }	/* terminate */
@@ -3387,6 +3535,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 
 	if (is_in_v2_mode()) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+		cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
 		top_cpuset.mems_allowed = node_possible_map;
 	} else {
 		cpumask_copy(top_cpuset.cpus_allowed,
@@ -3525,11 +3674,13 @@ int __init cpuset_init(void)
 {
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
-	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 	cpumask_setall(top_cpuset.effective_cpus);
+	cpumask_setall(top_cpuset.effective_xcpus);
 	nodes_setall(top_cpuset.effective_mems);
 
 	fmeter_init(&top_cpuset.fmeter);
@@ -3669,30 +3820,15 @@ retry:
 	compute_effective_cpumask(&new_cpus, cs, parent);
 	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
 
-	if (cs->nr_subparts_cpus)
-		/*
-		 * Make sure that CPUs allocated to child partitions
-		 * do not show up in effective_cpus.
-		 */
-		cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
-
 	if (!tmp || !cs->partition_root_state)
 		goto update_tasks;
 
 	/*
-	 * In the unlikely event that a partition root has empty
-	 * effective_cpus with tasks, we will have to invalidate child
-	 * partitions, if present, by setting nr_subparts_cpus to 0 to
-	 * reclaim their cpus.
+	 * Compute effective_cpus for valid partition root, may invalidate
+	 * child partition roots if necessary.
 	 */
-	if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
-	    cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
-		spin_lock_irq(&callback_lock);
-		cs->nr_subparts_cpus = 0;
-		cpumask_clear(cs->subparts_cpus);
-		spin_unlock_irq(&callback_lock);
-		compute_effective_cpumask(&new_cpus, cs, parent);
-	}
+	if (is_partition_valid(cs) && is_partition_valid(parent))
+		compute_partition_effective_cpumask(cs, &new_cpus);
 
 	/*
 	 * Force the partition to become invalid if either one of
@@ -3701,44 +3837,22 @@ retry:
 	 * 2) parent is invalid or doesn't grant any cpus to child
 	 *    partitions.
 	 */
-	if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
-	   (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
-		int old_prs, parent_prs;
-
-		update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
-		if (cs->nr_subparts_cpus) {
-			spin_lock_irq(&callback_lock);
-			cs->nr_subparts_cpus = 0;
-			cpumask_clear(cs->subparts_cpus);
-			spin_unlock_irq(&callback_lock);
-			compute_effective_cpumask(&new_cpus, cs, parent);
-		}
-
-		old_prs = cs->partition_root_state;
-		parent_prs = parent->partition_root_state;
-		if (is_partition_valid(cs)) {
-			spin_lock_irq(&callback_lock);
-			make_partition_invalid(cs);
-			spin_unlock_irq(&callback_lock);
-			if (is_prs_invalid(parent_prs))
-				WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
-			else if (!parent_prs)
-				WRITE_ONCE(cs->prs_err, PERR_NOTPART);
-			else
-				WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
-			notify_partition_change(cs, old_prs);
-		}
+	if (is_partition_valid(cs) && (!is_partition_valid(parent) ||
+				tasks_nocpu_error(parent, cs, &new_cpus))) {
+		update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
+		compute_effective_cpumask(&new_cpus, cs, parent);
 		cpuset_force_rebuild();
 	}
-
 	/*
 	 * On the other hand, an invalid partition root may be transitioned
 	 * back to a regular one.
 	 */
 	else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
-		update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
-		if (is_partition_valid(cs))
+		update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
+		if (is_partition_valid(cs)) {
+			compute_partition_effective_cpumask(cs, &new_cpus);
 			cpuset_force_rebuild();
+		}
 	}
 
 update_tasks:
@@ -3796,21 +3910,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	new_mems = node_states[N_MEMORY];
 
 	/*
-	 * If subparts_cpus is populated, it is likely that the check below
-	 * will produce a false positive on cpus_updated when the cpu list
-	 * isn't changed. It is extra work, but it is better to be safe.
+	 * If subpartitions_cpus is populated, it is likely that the check
+	 * below will produce a false positive on cpus_updated when the cpu
+	 * list isn't changed. It is extra work, but it is better to be safe.
 	 */
-	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
+		       !cpumask_empty(subpartitions_cpus);
 	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
 	/*
-	 * In the rare case that hotplug removes all the cpus in subparts_cpus,
-	 * we assumed that cpus are updated.
+	 * In the rare case that hotplug removes all the cpus in
+	 * subpartitions_cpus, we assumed that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+	if (!cpus_updated && top_cpuset.nr_subparts)
 		cpus_updated = true;
 
-	/* synchronize cpus_allowed to cpu_active_mask */
+	/* For v1, synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
 		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
@@ -3818,17 +3933,16 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		/*
 		 * Make sure that CPUs allocated to child partitions
 		 * do not show up in effective_cpus. If no CPU is left,
-		 * we clear the subparts_cpus & let the child partitions
+		 * we clear the subpartitions_cpus & let the child partitions
 		 * fight for the CPUs again.
 		 */
-		if (top_cpuset.nr_subparts_cpus) {
-			if (cpumask_subset(&new_cpus,
-					   top_cpuset.subparts_cpus)) {
-				top_cpuset.nr_subparts_cpus = 0;
-				cpumask_clear(top_cpuset.subparts_cpus);
+		if (!cpumask_empty(subpartitions_cpus)) {
+			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
+				top_cpuset.nr_subparts = 0;
+				cpumask_clear(subpartitions_cpus);
 			} else {
 				cpumask_andnot(&new_cpus, &new_cpus,
-					       top_cpuset.subparts_cpus);
+					       subpartitions_cpus);
 			}
 		}
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
@@ -3960,7 +4074,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 		 * We first exclude cpus allocated to partitions. If there is no
 		 * allowable online cpu left, we fall back to all possible cpus.
 		 */
-		cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus);
+		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
 		if (!cpumask_intersects(pmask, cpu_online_mask))
 			cpumask_copy(pmask, possible_mask);
 	}
-- 
cgit v1.2.3


From e2ffe502ba4505ee9c7b432980c702b7801a37f3 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Sep 2023 09:32:39 -0400
Subject: cgroup/cpuset: Add cpuset.cpus.exclusive for v2

This patch introduces a new writable "cpuset.cpus.exclusive" control
file for v2 which will be added to non-root cpuset enabled cgroups. This new
file enables user to set a smaller list of exclusive CPUs to be used in
the creation of a cpuset partition.

The value written to "cpuset.cpus.exclusive" may not be the effective
value being used for the creation of cpuset partition, the effective
value will show up in "cpuset.cpus.exclusive.effective" and it is
subject to the constraint that it must also be a subset of cpus_allowed
and parent's "cpuset.cpus.exclusive.effective".

By writing to "cpuset.cpus.exclusive", "cpuset.cpus.exclusive.effective"
may be set to a non-empty value even for cgroups that are not valid
partition roots yet.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 273 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 239 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b269c6b79e1a..0419654f3004 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -134,6 +134,11 @@ struct cpuset {
 	 */
 	cpumask_var_t effective_xcpus;
 
+	/*
+	 * Exclusive CPUs as requested by the user (default hierarchy only)
+	 */
+	cpumask_var_t exclusive_cpus;
+
 	/*
 	 * This is old Memory Nodes tasks took on.
 	 *
@@ -605,16 +610,18 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  */
 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
-	cpumask_var_t *pmask1, *pmask2, *pmask3;
+	cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
 
 	if (cs) {
 		pmask1 = &cs->cpus_allowed;
 		pmask2 = &cs->effective_cpus;
 		pmask3 = &cs->effective_xcpus;
+		pmask4 = &cs->exclusive_cpus;
 	} else {
 		pmask1 = &tmp->new_cpus;
 		pmask2 = &tmp->addmask;
 		pmask3 = &tmp->delmask;
+		pmask4 = NULL;
 	}
 
 	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -626,8 +633,14 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 		goto free_two;
 
+	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
+		goto free_three;
+
+
 	return 0;
 
+free_three:
+	free_cpumask_var(*pmask3);
 free_two:
 	free_cpumask_var(*pmask2);
 free_one:
@@ -646,6 +659,7 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 		free_cpumask_var(cs->cpus_allowed);
 		free_cpumask_var(cs->effective_cpus);
 		free_cpumask_var(cs->effective_xcpus);
+		free_cpumask_var(cs->exclusive_cpus);
 	}
 	if (tmp) {
 		free_cpumask_var(tmp->new_cpus);
@@ -674,6 +688,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
+	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
 	return trial;
 }
 
@@ -687,6 +702,13 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
+{
+	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
+	       cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
+						  : cs->effective_xcpus;
+}
+
 /*
  * cpu_exclusive_check() - check if two cpusets are exclusive
  *
@@ -694,14 +716,10 @@ static inline void free_cpuset(struct cpuset *cs)
  */
 static inline bool cpu_exclusive_check(struct cpuset *cs1, struct cpuset *cs2)
 {
-	struct cpumask *cpus1, *cpus2;
+	struct cpumask *xcpus1 = fetch_xcpus(cs1);
+	struct cpumask *xcpus2 = fetch_xcpus(cs2);
 
-	cpus1 = cpumask_empty(cs1->effective_xcpus)
-		? cs1->cpus_allowed : cs1->effective_xcpus;
-	cpus2 = cpumask_empty(cs2->effective_xcpus)
-		? cs2->cpus_allowed : cs2->effective_xcpus;
-
-	if (cpumask_intersects(cpus1, cpus2))
+	if (cpumask_intersects(xcpus1, xcpus2))
 		return -EINVAL;
 	return 0;
 }
@@ -1368,6 +1386,54 @@ static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
 		partition_is_populated(cs, NULL));
 }
 
+static void reset_partition_data(struct cpuset *cs)
+{
+	struct cpuset *parent = parent_cs(cs);
+
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+		return;
+
+	lockdep_assert_held(&callback_lock);
+
+	cs->nr_subparts = 0;
+	if (cpumask_empty(cs->exclusive_cpus)) {
+		cpumask_clear(cs->effective_xcpus);
+		if (is_cpu_exclusive(cs))
+			clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+	}
+	if (!cpumask_and(cs->effective_cpus,
+			 parent->effective_cpus, cs->cpus_allowed)) {
+		cs->use_parent_ecpus = true;
+		parent->child_ecpus_count++;
+		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+	}
+}
+
+/*
+ * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
+ * @cs: cpuset
+ * @xcpus: effective exclusive CPUs value to be set
+ * Return: true if xcpus is not empty, false otherwise.
+ *
+ * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
+ * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ */
+static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
+						struct cpumask *xcpus)
+{
+	struct cpuset *parent = parent_cs(cs);
+
+	if (!xcpus)
+		xcpus = cs->effective_xcpus;
+
+	if (!cpumask_empty(cs->exclusive_cpus))
+		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
+	else
+		cpumask_copy(xcpus, cs->cpus_allowed);
+
+	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+}
+
 /**
  * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
  * @cs:      The cpuset that requests change in partition root state
@@ -1426,7 +1492,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 	 */
 	adding = deleting = false;
 	old_prs = new_prs = cs->partition_root_state;
-	xcpus = !cpumask_empty(cs->effective_xcpus)
+	xcpus = !cpumask_empty(cs->exclusive_cpus)
 		? cs->effective_xcpus : cs->cpus_allowed;
 
 	if (cmd == partcmd_invalidate) {
@@ -1659,8 +1725,7 @@ write_error:
 
 	if (adding || deleting) {
 		update_tasks_cpumask(parent, tmp->addmask);
-		if (parent->child_ecpus_count)
-			update_sibling_cpumasks(parent, cs, tmp);
+		update_sibling_cpumasks(parent, cs, tmp);
 	}
 
 	/*
@@ -1709,7 +1774,9 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
 	 *  2) All the effective_cpus will be used up and cp
 	 *     has tasks
 	 */
-	cpumask_and(new_ecpus, cs->effective_xcpus, cpu_active_mask);
+	compute_effective_exclusive_cpumask(cs, new_ecpus);
+	cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
+
 	rcu_read_lock();
 	cpuset_for_each_child(child, css, cs) {
 		if (!is_partition_valid(child))
@@ -1777,6 +1844,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 
 		compute_effective_cpumask(tmp->new_cpus, cp, parent);
 
+		/*
+		 * Update effective_xcpus if exclusive_cpus set.
+		 * The case when exclusive_cpus isn't set is handled later.
+		 */
+		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
+			spin_lock_irq(&callback_lock);
+			compute_effective_exclusive_cpumask(cp, NULL);
+			spin_unlock_irq(&callback_lock);
+		}
+
 		if (is_partition_valid(parent) && is_partition_valid(cp))
 			compute_partition_effective_cpumask(cp, tmp->new_cpus);
 
@@ -1869,7 +1946,11 @@ update_parent_effective:
 		spin_lock_irq(&callback_lock);
 		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
 		cp->partition_root_state = new_prs;
-		if ((new_prs > 0) && cpumask_empty(cp->effective_xcpus))
+		/*
+		 * Make sure effective_xcpus is properly set for a valid
+		 * partition root.
+		 */
+		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
 			cpumask_and(cp->effective_xcpus,
 				    cp->cpus_allowed, parent->effective_xcpus);
 		if (new_prs < 0) {
@@ -1886,7 +1967,7 @@ update_parent_effective:
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
-		update_tasks_cpumask(cp, tmp->new_cpus);
+		update_tasks_cpumask(cp, cp->effective_cpus);
 
 		/*
 		 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -1939,8 +2020,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 
 	/*
 	 * Check all its siblings and call update_cpumasks_hier()
-	 * if their use_parent_ecpus flag is set in order for them
-	 * to use the right effective_cpus value.
+	 * if their effective_cpus will need to be changed.
+	 *
+	 * With the addition of effective_xcpus which is a subset of
+	 * cpus_allowed. It is possible a change in parent's effective_cpus
+	 * due to a change in a child partition's effective_xcpus will impact
+	 * its siblings even if they do not inherit parent's effective_cpus
+	 * directly.
 	 *
 	 * The update_cpumasks_hier() function may sleep. So we have to
 	 * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
@@ -1951,8 +2037,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 	cpuset_for_each_child(sibling, pos_css, parent) {
 		if (sibling == cs)
 			continue;
-		if (!sibling->use_parent_ecpus)
-			continue;
+		if (!sibling->use_parent_ecpus &&
+		    !is_partition_valid(sibling)) {
+			compute_effective_cpumask(tmp->new_cpus, sibling,
+						  parent);
+			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
+				continue;
+		}
 		if (!css_tryget_online(&sibling->css))
 			continue;
 
@@ -1977,6 +2068,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	struct tmpmasks tmp;
 	struct cpuset *parent = parent_cs(cs);
 	bool invalidate = false;
+	int hier_flags = 0;
 	int old_prs = cs->partition_root_state;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -2002,11 +2094,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 			return -EINVAL;
 
 		/*
-		 * When effective_xcpus is set, make sure it is a subset of
-		 * cpus_allowed and parent's effective_xcpus.
+		 * When exclusive_cpus isn't explicitly set, it is constrainted
+		 * by cpus_allowed and parent's effective_xcpus. Otherwise,
+		 * trialcs->effective_xcpus is used as a temporary cpumask
+		 * for checking validity of the partition root.
 		 */
-		cpumask_and(trialcs->effective_xcpus,
-			    parent->effective_xcpus, trialcs->cpus_allowed);
+		if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
+			compute_effective_exclusive_cpumask(trialcs, NULL);
 	}
 
 	/* Nothing to do if the cpus didn't change */
@@ -2026,6 +2120,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		}
 	}
 
+	/*
+	 * Check all the descendants in update_cpumasks_hier() if
+	 * effective_xcpus is to be changed.
+	 */
+	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+		hier_flags = HIER_CHECKALL;
+
 	retval = validate_change(cs, trialcs);
 
 	if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
@@ -2055,7 +2156,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto out_free;
 
-	if (cs->partition_root_state) {
+	if (is_partition_valid(cs)) {
 		if (invalidate)
 			update_parent_effective_cpumask(cs, partcmd_invalidate,
 							NULL, &tmp);
@@ -2066,15 +2167,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	if (!is_partition_valid(cs))
-		cpumask_clear(cs->effective_xcpus);
-	else
-		cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
-
+	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+	if ((old_prs > 0) && !is_partition_valid(cs))
+		reset_partition_data(cs);
 	spin_unlock_irq(&callback_lock);
 
-	/* effective_cpus will be updated here */
-	update_cpumasks_hier(cs, &tmp, 0);
+	/* effective_cpus/effective_xcpus will be updated here */
+	update_cpumasks_hier(cs, &tmp, hier_flags);
 
 	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
 	if (cs->partition_root_state)
@@ -2084,6 +2183,94 @@ out_free:
 	return 0;
 }
 
+/**
+ * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
+ * @cs: the cpuset to consider
+ * @trialcs: trial cpuset
+ * @buf: buffer of cpu numbers written to this cpuset
+ *
+ * The tasks' cpumask will be updated if cs is a valid partition root.
+ */
+static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
+				    const char *buf)
+{
+	int retval;
+	struct tmpmasks tmp;
+	struct cpuset *parent = parent_cs(cs);
+	bool invalidate = false;
+	int hier_flags = 0;
+	int old_prs = cs->partition_root_state;
+
+	if (!*buf) {
+		cpumask_clear(trialcs->exclusive_cpus);
+	} else {
+		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
+		if (retval < 0)
+			return retval;
+		if (!is_cpu_exclusive(cs))
+			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
+	}
+
+	/* Nothing to do if the CPUs didn't change */
+	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
+		return 0;
+
+	if (alloc_cpumasks(NULL, &tmp))
+		return -ENOMEM;
+
+	compute_effective_exclusive_cpumask(trialcs, NULL);
+
+	/*
+	 * Check all the descendants in update_cpumasks_hier() if
+	 * effective_xcpus is to be changed.
+	 */
+	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
+		hier_flags = HIER_CHECKALL;
+
+	retval = validate_change(cs, trialcs);
+	if (retval)
+		return retval;
+
+	if (is_partition_valid(cs)) {
+		if (cpumask_empty(trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_INVCPUS;
+		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_NOCPUS;
+		}
+
+		if (invalidate)
+			update_parent_effective_cpumask(cs, partcmd_invalidate,
+							NULL, &tmp);
+		else
+			update_parent_effective_cpumask(cs, partcmd_update,
+						trialcs->effective_xcpus, &tmp);
+	}
+
+	spin_lock_irq(&callback_lock);
+	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
+	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
+	if ((old_prs > 0) && !is_partition_valid(cs))
+		reset_partition_data(cs);
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
+	 * of the subtree when it is a valid partition root or effective_xcpus
+	 * is updated.
+	 */
+	if (is_partition_valid(cs) || hier_flags)
+		update_cpumasks_hier(cs, &tmp, hier_flags);
+
+	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
+	if (cs->partition_root_state)
+		update_partition_sd_lb(cs, old_prs);
+
+	free_cpumasks(NULL, &tmp);
+	return 0;
+}
+
 /*
  * Migrate memory region from one set of nodes to another.  This is
  * performed asynchronously as it can be called from process migration path
@@ -2474,10 +2661,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		return -ENOMEM;
 
 	/*
-	 * Setup effective_xcpus if not set yet, it will be cleared later
-	 * if partition becomes invalid.
+	 * Setup effective_xcpus if not properly set yet, it will be cleared
+	 * later if partition becomes invalid.
 	 */
-	if ((new_prs > 0) && cpumask_empty(cs->effective_xcpus)) {
+	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
 		struct cpuset *parent = parent_cs(cs);
 
 		spin_lock_irq(&callback_lock);
@@ -2533,7 +2720,7 @@ out:
 	cs->partition_root_state = new_prs;
 	WRITE_ONCE(cs->prs_err, err);
 	if (!is_partition_valid(cs))
-		cpumask_clear(cs->effective_xcpus);
+		reset_partition_data(cs);
 	spin_unlock_irq(&callback_lock);
 
 	/* Force update if switching back to member */
@@ -2889,6 +3076,7 @@ typedef enum {
 	FILE_EFFECTIVE_CPULIST,
 	FILE_EFFECTIVE_MEMLIST,
 	FILE_SUBPARTS_CPULIST,
+	FILE_EXCLUSIVE_CPULIST,
 	FILE_EFFECTIVE_XCPULIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
@@ -3027,6 +3215,9 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	case FILE_CPULIST:
 		retval = update_cpumask(cs, trialcs, buf);
 		break;
+	case FILE_EXCLUSIVE_CPULIST:
+		retval = update_exclusive_cpumask(cs, trialcs, buf);
+		break;
 	case FILE_MEMLIST:
 		retval = update_nodemask(cs, trialcs, buf);
 		break;
@@ -3074,6 +3265,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_EFFECTIVE_MEMLIST:
 		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
 		break;
+	case FILE_EXCLUSIVE_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
+		break;
 	case FILE_EFFECTIVE_XCPULIST:
 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
 		break;
@@ -3350,6 +3544,15 @@ static struct cftype dfl_files[] = {
 		.file_offset = offsetof(struct cpuset, partition_file),
 	},
 
+	{
+		.name = "cpus.exclusive",
+		.seq_show = cpuset_common_seq_show,
+		.write = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
+		.private = FILE_EXCLUSIVE_CPULIST,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+
 	{
 		.name = "cpus.exclusive.effective",
 		.seq_show = cpuset_common_seq_show,
@@ -3675,12 +3878,14 @@ int __init cpuset_init(void)
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
 	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 	cpumask_setall(top_cpuset.effective_cpus);
 	cpumask_setall(top_cpuset.effective_xcpus);
+	cpumask_setall(top_cpuset.exclusive_cpus);
 	nodes_setall(top_cpuset.effective_mems);
 
 	fmeter_init(&top_cpuset.fmeter);
-- 
cgit v1.2.3


From 181c8e091aae11b0b7efba49b34adfe3c89ce648 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Sep 2023 09:32:40 -0400
Subject: cgroup/cpuset: Introduce remote partition

One can use "cpuset.cpus.partition" to create multiple scheduling domains
or to produce a set of isolated CPUs where load balancing is disabled.
The former use case is less common but the latter one can be frequently
used especially for the Telco use cases like DPDK.

The existing "isolated" partition can be used to produce isolated
CPUs if the applications have full control of a system. However, in a
containerized environment where all the apps are run in a container,
it is hard to distribute out isolated CPUs from the root down given
the unified hierarchy nature of cgroup v2.

The container running on isolated CPUs can be several layers down from
the root. The current partition feature requires that all the ancestors
of a leaf partition root must be parititon roots themselves. This can
be hard to configure.

This patch introduces a new type of partition called remote partition.
A remote partition is a partition whose parent is not a partition root
itself and its CPUs are acquired directly from available CPUs in the
top cpuset through a hierachical distribution of exclusive CPUs down
from it.

By contrast, the existing type of partitions where their parents have
to be valid partition roots are referred to as local partitions as they
have to be clustered around a parent partition root.

Child local partitons can be created under a remote partition, but
a remote partition cannot be created under a local partition. We may
relax this limitation in the future if there are use cases for such
configuration.

Manually writing to the "cpuset.cpus.exclusive" file is not necessary
when creating local partitions.  However, writing proper values to
"cpuset.cpus.exclusive" down the cgroup hierarchy before the target
remote partition root is mandatory for the creation of a remote
partition.

The value in "cpuset.cpus.exclusive.effective" may change if its
"cpuset.cpus" or its parent's "cpuset.cpus.exclusive.effective" changes.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 335 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 306 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0419654f3004..7ac320e079b8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -192,6 +192,9 @@ struct cpuset {
 
 	/* Handle for cpuset.cpus.partition */
 	struct cgroup_file partition_file;
+
+	/* Remote partition silbling list anchored at remote_children */
+	struct list_head remote_sibling;
 };
 
 /*
@@ -199,6 +202,9 @@ struct cpuset {
  */
 static cpumask_var_t	subpartitions_cpus;
 
+/* List of remote partition root children */
+static struct list_head remote_children;
+
 /*
  * Partition root states:
  *
@@ -348,6 +354,7 @@ static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 		  (1 << CS_MEM_EXCLUSIVE)),
 	.partition_root_state = PRS_ROOT,
+	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
 };
 
 /**
@@ -1434,6 +1441,211 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
 	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
 }
 
+static inline bool is_remote_partition(struct cpuset *cs)
+{
+	return !list_empty(&cs->remote_sibling);
+}
+
+static inline bool is_local_partition(struct cpuset *cs)
+{
+	return is_partition_valid(cs) && !is_remote_partition(cs);
+}
+
+/*
+ * remote_partition_enable - Enable current cpuset as a remote partition root
+ * @cs: the cpuset to update
+ * @tmp: temparary masks
+ * Return: 1 if successful, 0 if error
+ *
+ * Enable the current cpuset to become a remote partition root taking CPUs
+ * directly from the top cpuset. cpuset_mutex must be held by the caller.
+ */
+static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+	/*
+	 * The user must have sysadmin privilege.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	/*
+	 * The requested exclusive_cpus must not be allocated to other
+	 * partitions and it can't use up all the root's effective_cpus.
+	 *
+	 * Note that if there is any local partition root above it or
+	 * remote partition root underneath it, its exclusive_cpus must
+	 * have overlapped with subpartitions_cpus.
+	 */
+	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+	if (cpumask_empty(tmp->new_cpus) ||
+	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
+		return 0;
+
+	spin_lock_irq(&callback_lock);
+	cpumask_andnot(top_cpuset.effective_cpus,
+		       top_cpuset.effective_cpus, tmp->new_cpus);
+	cpumask_or(subpartitions_cpus,
+		   subpartitions_cpus, tmp->new_cpus);
+
+	if (cs->use_parent_ecpus) {
+		struct cpuset *parent = parent_cs(cs);
+
+		cs->use_parent_ecpus = false;
+		parent->child_ecpus_count--;
+	}
+	list_add(&cs->remote_sibling, &remote_children);
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+	 */
+	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+
+	return 1;
+}
+
+/*
+ * remote_partition_disable - Remove current cpuset from remote partition list
+ * @cs: the cpuset to update
+ * @tmp: temparary masks
+ *
+ * The effective_cpus is also updated.
+ *
+ * cpuset_mutex must be held by the caller.
+ */
+static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
+{
+	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+	WARN_ON_ONCE(!is_remote_partition(cs));
+	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
+
+	spin_lock_irq(&callback_lock);
+	cpumask_andnot(subpartitions_cpus,
+		       subpartitions_cpus, tmp->new_cpus);
+	cpumask_and(tmp->new_cpus,
+		    tmp->new_cpus, cpu_active_mask);
+	cpumask_or(top_cpuset.effective_cpus,
+		   top_cpuset.effective_cpus, tmp->new_cpus);
+	list_del_init(&cs->remote_sibling);
+	cs->partition_root_state = -cs->partition_root_state;
+	if (!cs->prs_err)
+		cs->prs_err = PERR_INVCPUS;
+	reset_partition_data(cs);
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+	 */
+	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+}
+
+/*
+ * remote_cpus_update - cpus_exclusive change of remote partition
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @tmp: temparary masks
+ *
+ * top_cpuset and subpartitions_cpus will be updated or partition can be
+ * invalidated.
+ */
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
+			       struct tmpmasks *tmp)
+{
+	bool adding, deleting;
+
+	if (WARN_ON_ONCE(!is_remote_partition(cs)))
+		return;
+
+	WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
+
+	if (cpumask_empty(newmask))
+		goto invalidate;
+
+	adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
+	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+
+	/*
+	 * Additions of remote CPUs is only allowed if those CPUs are
+	 * not allocated to other partitions and there are effective_cpus
+	 * left in the top cpuset.
+	 */
+	if (adding && (!capable(CAP_SYS_ADMIN) ||
+		       cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+		       cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
+		goto invalidate;
+
+	spin_lock_irq(&callback_lock);
+	if (adding) {
+		cpumask_or(subpartitions_cpus,
+			   subpartitions_cpus, tmp->addmask);
+		cpumask_andnot(top_cpuset.effective_cpus,
+			       top_cpuset.effective_cpus, tmp->addmask);
+	}
+	if (deleting) {
+		cpumask_andnot(subpartitions_cpus,
+			       subpartitions_cpus, tmp->delmask);
+		cpumask_and(tmp->delmask,
+			    tmp->delmask, cpu_active_mask);
+		cpumask_or(top_cpuset.effective_cpus,
+			   top_cpuset.effective_cpus, tmp->delmask);
+	}
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+	 */
+	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
+	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
+	return;
+
+invalidate:
+	remote_partition_disable(cs, tmp);
+}
+
+/*
+ * remote_partition_check - check if a child remote partition needs update
+ * @cs: the cpuset to be updated
+ * @newmask: the new effective_xcpus mask
+ * @delmask: temporary mask for deletion (not in tmp)
+ * @tmp: temparary masks
+ *
+ * This should be called before the given cs has updated its cpus_allowed
+ * and/or effective_xcpus.
+ */
+static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
+				   struct cpumask *delmask, struct tmpmasks *tmp)
+{
+	struct cpuset *child, *next;
+	int disable_cnt = 0;
+
+	/*
+	 * Compute the effective exclusive CPUs that will be deleted.
+	 */
+	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
+	    !cpumask_intersects(delmask, subpartitions_cpus))
+		return;	/* No deletion of exclusive CPUs in partitions */
+
+	/*
+	 * Searching the remote children list to look for those that will
+	 * be impacted by the deletion of exclusive CPUs.
+	 *
+	 * Since a cpuset must be removed from the remote children list
+	 * before it can go offline and holding cpuset_mutex will prevent
+	 * any change in cpuset status. RCU read lock isn't needed.
+	 */
+	lockdep_assert_held(&cpuset_mutex);
+	list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
+		if (cpumask_intersects(child->effective_cpus, delmask)) {
+			remote_partition_disable(child, tmp);
+			disable_cnt++;
+		}
+	if (disable_cnt)
+		rebuild_sched_domains_locked();
+}
+
 /**
  * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
  * @cs:      The cpuset that requests change in partition root state
@@ -1548,7 +1760,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		subparts_delta++;
 	} else if (cmd == partcmd_disable) {
 		/*
-		 n* May need to add cpus to parent's effective_cpus for
+		 * May need to add cpus to parent's effective_cpus for
 		 * valid partition root.
 		 */
 		adding = !is_prs_invalid(old_prs) &&
@@ -1749,7 +1961,7 @@ write_error:
  * @new_ecpus: previously computed effective_cpus to be updated
  *
  * Compute the effective_cpus of a partition root by scanning effective_xcpus
- * of child partition roots and exclusing their effective_xcpus.
+ * of child partition roots and excluding their effective_xcpus.
  *
  * This has the side effect of invalidating valid child partition roots,
  * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
@@ -1840,9 +2052,17 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 	rcu_read_lock();
 	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
 		struct cpuset *parent = parent_cs(cp);
+		bool remote = is_remote_partition(cp);
 		bool update_parent = false;
 
-		compute_effective_cpumask(tmp->new_cpus, cp, parent);
+		/*
+		 * Skip descendent remote partition that acquires CPUs
+		 * directly from top cpuset unless it is cs.
+		 */
+		if (remote && (cp != cs)) {
+			pos_css = css_rightmost_descendant(pos_css);
+			continue;
+		}
 
 		/*
 		 * Update effective_xcpus if exclusive_cpus set.
@@ -1854,8 +2074,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 			spin_unlock_irq(&callback_lock);
 		}
 
-		if (is_partition_valid(parent) && is_partition_valid(cp))
+		old_prs = new_prs = cp->partition_root_state;
+		if (remote || (is_partition_valid(parent) &&
+			       is_partition_valid(cp)))
 			compute_partition_effective_cpumask(cp, tmp->new_cpus);
+		else
+			compute_effective_cpumask(tmp->new_cpus, cp, parent);
 
 		/*
 		 * A partition with no effective_cpus is allowed as long as
@@ -1873,7 +2097,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 * it is a partition root that has explicitly distributed
 		 * out all its CPUs.
 		 */
-		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
 			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
 			if (!cp->use_parent_ecpus) {
 				cp->use_parent_ecpus = true;
@@ -1885,6 +2109,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 			parent->child_ecpus_count--;
 		}
 
+		if (remote)
+			goto get_css;
+
 		/*
 		 * Skip the whole subtree if
 		 * 1) the cpumask remains the same,
@@ -1907,7 +2134,6 @@ update_parent_effective:
 		 * update_tasks_cpumask() again for tasks in the parent
 		 * cpuset if the parent's effective_cpus changes.
 		 */
-		old_prs = new_prs = cp->partition_root_state;
 		if ((cp != cs) && old_prs) {
 			switch (parent->partition_root_state) {
 			case PRS_ROOT:
@@ -1929,7 +2155,7 @@ update_parent_effective:
 				break;
 			}
 		}
-
+get_css:
 		if (!css_tryget_online(&cp->css))
 			continue;
 		rcu_read_unlock();
@@ -1953,13 +2179,8 @@ update_parent_effective:
 		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
 			cpumask_and(cp->effective_xcpus,
 				    cp->cpus_allowed, parent->effective_xcpus);
-		if (new_prs < 0) {
-			/* Reset partition data */
-			cp->nr_subparts = 0;
-			cpumask_clear(cp->effective_xcpus);
-			if (is_cpu_exclusive(cp))
-				clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
-		}
+		else if (new_prs < 0)
+			reset_partition_data(cp);
 		spin_unlock_irq(&callback_lock);
 
 		notify_partition_change(cp, old_prs);
@@ -2157,12 +2378,23 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		goto out_free;
 
 	if (is_partition_valid(cs)) {
-		if (invalidate)
+		/*
+		 * Call remote_cpus_update() to handle valid remote partition
+		 */
+		if (is_remote_partition(cs))
+			remote_cpus_update(cs, trialcs->effective_xcpus, &tmp);
+		else if (invalidate)
 			update_parent_effective_cpumask(cs, partcmd_invalidate,
 							NULL, &tmp);
 		else
 			update_parent_effective_cpumask(cs, partcmd_update,
 						trialcs->effective_xcpus, &tmp);
+	} else if (!cpumask_empty(cs->exclusive_cpus)) {
+		/*
+		 * Use trialcs->effective_cpus as a temp cpumask
+		 */
+		remote_partition_check(cs, trialcs->effective_xcpus,
+				       trialcs->effective_cpus, &tmp);
 	}
 
 	spin_lock_irq(&callback_lock);
@@ -2203,6 +2435,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
 	if (!*buf) {
 		cpumask_clear(trialcs->exclusive_cpus);
+		cpumask_clear(trialcs->effective_xcpus);
 	} else {
 		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
 		if (retval < 0)
@@ -2218,7 +2451,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (alloc_cpumasks(NULL, &tmp))
 		return -ENOMEM;
 
-	compute_effective_exclusive_cpumask(trialcs, NULL);
+	if (*buf)
+		compute_effective_exclusive_cpumask(trialcs, NULL);
 
 	/*
 	 * Check all the descendants in update_cpumasks_hier() if
@@ -2240,14 +2474,26 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 			cs->prs_err = PERR_NOCPUS;
 		}
 
-		if (invalidate)
+		if (is_remote_partition(cs)) {
+			if (invalidate)
+				remote_partition_disable(cs, &tmp);
+			else
+				remote_cpus_update(cs, trialcs->effective_xcpus,
+						   &tmp);
+		} else if (invalidate) {
 			update_parent_effective_cpumask(cs, partcmd_invalidate,
 							NULL, &tmp);
-		else
+		} else {
 			update_parent_effective_cpumask(cs, partcmd_update,
 						trialcs->effective_xcpus, &tmp);
+		}
+	} else if (!cpumask_empty(trialcs->exclusive_cpus)) {
+		/*
+		 * Use trialcs->effective_cpus as a temp cpumask
+		 */
+		remote_partition_check(cs, trialcs->effective_xcpus,
+				       trialcs->effective_cpus, &tmp);
 	}
-
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
 	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
@@ -2643,18 +2889,25 @@ out:
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
 	int err = PERR_NONE, old_prs = cs->partition_root_state;
+	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
 
 	if (old_prs == new_prs)
 		return 0;
 
 	/*
-	 * For a previously invalid partition root, leave it at being
-	 * invalid if new_prs is not "member".
+	 * For a previously invalid partition root with valid partition root
+	 * parent, treat it as if it is a "member". Otherwise, reject it as
+	 * remote partition cannot currently self-recover from an invalid
+	 * state.
 	 */
 	if (new_prs && is_prs_invalid(old_prs)) {
-		cs->partition_root_state = -new_prs;
-		return 0;
+		if (is_partition_valid(parent)) {
+			old_prs = PRS_MEMBER;
+		} else {
+			cs->partition_root_state = -new_prs;
+			return 0;
+		}
 	}
 
 	if (alloc_cpumasks(NULL, &tmpmask))
@@ -2665,8 +2918,6 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	 * later if partition becomes invalid.
 	 */
 	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
-		struct cpuset *parent = parent_cs(cs);
-
 		spin_lock_irq(&callback_lock);
 		cpumask_and(cs->effective_xcpus,
 			    cs->cpus_allowed, parent->effective_xcpus);
@@ -2688,6 +2939,12 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 
 		err = update_parent_effective_cpumask(cs, partcmd_enable,
 						      NULL, &tmpmask);
+		/*
+		 * If an attempt to become local partition root fails,
+		 * try to become a remote partition root instead.
+		 */
+		if (err && remote_partition_enable(cs, &tmpmask))
+			err = 0;
 	} else if (old_prs && new_prs) {
 		/*
 		 * A change in load balance state only, no change in cpumasks.
@@ -2698,8 +2955,11 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * Switching back to member is always allowed even if it
 		 * disables child partitions.
 		 */
-		update_parent_effective_cpumask(cs, partcmd_disable, NULL,
-						&tmpmask);
+		if (is_remote_partition(cs))
+			remote_partition_disable(cs, &tmpmask);
+		else
+			update_parent_effective_cpumask(cs, partcmd_disable,
+							NULL, &tmpmask);
 
 		/*
 		 * Invalidation of child partitions will be done in
@@ -3602,6 +3862,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	nodes_clear(cs->effective_mems);
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
+	INIT_LIST_HEAD(&cs->remote_sibling);
 
 	/* Set CS_MEMORY_MIGRATE for default hierarchy */
 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
@@ -3637,6 +3898,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->effective_mems = parent->effective_mems;
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
+		/*
+		 * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
+		 */
+		if (!is_sched_load_balance(parent))
+			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	}
 
 	/*
@@ -3891,6 +4157,7 @@ int __init cpuset_init(void)
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
+	INIT_LIST_HEAD(&remote_children);
 
 	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
 
@@ -4006,6 +4273,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	static nodemask_t new_mems;
 	bool cpus_updated;
 	bool mems_updated;
+	bool remote;
 	struct cpuset *parent;
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -4032,9 +4300,18 @@ retry:
 	 * Compute effective_cpus for valid partition root, may invalidate
 	 * child partition roots if necessary.
 	 */
-	if (is_partition_valid(cs) && is_partition_valid(parent))
+	remote = is_remote_partition(cs);
+	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
 		compute_partition_effective_cpumask(cs, &new_cpus);
 
+	if (remote && cpumask_empty(&new_cpus) &&
+	    partition_is_populated(cs, NULL)) {
+		remote_partition_disable(cs, tmp);
+		compute_effective_cpumask(&new_cpus, cs, parent);
+		remote = false;
+		cpuset_force_rebuild();
+	}
+
 	/*
 	 * Force the partition to become invalid if either one of
 	 * the following conditions hold:
@@ -4042,7 +4319,7 @@ retry:
 	 * 2) parent is invalid or doesn't grant any cpus to child
 	 *    partitions.
 	 */
-	if (is_partition_valid(cs) && (!is_partition_valid(parent) ||
+	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
 				tasks_nocpu_error(parent, cs, &new_cpus))) {
 		update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
 		compute_effective_cpumask(&new_cpus, cs, parent);
-- 
cgit v1.2.3


From 4a74e418881f26cdeae1011453acd66cedc8ad2c Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Sep 2023 09:32:41 -0400
Subject: cgroup/cpuset: Check partition conflict with housekeeping setup

A user can pre-configure certain CPUs in an isolated state at boot time
with the "isolcpus" kernel boot command line option. Those CPUs will
not be in the housekeeping_cpumask(HK_TYPE_DOMAIN) and so will not
be in any sched domains. This may conflict with the partition setup
at runtime. Those boot time isolated CPUs should only be used in an
isolated partition.

This patch adds the necessary check and disallows partition setup if the
check fails.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 7ac320e079b8..15f399153a2e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -75,6 +75,7 @@ enum prs_errcode {
 	PERR_NOCPUS,
 	PERR_HOTPLUG,
 	PERR_CPUSEMPTY,
+	PERR_HKEEPING,
 };
 
 static const char * const perr_strings[] = {
@@ -85,6 +86,7 @@ static const char * const perr_strings[] = {
 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
 	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
 };
 
 struct cpuset {
@@ -1646,6 +1648,26 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
 		rebuild_sched_domains_locked();
 }
 
+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
+ * an isolated partition.
+ */
+static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
+{
+	const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
+	bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
+
+	if (!all_in_hk && (prstate != PRS_ISOLATED))
+		return true;
+
+	return false;
+}
+
 /**
  * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
  * @cs:      The cpuset that requests change in partition root state
@@ -1748,6 +1770,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		    !cpumask_intersects(xcpus, parent->effective_xcpus))
 			return PERR_INVCPUS;
 
+		if (prstate_housekeeping_conflict(new_prs, xcpus))
+			return PERR_HKEEPING;
+
 		/*
 		 * A parent can be left with no CPU as long as there is no
 		 * task directly associated with the parent partition.
@@ -2335,6 +2360,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (cpumask_empty(trialcs->effective_xcpus)) {
 			invalidate = true;
 			cs->prs_err = PERR_INVCPUS;
+		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_HKEEPING;
 		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
 			invalidate = true;
 			cs->prs_err = PERR_NOCPUS;
@@ -2469,6 +2497,9 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (cpumask_empty(trialcs->effective_xcpus)) {
 			invalidate = true;
 			cs->prs_err = PERR_INVCPUS;
+		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
+			invalidate = true;
+			cs->prs_err = PERR_HKEEPING;
 		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
 			invalidate = true;
 			cs->prs_err = PERR_NOCPUS;
-- 
cgit v1.2.3


From 82845683ca6a15fe8c7912c6264bb0e84ec6f5fb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 19 Sep 2023 10:31:15 +0200
Subject: sched/fair: Rename check_preempt_wakeup() to
 check_preempt_wakeup_fair()

Other scheduling classes already postfix their similar methods
with the class name.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0877878bcdb..aeaf31e32c67 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7994,7 +7994,7 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
@@ -12830,7 +12830,7 @@ DEFINE_SCHED_CLASS(fair) = {
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.check_preempt_curr	= check_preempt_wakeup,
+	.check_preempt_curr	= check_preempt_wakeup_fair,
 
 	.pick_next_task		= __pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
-- 
cgit v1.2.3


From e23edc86b09df655bf8963bbcb16647adc787395 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 19 Sep 2023 10:38:21 +0200
Subject: sched/fair: Rename check_preempt_curr() to wakeup_preempt()

The name is a bit opaque - make it clear that this is about wakeup
preemption.

Also rename the ->check_preempt_curr() methods similarly.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c      | 14 +++++++-------
 kernel/sched/deadline.c  | 10 +++++-----
 kernel/sched/fair.c      | 10 +++++-----
 kernel/sched/idle.c      |  4 ++--
 kernel/sched/rt.c        |  6 +++---
 kernel/sched/sched.h     |  4 ++--
 kernel/sched/stop_task.c |  4 ++--
 7 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5a50c4e41be9..52ceb85b6421 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2211,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->sched_class == rq->curr->sched_class)
-		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+		rq->curr->sched_class->wakeup_preempt(rq, p, flags);
 	else if (sched_class_above(p->sched_class, rq->curr->sched_class))
 		resched_curr(rq);
 
@@ -2508,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 	rq_lock(rq, rf);
 	WARN_ON_ONCE(task_cpu(p) != new_cpu);
 	activate_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
+	wakeup_preempt(rq, p, 0);
 
 	return rq;
 }
@@ -3390,7 +3390,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 		deactivate_task(src_rq, p, 0);
 		set_task_cpu(p, cpu);
 		activate_task(dst_rq, p, 0);
-		check_preempt_curr(dst_rq, p, 0);
+		wakeup_preempt(dst_rq, p, 0);
 
 		rq_unpin_lock(dst_rq, &drf);
 		rq_unpin_lock(src_rq, &srf);
@@ -3764,7 +3764,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	}
 
 	activate_task(rq, p, en_flags);
-	check_preempt_curr(rq, p, wake_flags);
+	wakeup_preempt(rq, p, wake_flags);
 
 	ttwu_do_wakeup(p);
 
@@ -3835,7 +3835,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
 			 * it should preempt the task that is current now.
 			 */
 			update_rq_clock(rq);
-			check_preempt_curr(rq, p, wake_flags);
+			wakeup_preempt(rq, p, wake_flags);
 		}
 		ttwu_do_wakeup(p);
 		ret = 1;
@@ -4854,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p)
 
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
 	trace_sched_wakeup_new(p);
-	check_preempt_curr(rq, p, WF_FORK);
+	wakeup_preempt(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58b542bf2893..fb1996a674db 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -763,7 +763,7 @@ static inline void deadline_queue_pull_task(struct rq *rq)
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
 
 static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
 					    struct rq *rq)
@@ -1175,7 +1175,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->curr))
-		check_preempt_curr_dl(rq, p, 0);
+		wakeup_preempt_dl(rq, p, 0);
 	else
 		resched_curr(rq);
 
@@ -1939,7 +1939,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
  * Only called when both the current and waking task are -deadline
  * tasks.
  */
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
 				  int flags)
 {
 	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
@@ -2652,7 +2652,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 			deadline_queue_push_tasks(rq);
 #endif
 		if (dl_task(rq->curr))
-			check_preempt_curr_dl(rq, p, 0);
+			wakeup_preempt_dl(rq, p, 0);
 		else
 			resched_curr(rq);
 	} else {
@@ -2721,7 +2721,7 @@ DEFINE_SCHED_CLASS(dl) = {
 	.dequeue_task		= dequeue_task_dl,
 	.yield_task		= yield_task_dl,
 
-	.check_preempt_curr	= check_preempt_curr_dl,
+	.wakeup_preempt		= wakeup_preempt_dl,
 
 	.pick_next_task		= pick_next_task_dl,
 	.put_prev_task		= put_prev_task_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aeaf31e32c67..fcf0c5bc8b47 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8007,7 +8007,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 
 	/*
 	 * This is possible from callers such as attach_tasks(), in which we
-	 * unconditionally check_preempt_curr() after an enqueue (which may have
+	 * unconditionally wakeup_preempt() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
 	 */
@@ -8914,7 +8914,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 
 	WARN_ON_ONCE(task_rq(p) != rq);
 	activate_task(rq, p, ENQUEUE_NOCLOCK);
-	check_preempt_curr(rq, p, 0);
+	wakeup_preempt(rq, p, 0);
 }
 
 /*
@@ -12369,7 +12369,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 		if (p->prio > oldprio)
 			resched_curr(rq);
 	} else
-		check_preempt_curr(rq, p, 0);
+		wakeup_preempt(rq, p, 0);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12471,7 +12471,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 		if (task_current(rq, p))
 			resched_curr(rq);
 		else
-			check_preempt_curr(rq, p, 0);
+			wakeup_preempt(rq, p, 0);
 	}
 }
 
@@ -12830,7 +12830,7 @@ DEFINE_SCHED_CLASS(fair) = {
 	.yield_task		= yield_task_fair,
 	.yield_to_task		= yield_to_task_fair,
 
-	.check_preempt_curr	= check_preempt_wakeup_fair,
+	.wakeup_preempt		= check_preempt_wakeup_fair,
 
 	.pick_next_task		= __pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 342f58a329f5..26f714003c1f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -400,7 +400,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	resched_curr(rq);
 }
@@ -481,7 +481,7 @@ DEFINE_SCHED_CLASS(idle) = {
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
 
-	.check_preempt_curr	= check_preempt_curr_idle,
+	.wakeup_preempt		= wakeup_preempt_idle,
 
 	.pick_next_task		= pick_next_task_idle,
 	.put_prev_task		= put_prev_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0597ba0f85ff..3e442fa3f6bc 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -953,7 +953,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 
 				/*
 				 * When we're idle and a woken (rt) task is
-				 * throttled check_preempt_curr() will set
+				 * throttled wakeup_preempt() will set
 				 * skip_update and the time between the wakeup
 				 * and this unthrottle will get accounted as
 				 * 'runtime'.
@@ -1715,7 +1715,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_curr(rq);
@@ -2702,7 +2702,7 @@ DEFINE_SCHED_CLASS(rt) = {
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
 
-	.check_preempt_curr	= check_preempt_curr_rt,
+	.wakeup_preempt		= wakeup_preempt_rt,
 
 	.pick_next_task		= pick_next_task_rt,
 	.put_prev_task		= put_prev_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5f217b1e8f1c..7e070dcf7074 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2236,7 +2236,7 @@ struct sched_class {
 	void (*yield_task)   (struct rq *rq);
 	bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
 
-	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+	void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
 
 	struct task_struct *(*pick_next_task)(struct rq *rq);
 
@@ -2510,7 +2510,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
 #ifdef CONFIG_PREEMPT_RT
 #define SCHED_NR_MIGRATE_BREAK 8
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 85590599b4d6..6cf7304e6449 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #endif /* CONFIG_SMP */
 
 static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
 {
 	/* we're never preempted */
 }
@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = {
 	.dequeue_task		= dequeue_task_stop,
 	.yield_task		= yield_task_stop,
 
-	.check_preempt_curr	= check_preempt_curr_stop,
+	.wakeup_preempt		= wakeup_preempt_stop,
 
 	.pick_next_task		= pick_next_task_stop,
 	.put_prev_task		= put_prev_task_stop,
-- 
cgit v1.2.3


From cff9b2332ab762b7e0586c793c431a8f2ea4db04 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Fri, 15 Sep 2023 13:44:44 -0400
Subject: kernel/sched: Modify initial boot task idle setup

Initial booting is setting the task flag to idle (PF_IDLE) by the call
path sched_init() -> init_idle().  Having the task idle and calling
call_rcu() in kernel/rcu/tiny.c means that TIF_NEED_RESCHED will be
set.  Subsequent calls to any cond_resched() will enable IRQs,
potentially earlier than the IRQ setup has completed.  Recent changes
have caused just this scenario and IRQs have been enabled early.

This causes a warning later in start_kernel() as interrupts are enabled
before they are fully set up.

Fix this issue by setting the PF_IDLE flag later in the boot sequence.

Although the boot task was marked as idle since (at least) d80e4fda576d,
I am not sure that it is wrong to do so.  The forced context-switch on
idle task was introduced in the tiny_rcu update, so I'm going to claim
this fixes 5f6130fa52ee.

Fixes: 5f6130fa52ee ("tiny_rcu: Directly force QS when call_rcu_[bh|sched]() on idle_task")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-mm/CAMuHMdWpvpWoDa=Ox-do92czYRvkok6_x6pYUH+ZouMcJbXy+Q@mail.gmail.com/
---
 kernel/sched/core.c | 2 +-
 kernel/sched/idle.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2299a5cfbfb9..802551e0009b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9269,7 +9269,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
 	 * PF_KTHREAD should already be set at this point; regardless, make it
 	 * look like a proper per-CPU kthread.
 	 */
-	idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+	idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
 	kthread_set_per_cpu(idle, cpu);
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 342f58a329f5..5007b25c5bc6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -373,6 +373,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise);
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
+	current->flags |= PF_IDLE;
 	arch_cpu_idle_prepare();
 	cpuhp_online_idle(state);
 	while (1)
-- 
cgit v1.2.3


From 7d3460632da2c2ad5c5708db82a0b72e2b66396c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Mon, 18 Sep 2023 17:52:32 +0200
Subject: bpf: Fix bpf_throw warning on 32-bit arch

On 32-bit architectures, the pointer width is 32-bit, while we try to
cast from a u64 down to it, the compiler complains on mismatch in
integer size. Fix this by first casting to long which should match
the pointer width on targets supported by Linux.

Fixes: ec5290a178b7 ("bpf: Prevent KASAN false positive with bpf_throw")
Reported-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Tested-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Link: https://lore.kernel.org/r/20230918155233.297024-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 7ff2a42f1996..dd1c69ee3375 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2488,7 +2488,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	 * deeper stack depths than ctx.sp as we do not return from bpf_throw,
 	 * which skips compiler generated instrumentation to do the same.
 	 */
-	kasan_unpoison_task_stack_below((void *)ctx.sp);
+	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
 	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
-- 
cgit v1.2.3


From aec42f36237b09e42eac39f6c74305aec02b4694 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 19 Sep 2023 02:25:20 -0700
Subject: bpf: Remove unused variables.

Remove unused prev_offset, min_size, krec_size variables.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202309190634.fL17FWoT-lkp@intel.com/
Fixes: aaa619ebccb2 ("bpf: Refactor check_btf_func and split into two phases")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7178ecf676d..38f8718f1602 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15339,14 +15339,12 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			  bpfptr_t uattr)
 {
 	const struct btf_type *type, *func_proto, *ret_type;
-	u32 i, nfuncs, urec_size, min_size;
-	u32 krec_size = sizeof(struct bpf_func_info);
+	u32 i, nfuncs, urec_size;
 	struct bpf_func_info *krecord;
 	struct bpf_func_info_aux *info_aux = NULL;
 	struct bpf_prog *prog;
 	const struct btf *btf;
 	bpfptr_t urecord;
-	u32 prev_offset = 0;
 	bool scalar_return;
 	int ret = -ENOMEM;
 
@@ -15367,7 +15365,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
 	btf = prog->aux->btf;
 
 	urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
-	min_size = min_t(u32, krec_size, urec_size);
 
 	krecord = prog->aux->func_info;
 	info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
@@ -15401,7 +15398,6 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			goto err_free;
 		}
 
-		prev_offset = krecord[i].insn_off;
 		bpfptr_add(&urecord, urec_size);
 	}
 
-- 
cgit v1.2.3


From a20d6f63dbfc176697886d7709312ad0a795648e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 3 Aug 2023 12:09:31 +0200
Subject: signal: Add a proper comment about preempt_disable() in ptrace_stop()

Commit 53da1d9456fe7 ("fix ptrace slowness") added a preempt-disable section
between read_unlock() and the following schedule() invocation without
explaining why it is needed.

Replace the existing contentless comment with a proper explanation to
clarify that it is not needed for correctness but for performance reasons.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20230803100932.325870-2-bigeasy@linutronix.de
---
 kernel/signal.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d669..3035bebd7075 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2329,10 +2329,22 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 		do_notify_parent_cldstop(current, false, why);
 
 	/*
-	 * Don't want to allow preemption here, because
-	 * sys_ptrace() needs this task to be inactive.
+	 * The previous do_notify_parent_cldstop() invocation woke ptracer.
+	 * One a PREEMPTION kernel this can result in preemption requirement
+	 * which will be fulfilled after read_unlock() and the ptracer will be
+	 * put on the CPU.
+	 * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+	 * this task wait in schedule(). If this task gets preempted then it
+	 * remains enqueued on the runqueue. The ptracer will observe this and
+	 * then sleep for a delay of one HZ tick. In the meantime this task
+	 * gets scheduled, enters schedule() and will wait for the ptracer.
 	 *
-	 * XXX: implement read_unlock_no_resched().
+	 * This preemption point is not bad from a correctness point of
+	 * view but extends the runtime by one HZ tick time due to the
+	 * ptracer's sleep.  The preempt-disable section ensures that there
+	 * will be no preemption between unlock and schedule() and so
+	 * improving the performance since the ptracer will observe that
+	 * the tracee is scheduled out once it gets on the CPU.
 	 */
 	preempt_disable();
 	read_unlock(&tasklist_lock);
-- 
cgit v1.2.3


From 1aabbc532413ced293952f8e149ad0a607d6e470 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 3 Aug 2023 12:09:32 +0200
Subject: signal: Don't disable preemption in ptrace_stop() on PREEMPT_RT

On PREEMPT_RT keeping preemption disabled during the invocation of
cgroup_enter_frozen() is a problem because the function acquires
css_set_lock which is a sleeping lock on PREEMPT_RT and must not be
acquired with disabled preemption.

The preempt-disabled section is only for performance optimisation reasons
and can be avoided.

Extend the comment and don't disable preemption before scheduling on
PREEMPT_RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20230803100932.325870-3-bigeasy@linutronix.de
---
 kernel/signal.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 3035bebd7075..f2a5578326ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2345,11 +2345,22 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 	 * will be no preemption between unlock and schedule() and so
 	 * improving the performance since the ptracer will observe that
 	 * the tracee is scheduled out once it gets on the CPU.
+	 *
+	 * On PREEMPT_RT locking tasklist_lock does not disable preemption.
+	 * Therefore the task can be preempted after do_notify_parent_cldstop()
+	 * before unlocking tasklist_lock so there is no benefit in doing this.
+	 *
+	 * In fact disabling preemption is harmful on PREEMPT_RT because
+	 * the spinlock_t in cgroup_enter_frozen() must not be acquired
+	 * with preemption disabled due to the 'sleeping' spinlock
+	 * substitution of RT.
 	 */
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	read_unlock(&tasklist_lock);
 	cgroup_enter_frozen();
-	preempt_enable_no_resched();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable_no_resched();
 	schedule();
 	cgroup_leave_frozen(true);
 
-- 
cgit v1.2.3


From 4653e5dd04cb869526477a76b87d0aa1a5c65101 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 11 Sep 2023 14:24:06 -0600
Subject: task_work: add kerneldoc annotation for 'data' argument

A previous commit changed the arguments to task_work_cancel_match(), but
didn't document all of them.

Link: https://lkml.kernel.org/r/93938bff-baa3-4091-85f5-784aae297a07@kernel.dk
Fixes: c7aab1a7c52b ("task_work: add helper for more targeted task_work canceling")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202309120307.zis3yQGe-lkp@intel.com/
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/task_work.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/task_work.c b/kernel/task_work.c
index 065e1ef8fc8d..95a7e1b7f1da 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
  * task_work_cancel_match - cancel a pending work added by task_work_add()
  * @task: the task which should execute the work
  * @match: match function to call
+ * @data: data to be passed in to match function
  *
  * RETURNS:
  * The found work or NULL if not found.
-- 
cgit v1.2.3


From 0c7752d5b1ac62c8b926c907f34073ef7e9ad42b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 11 Sep 2023 23:08:22 -0700
Subject: pidfd: prevent a kernel-doc warning

Change the comment to match the function name that the SYSCALL_DEFINE()
macros generate to prevent a kernel-doc warning.

kernel/pid.c:628: warning: expecting prototype for pidfd_open(). Prototype was for sys_pidfd_open() instead

Link: https://lkml.kernel.org/r/20230912060822.2500-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index fee14a4486a3..6500ef956f2f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags)
 }
 
 /**
- * pidfd_open() - Open new pid file descriptor.
+ * sys_pidfd_open() - Open new pid file descriptor.
  *
  * @pid:   pid for which to retrieve a pidfd
  * @flags: flags to pass
-- 
cgit v1.2.3


From 28bc55f654de49f6122c7475b01b5d5ef4bdf0d4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Sep 2023 18:22:48 +0200
Subject: sched: Constrain locks in sched_submit_work()

Even though sched_submit_work() is ran from preemptible context,
it is discouraged to have it use blocking locks due to the recursion
potential.

Enforce this.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-2-bigeasy@linutronix.de
---
 kernel/sched/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2299a5cfbfb9..d55564097bd8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6720,11 +6720,18 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
+	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
 	unsigned int task_flags;
 
 	if (task_is_running(tsk))
 		return;
 
+	/*
+	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
+	 * will use a blocking primitive -- which would lead to recursion.
+	 */
+	lock_map_acquire_try(&sched_map);
+
 	task_flags = tsk->flags;
 	/*
 	 * If a worker goes to sleep, notify and ask workqueue whether it
@@ -6749,6 +6756,8 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * make sure to submit it to avoid deadlocks.
 	 */
 	blk_flush_plug(tsk->plug, true);
+
+	lock_map_release(&sched_map);
 }
 
 static void sched_update_worker(struct task_struct *tsk)
-- 
cgit v1.2.3


From af9f006393b53409be0ca83ae234bef840cdef4a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Sep 2023 18:22:49 +0200
Subject: locking/rtmutex: Avoid unconditional slowpath for DEBUG_RT_MUTEXES

With DEBUG_RT_MUTEXES enabled the fast-path rt_mutex_cmpxchg_acquire()
always fails and all lock operations take the slow path.

Provide a new helper inline rt_mutex_try_acquire() which maps to
rt_mutex_cmpxchg_acquire() in the non-debug case. For the debug case
it invokes rt_mutex_slowtrylock() which can acquire a non-contended
rtmutex under full debug coverage.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-3-bigeasy@linutronix.de
---
 kernel/locking/rtmutex.c     | 21 ++++++++++++++++++++-
 kernel/locking/ww_rt_mutex.c |  2 +-
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 21db0df0eb00..bcec0533a0cc 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
 	return try_cmpxchg_acquire(&lock->owner, &old, new);
 }
 
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+	return rt_mutex_cmpxchg_acquire(lock, NULL, current);
+}
+
 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 						     struct task_struct *old,
 						     struct task_struct *new)
@@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
 
 }
 
+static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
+
+static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
+{
+	/*
+	 * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
+	 *
+	 * Avoid unconditionally taking the slow path by using
+	 * rt_mutex_slow_trylock() which is covered by the debug code and can
+	 * acquire a non-contended rtmutex.
+	 */
+	return rt_mutex_slowtrylock(lock);
+}
+
 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
 						     struct task_struct *old,
 						     struct task_struct *new)
@@ -1755,7 +1774,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
 					   unsigned int state)
 {
-	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+	if (likely(rt_mutex_try_acquire(lock)))
 		return 0;
 
 	return rt_mutex_slowlock(lock, NULL, state);
diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c
index d1473c624105..c7196de838ed 100644
--- a/kernel/locking/ww_rt_mutex.c
+++ b/kernel/locking/ww_rt_mutex.c
@@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx,
 	}
 	mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip);
 
-	if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) {
+	if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) {
 		if (ww_ctx)
 			ww_mutex_set_context_fastpath(lock, ww_ctx);
 		return 0;
-- 
cgit v1.2.3


From de1474b46d889ee0367f6e71d9adfeb0711e4a8d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 8 Sep 2023 18:22:50 +0200
Subject: sched: Extract __schedule_loop()

There are currently two implementations of this basic __schedule()
loop, and there is soon to be a third.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-4-bigeasy@linutronix.de
---
 kernel/sched/core.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d55564097bd8..1ea7ba53aad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6770,16 +6770,21 @@ static void sched_update_worker(struct task_struct *tsk)
 	}
 }
 
-asmlinkage __visible void __sched schedule(void)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
 {
-	struct task_struct *tsk = current;
-
-	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(SM_NONE);
+		__schedule(sched_mode);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+	struct task_struct *tsk = current;
+
+	sched_submit_work(tsk);
+	__schedule_loop(SM_NONE);
 	sched_update_worker(tsk);
 }
 EXPORT_SYMBOL(schedule);
@@ -6843,11 +6848,7 @@ void __sched schedule_preempt_disabled(void)
 #ifdef CONFIG_PREEMPT_RT
 void __sched notrace schedule_rtlock(void)
 {
-	do {
-		preempt_disable();
-		__schedule(SM_RTLOCK_WAIT);
-		sched_preempt_enable_no_resched();
-	} while (need_resched());
+	__schedule_loop(SM_RTLOCK_WAIT);
 }
 NOKPROBE_SYMBOL(schedule_rtlock);
 #endif
-- 
cgit v1.2.3


From 6b596e62ed9f90c4a97e68ae1f7b1af5beeb3c05 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 Sep 2023 18:22:51 +0200
Subject: sched: Provide rt_mutex specific scheduler helpers

With PREEMPT_RT there is a rt_mutex recursion problem where
sched_submit_work() can use an rtlock (aka spinlock_t). More
specifically what happens is:

  mutex_lock() /* really rt_mutex */
    ...
      __rt_mutex_slowlock_locked()
	task_blocks_on_rt_mutex()
          // enqueue current task as waiter
          // do PI chain walk
        rt_mutex_slowlock_block()
          schedule()
            sched_submit_work()
              ...
              spin_lock() /* really rtlock */
                ...
                  __rt_mutex_slowlock_locked()
                    task_blocks_on_rt_mutex()
                      // enqueue current task as waiter *AGAIN*
                      // *CONFUSION*

Fix this by making rt_mutex do the sched_submit_work() early, before
it enqueues itself as a waiter -- before it even knows *if* it will
wait.

[[ basically Thomas' patch but with different naming and a few asserts
   added ]]

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-5-bigeasy@linutronix.de
---
 include/linux/sched.h    |  3 +++
 include/linux/sched/rt.h |  4 ++++
 kernel/sched/core.c      | 36 ++++++++++++++++++++++++++++++++----
 3 files changed, 39 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..67623ffd4a8e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,9 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+	unsigned			sched_rt_mutex:1;
+#endif
 
 	/* Bit to tell LSMs we're in execve(): */
 	unsigned			in_execve:1;
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 994c25640e15..b2b9e6eb9683 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1ea7ba53aad2..58d0346d1bb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6723,9 +6723,6 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
 	unsigned int task_flags;
 
-	if (task_is_running(tsk))
-		return;
-
 	/*
 	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
 	 * will use a blocking primitive -- which would lead to recursion.
@@ -6783,7 +6780,12 @@ asmlinkage __visible void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 
-	sched_submit_work(tsk);
+#ifdef CONFIG_RT_MUTEXES
+	lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+	if (!task_is_running(tsk))
+		sched_submit_work(tsk);
 	__schedule_loop(SM_NONE);
 	sched_update_worker(tsk);
 }
@@ -7044,6 +7046,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
 
 #ifdef CONFIG_RT_MUTEXES
 
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+	sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+	lockdep_assert(current->sched_rt_mutex);
+	__schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+	sched_update_worker(current);
+	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
 {
 	if (pi_task)
-- 
cgit v1.2.3


From d14f9e930b9073de264c106bf04968286ef9b3a4 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Sep 2023 18:22:52 +0200
Subject: locking/rtmutex: Use rt_mutex specific scheduler helpers

Have rt_mutex use the rt_mutex specific scheduler helpers to avoid
recursion vs rtlock on the PI state.

[[ peterz: adapted to new names ]]

Reported-by: Crystal Wood <swood@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-6-bigeasy@linutronix.de
---
 kernel/futex/pi.c            | 11 +++++++++++
 kernel/locking/rtmutex.c     | 14 ++++++++++++--
 kernel/locking/rwbase_rt.c   |  6 ++++++
 kernel/locking/rwsem.c       |  8 +++++++-
 kernel/locking/spinlock_rt.c |  4 ++++
 5 files changed, 40 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index ce2889f12375..f8e65b27d9d6 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <linux/slab.h>
+#include <linux/sched/rt.h>
 #include <linux/sched/task.h>
 
 #include "futex.h"
@@ -1002,6 +1003,12 @@ retry_private:
 		goto no_block;
 	}
 
+	/*
+	 * Must be done before we enqueue the waiter, here is unfortunately
+	 * under the hb lock, but that *should* work because it does nothing.
+	 */
+	rt_mutex_pre_schedule();
+
 	rt_mutex_init_waiter(&rt_waiter);
 
 	/*
@@ -1052,6 +1059,10 @@ cleanup:
 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
 		ret = 0;
 
+	/*
+	 * Waiter is unqueued.
+	 */
+	rt_mutex_post_schedule();
 no_block:
 	/*
 	 * Fixup the pi_state owner and possibly acquire the lock if we
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index bcec0533a0cc..a3fe05dfd0d8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1632,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			rt_mutex_schedule();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@@ -1661,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		rt_mutex_schedule();
 	}
 }
 
@@ -1756,6 +1756,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	unsigned long flags;
 	int ret;
 
+	/*
+	 * Do all pre-schedule work here, before we queue a waiter and invoke
+	 * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+	 * otherwise recurse back into task_blocks_on_rt_mutex() through
+	 * rtlock_slowlock() and will then enqueue a second waiter for this
+	 * same task and things get really confusing real fast.
+	 */
+	rt_mutex_pre_schedule();
+
 	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
@@ -1767,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+	rt_mutex_post_schedule();
 
 	return ret;
 }
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 25ec0239477c..c7258cb32d91 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	int ret;
 
+	rwbase_pre_schedule();
 	raw_spin_lock_irq(&rtm->wait_lock);
 
 	/*
@@ -125,6 +126,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 		rwbase_rtmutex_unlock(rtm);
 
 	trace_contention_end(rwb, ret);
+	rwbase_post_schedule();
 	return ret;
 }
 
@@ -237,6 +239,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 	/* Force readers into slow path */
 	atomic_sub(READER_BIAS, &rwb->readers);
 
+	rwbase_pre_schedule();
+
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
 	if (__rwbase_write_trylock(rwb))
 		goto out_unlock;
@@ -248,6 +252,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 		if (rwbase_signal_pending_state(state, current)) {
 			rwbase_restore_current_state();
 			__rwbase_write_unlock(rwb, 0, flags);
+			rwbase_post_schedule();
 			trace_contention_end(rwb, -EINTR);
 			return -EINTR;
 		}
@@ -266,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+	rwbase_post_schedule();
 	return 0;
 }
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9eabd585ce7a..2340b6d90ec6 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_signal_pending_state(state, current)	\
 	signal_pending_state(state, current)
 
+#define rwbase_pre_schedule()				\
+	rt_mutex_pre_schedule()
+
 #define rwbase_schedule()				\
-	schedule()
+	rt_mutex_schedule()
+
+#define rwbase_post_schedule()				\
+	rt_mutex_post_schedule()
 
 #include "rwbase_rt.c"
 
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 48a19ed8486d..842037b2ba54 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -184,9 +184,13 @@ static __always_inline int  rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
 
 #define rwbase_signal_pending_state(state, current)	(0)
 
+#define rwbase_pre_schedule()
+
 #define rwbase_schedule()				\
 	schedule_rtlock()
 
+#define rwbase_post_schedule()
+
 #include "rwbase_rt.c"
 /*
  * The common functions which get wrapped into the rwlock API.
-- 
cgit v1.2.3


From 45f67f30a22f264bc7a0a61255c2ee1a838e9403 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 8 Sep 2023 18:22:53 +0200
Subject: locking/rtmutex: Add a lockdep assert to catch potential nested
 blocking

There used to be a BUG_ON(current->pi_blocked_on) in the lock acquisition
functions, but that vanished in one of the rtmutex overhauls.

Bring it back in form of a lockdep assert to catch code paths which take
rtmutex based locks with current::pi_blocked_on != NULL.

Reported-by: Crystal Wood <swood@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-7-bigeasy@linutronix.de
---
 kernel/locking/rtmutex.c     | 2 ++
 kernel/locking/rwbase_rt.c   | 2 ++
 kernel/locking/spinlock_rt.c | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a3fe05dfd0d8..4a10e8c16fd2 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1784,6 +1784,8 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
 					   unsigned int state)
 {
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (likely(rt_mutex_try_acquire(lock)))
 		return 0;
 
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index c7258cb32d91..34a59569db6b 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -133,6 +133,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 842037b2ba54..38e292454fcc 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -37,6 +37,8 @@
 
 static __always_inline void rtlock_lock(struct rt_mutex_base *rtm)
 {
+	lockdep_assert(!current->pi_blocked_on);
+
 	if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current)))
 		rtlock_slowlock(rtm);
 }
-- 
cgit v1.2.3


From fbeb558b0dd0d6348e0872bbbbe96e30c65867b7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Sep 2023 17:19:44 +0200
Subject: futex/pi: Fix recursive rt_mutex waiter state

Some new assertions pointed out that the existing code has nested rt_mutex wait
state in the futex code.

Specifically, the futex_lock_pi() cancel case uses spin_lock() while there
still is a rt_waiter enqueued for this task, resulting in a state where there
are two waiters for the same task (and task_struct::pi_blocked_on gets
scrambled).

The reason to take hb->lock at this point is to avoid the wake_futex_pi()
EAGAIN case.

This happens when futex_top_waiter() and rt_mutex_top_waiter() state becomes
inconsistent. The current rules are such that this inconsistency will not be
observed.

Notably the case that needs to be avoided is where futex_lock_pi() and
futex_unlock_pi() interleave such that unlock will fail to observe a new
waiter.

*However* the case at hand is where a waiter is leaving, in this case the race
means a waiter that is going away is not observed -- which is harmless,
provided this race is explicitly handled.

This is a somewhat dangerous proposition because the converse race is not
observing a new waiter, which must absolutely not happen. But since the race is
valid this cannot be asserted.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20230915151943.GD6743@noisy.programming.kicks-ass.net
---
 kernel/futex/pi.c      | 76 +++++++++++++++++++++++++++++++-------------------
 kernel/futex/requeue.c |  6 ++--
 2 files changed, 52 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index f8e65b27d9d6..d636a1bbd7d0 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -611,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 /*
  * Caller must hold a reference on @pi_state.
  */
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval,
+			 struct futex_pi_state *pi_state,
+			 struct rt_mutex_waiter *top_waiter)
 {
-	struct rt_mutex_waiter *top_waiter;
 	struct task_struct *new_owner;
 	bool postunlock = false;
 	DEFINE_RT_WAKE_Q(wqh);
 	u32 curval, newval;
 	int ret = 0;
 
-	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
-	if (WARN_ON_ONCE(!top_waiter)) {
-		/*
-		 * As per the comment in futex_unlock_pi() this should not happen.
-		 *
-		 * When this happens, give up our locks and try again, giving
-		 * the futex_lock_pi() instance time to complete, either by
-		 * waiting on the rtmutex or removing itself from the futex
-		 * queue.
-		 */
-		ret = -EAGAIN;
-		goto out_unlock;
-	}
-
 	new_owner = top_waiter->task;
 
 	/*
@@ -1046,19 +1033,33 @@ retry_private:
 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
 
 cleanup:
-	spin_lock(q.lock_ptr);
 	/*
 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
-	 * first acquire the hb->lock before removing the lock from the
-	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
-	 * lists consistent.
+	 * must unwind the above, however we canont lock hb->lock because
+	 * rt_mutex already has a waiter enqueued and hb->lock can itself try
+	 * and enqueue an rt_waiter through rtlock.
+	 *
+	 * Doing the cleanup without holding hb->lock can cause inconsistent
+	 * state between hb and pi_state, but only in the direction of not
+	 * seeing a waiter that is leaving.
+	 *
+	 * See futex_unlock_pi(), it deals with this inconsistency.
 	 *
-	 * In particular; it is important that futex_unlock_pi() can not
-	 * observe this inconsistency.
+	 * There be dragons here, since we must deal with the inconsistency on
+	 * the way out (here), it is impossible to detect/warn about the race
+	 * the other way around (missing an incoming waiter).
+	 *
+	 * What could possibly go wrong...
 	 */
 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
 		ret = 0;
 
+	/*
+	 * Now that the rt_waiter has been dequeued, it is safe to use
+	 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
+	 * the
+	 */
+	spin_lock(q.lock_ptr);
 	/*
 	 * Waiter is unqueued.
 	 */
@@ -1143,6 +1144,7 @@ retry:
 	top_waiter = futex_top_waiter(hb, &key);
 	if (top_waiter) {
 		struct futex_pi_state *pi_state = top_waiter->pi_state;
+		struct rt_mutex_waiter *rt_waiter;
 
 		ret = -EINVAL;
 		if (!pi_state)
@@ -1155,22 +1157,39 @@ retry:
 		if (pi_state->owner != current)
 			goto out_unlock;
 
-		get_pi_state(pi_state);
 		/*
 		 * By taking wait_lock while still holding hb->lock, we ensure
-		 * there is no point where we hold neither; and therefore
-		 * wake_futex_p() must observe a state consistent with what we
-		 * observed.
+		 * there is no point where we hold neither; and thereby
+		 * wake_futex_pi() must observe any new waiters.
+		 *
+		 * Since the cleanup: case in futex_lock_pi() removes the
+		 * rt_waiter without holding hb->lock, it is possible for
+		 * wake_futex_pi() to not find a waiter while the above does,
+		 * in this case the waiter is on the way out and it can be
+		 * ignored.
 		 *
 		 * In particular; this forces __rt_mutex_start_proxy() to
 		 * complete such that we're guaranteed to observe the
-		 * rt_waiter. Also see the WARN in wake_futex_pi().
+		 * rt_waiter.
 		 */
 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+		/*
+		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
+		 * waiters even though futex thinks there are, then the waiter
+		 * is leaving and the uncontended path is safe to take.
+		 */
+		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+		if (!rt_waiter) {
+			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+			goto do_uncontended;
+		}
+
+		get_pi_state(pi_state);
 		spin_unlock(&hb->lock);
 
 		/* drops pi_state->pi_mutex.wait_lock */
-		ret = wake_futex_pi(uaddr, uval, pi_state);
+		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
 
 		put_pi_state(pi_state);
 
@@ -1198,6 +1217,7 @@ retry:
 		return ret;
 	}
 
+do_uncontended:
 	/*
 	 * We have no kernel internal state, i.e. no waiters in the
 	 * kernel. Waiters which are about to queue themselves are stuck
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index cba8b1a6a4cc..4c73e0b81acc 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -850,11 +850,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		pi_mutex = &q.pi_state->pi_mutex;
 		ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
 
-		/* Current is not longer pi_blocked_on */
-		spin_lock(q.lock_ptr);
+		/*
+		 * See futex_unlock_pi()'s cleanup: comment.
+		 */
 		if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
 			ret = 0;
 
+		spin_lock(q.lock_ptr);
 		debug_rt_mutex_free_waiter(&rt_waiter);
 		/*
 		 * Fixup the pi_state owner and possibly acquire the lock if we
-- 
cgit v1.2.3


From 67e18e132f0fd738f8c8cac3aa1420312073f795 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Thu, 14 Sep 2023 15:26:44 +0800
Subject: livepatch: Fix missing newline character in klp_resolve_symbols()

Without the newline character, the log may not be printed immediately
after the error occurs.

Fixes: ca376a937486 ("livepatch: Prevent module-specific KLP rela sections from referencing vmlinux symbols")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230914072644.4098857-1-zhengyejian1@huawei.com
---
 kernel/livepatch/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 61328328c474..ecbc9b6aba3a 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -243,7 +243,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
 		 * symbols are exported and normal relas can be used instead.
 		 */
 		if (!sec_vmlinux && sym_vmlinux) {
-			pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section",
+			pr_err("invalid access to vmlinux symbol '%s' from module-specific livepatch relocation section\n",
 			       sym_name);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From 81335f90e8a88b81932df011105c46e708744f44 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 18 Sep 2023 14:01:10 -0700
Subject: bpf: unconditionally reset backtrack_state masks on global func exit

In mark_chain_precision() logic, when we reach the entry to a global
func, it is expected that R1-R5 might be still requested to be marked
precise. This would correspond to some integer input arguments being
tracked as precise. This is all expected and handled as a special case.

What's not expected is that we'll leave backtrack_state structure with
some register bits set. This is because for subsequent precision
propagations backtrack_state is reused without clearing masks, as all
code paths are carefully written in a way to leave empty backtrack_state
with zeroed out masks, for speed.

The fix is trivial, we always clear register bit in the register mask, and
then, optionally, set reg->precise if register is SCALAR_VALUE type.

Reported-by: Chris Mason <clm@meta.com>
Fixes: be2ef8161572 ("bpf: allow precision tracking for programs with subprogs")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230918210110.2241458-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb78212fa5b2..c0c7d137066a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4047,11 +4047,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				bitmap_from_u64(mask, bt_reg_mask(bt));
 				for_each_set_bit(i, mask, 32) {
 					reg = &st->frame[0]->regs[i];
-					if (reg->type != SCALAR_VALUE) {
-						bt_clear_reg(bt, i);
-						continue;
-					}
-					reg->precise = true;
+					bt_clear_reg(bt, i);
+					if (reg->type == SCALAR_VALUE)
+						reg->precise = true;
 				}
 				return 0;
 			}
-- 
cgit v1.2.3


From 17e7170645e34c519443ba63895264bbdee7beee Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 20 Sep 2023 15:00:24 +0200
Subject: sched/debug: Remove the /proc/sys/kernel/sched_child_runs_first
 sysctl

The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since:

   5e963f2bd4654 ("sched/fair: Commit to EEVDF")

Remove it.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de
---
 kernel/sched/debug.c |  1 -
 kernel/sched/fair.c  | 13 -------------
 kernel/sched/sched.h |  2 --
 3 files changed, 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6..132dfd1e6f47 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -864,7 +864,6 @@ static void sched_debug_header(struct seq_file *m)
 #define PN(x) \
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_base_slice);
-	P(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
 #undef PN
 #undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fcf0c5bc8b47..75720008fdd2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 unsigned int sysctl_sched_base_slice			= 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
 
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
 int sched_thermal_decay_shift;
@@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_fair_sysctls[] = {
-	{
-		.procname       = "sched_child_runs_first",
-		.data           = &sysctl_sched_child_runs_first,
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = proc_dointvec,
-	},
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.procname       = "sched_cfs_bandwidth_slice_us",
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7e070dcf7074..9260120ed2a5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -100,8 +100,6 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
-extern unsigned int sysctl_sched_child_runs_first;
-
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
-- 
cgit v1.2.3


From 622f0a1d544fa88dda10d27727835e825c84ae0f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 20 Sep 2023 15:00:25 +0200
Subject: sched/debug: Update stale reference to sched_debug.c

Since commit:

   8a99b6833c884 ("sched: Move SCHED_DEBUG sysctl to debugfs")

The sched_debug interface moved from /proc to debugfs. The comment
mentions still the outdated proc interfaces.

Update the comment, point to the current location of the interface.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230920130025.412071-3-bigeasy@linutronix.de
---
 kernel/sched/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 132dfd1e6f47..5e34a8cb2c76 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,7 +8,7 @@
  */
 
 /*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
  * to the console
  */
 #define SEQ_printf(m, x...)			\
-- 
cgit v1.2.3


From 98a0465531a5982dd897fd81222a5d3465999951 Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 20 Sep 2023 17:58:38 +0206
Subject: printk: fix illegal pbufs access for !CONFIG_PRINTK

When CONFIG_PRINTK is not set, PRINTK_MESSAGE_MAX is 0. This
leads to a zero-sized array @outbuf in @printk_shared_pbufs. In
console_flush_all() a pointer to the first element of the array
is assigned with:

   char *outbuf = &printk_shared_pbufs.outbuf[0];

For !CONFIG_PRINTK this leads to a compiler warning:

   warning: array subscript 0 is outside array bounds of
   'char[0]' [-Warray-bounds]

This is not really dangerous because printk_get_next_message()
always returns false for !CONFIG_PRINTK, which leads to @outbuf
never being used. However, it makes no sense to even compile
these functions for !CONFIG_PRINTK.

Extend the existing '#ifdef CONFIG_PRINTK' block to contain
the formatting and emitting functions since these have no
purpose in !CONFIG_PRINTK. This also allows removing several
more !CONFIG_PRINTK dummies as well as moving
@suppress_panic_printk into a CONFIG_PRINTK block.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202309201724.M9BMAQIh-lkp@intel.com/
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20230920155238.670439-1-john.ogness@linutronix.de
---
 kernel/printk/printk.c | 44 ++++++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 778359b21761..7f40d9122caa 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -102,12 +102,6 @@ DEFINE_STATIC_SRCU(console_srcu);
  */
 int __read_mostly suppress_printk;
 
-/*
- * During panic, heavy printk by other CPUs can delay the
- * panic and risk deadlock on console resources.
- */
-static int __read_mostly suppress_panic_printk;
-
 #ifdef CONFIG_LOCKDEP
 static struct lockdep_map console_lock_dep_map = {
 	.name = "console_lock"
@@ -445,6 +439,12 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 static DEFINE_MUTEX(syslog_lock);
 
 #ifdef CONFIG_PRINTK
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+static int __read_mostly suppress_panic_printk;
+
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -2346,22 +2346,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 
 static u64 syslog_seq;
 
-static size_t record_print_text(const struct printk_record *r,
-				bool syslog, bool time)
-{
-	return 0;
-}
-static ssize_t info_print_ext_header(char *buf, size_t size,
-				     struct printk_info *info)
-{
-	return 0;
-}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
-				  char *text, size_t text_len,
-				  struct dev_printk_info *dev_info) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
 static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
 
@@ -2715,6 +2699,8 @@ static void __console_unlock(void)
 	up_console_sem();
 }
 
+#ifdef CONFIG_PRINTK
+
 /*
  * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This
  * is achieved by shifting the existing message over and inserting the dropped
@@ -2729,7 +2715,6 @@ static void __console_unlock(void)
  *
  * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
  */
-#ifdef CONFIG_PRINTK
 void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
 {
 	struct printk_buffers *pbufs = pmsg->pbufs;
@@ -2761,9 +2746,6 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
 	memcpy(outbuf, scratchbuf, len);
 	pmsg->outbuf_len += len;
 }
-#else
-#define console_prepend_dropped(pmsg, dropped)
-#endif /* CONFIG_PRINTK */
 
 /*
  * Read and format the specified record (or a later record if the specified
@@ -2921,6 +2903,16 @@ skip:
 	return true;
 }
 
+#else
+
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+	*handover = false;
+	return false;
+}
+
+#endif /* CONFIG_PRINTK */
+
 /*
  * Print out all remaining records to all consoles.
  *
-- 
cgit v1.2.3


From 4923954bbc4a760e0b2210e0cb5733726ac2e2e9 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:06 +0200
Subject: futex: Clarify FUTEX2 flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sys_futex_waitv() is part of the futex2 series (the first and only so
far) of syscalls and has a flags field per futex (as opposed to flags
being encoded in the futex op).

This new flags field has a new namespace, which unfortunately isn't
super explicit. Notably it currently takes FUTEX_32 and
FUTEX_PRIVATE_FLAG.

Introduce the FUTEX2 namespace to clarify this

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20230921105247.507327749@noisy.programming.kicks-ass.net
---
 include/uapi/linux/futex.h | 16 +++++++++++++---
 kernel/futex/syscalls.c    |  7 +++----
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 71a5df8d2689..21d4eff41162 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -44,10 +44,20 @@
 					 FUTEX_PRIVATE_FLAG)
 
 /*
- * Flags to specify the bit length of the futex word for futex2 syscalls.
- * Currently, only 32 is supported.
+ * Flags for futex2 syscalls.
  */
-#define FUTEX_32		2
+			/*	0x00 */
+			/*	0x01 */
+#define FUTEX2_SIZE_U32		0x02
+			/*	0x04 */
+			/*	0x08 */
+			/*	0x10 */
+			/*	0x20 */
+			/*	0x40 */
+#define FUTEX2_PRIVATE		FUTEX_PRIVATE_FLAG
+
+/* do not use */
+#define FUTEX_32		FUTEX2_SIZE_U32 /* historical accident :-( */
 
 /*
  * Max numbers of elements in a futex_waitv array
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index a8074079b09e..ff696b0e2e5c 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -183,8 +183,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
-/* Mask of available flags for each futex in futex_waitv list */
-#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE)
 
 /**
  * futex_parse_waitv - Parse a waitv array from userspace
@@ -205,10 +204,10 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
 			return -EFAULT;
 
-		if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
+		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
 			return -EINVAL;
 
-		if (!(aux.flags & FUTEX_32))
+		if (!(aux.flags & FUTEX2_SIZE_U32))
 			return -EINVAL;
 
 		futexv[i].w.flags = aux.flags;
-- 
cgit v1.2.3


From d6d08d24790e82c69a46ef78ae44fe1b1ed30775 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:07 +0200
Subject: futex: Extend the FUTEX2 flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the definition for the missing but always intended extra sizes,
and add a NUMA flag for the planned numa extention.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20230921105247.617057368@noisy.programming.kicks-ass.net
---
 include/uapi/linux/futex.h | 21 ++++++++++++++++++---
 kernel/futex/syscalls.c    |  9 +++++++--
 2 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 21d4eff41162..d2ee625ea189 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -45,17 +45,32 @@
 
 /*
  * Flags for futex2 syscalls.
+ *
+ * NOTE: these are not pure flags, they can also be seen as:
+ *
+ *   union {
+ *     u32  flags;
+ *     struct {
+ *       u32 size    : 2,
+ *           numa    : 1,
+ *                   : 4,
+ *           private : 1;
+ *     };
+ *   };
  */
-			/*	0x00 */
-			/*	0x01 */
+#define FUTEX2_SIZE_U8		0x00
+#define FUTEX2_SIZE_U16		0x01
 #define FUTEX2_SIZE_U32		0x02
-			/*	0x04 */
+#define FUTEX2_SIZE_U64		0x03
+#define FUTEX2_NUMA		0x04
 			/*	0x08 */
 			/*	0x10 */
 			/*	0x20 */
 			/*	0x40 */
 #define FUTEX2_PRIVATE		FUTEX_PRIVATE_FLAG
 
+#define FUTEX2_SIZE_MASK	0x03
+
 /* do not use */
 #define FUTEX_32		FUTEX2_SIZE_U32 /* historical accident :-( */
 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index ff696b0e2e5c..953f0a49de3a 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -183,7 +183,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
-#define FUTEX2_VALID_MASK (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE)
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
 
 /**
  * futex_parse_waitv - Parse a waitv array from userspace
@@ -207,7 +207,12 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
 			return -EINVAL;
 
-		if (!(aux.flags & FUTEX2_SIZE_U32))
+		if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+			if ((aux.flags & FUTEX2_SIZE_MASK) == FUTEX2_SIZE_U64)
+				return -EINVAL;
+		}
+
+		if ((aux.flags & FUTEX2_SIZE_MASK) != FUTEX2_SIZE_U32)
 			return -EINVAL;
 
 		futexv[i].w.flags = aux.flags;
-- 
cgit v1.2.3


From 5694289ce183bc3336407a78c8c722a0b9208f9b Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:08 +0200
Subject: futex: Flag conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Futex has 3 sets of flags:

 - legacy futex op bits
 - futex2 flags
 - internal flags

Add a few helpers to convert from the API flags into the internal
flags.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20230921105247.722140574@noisy.programming.kicks-ass.net
---
 kernel/futex/futex.h    | 63 ++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/futex/syscalls.c | 24 +++++++------------
 kernel/futex/waitwake.c |  4 ++--
 3 files changed, 71 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index b5379c0e6d6d..68fc052dc09b 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -5,6 +5,7 @@
 #include <linux/futex.h>
 #include <linux/rtmutex.h>
 #include <linux/sched/wake_q.h>
+#include <linux/compat.h>
 
 #ifdef CONFIG_PREEMPT_RT
 #include <linux/rcuwait.h>
@@ -16,8 +17,15 @@
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
+#define FLAGS_SIZE_8		0x00
+#define FLAGS_SIZE_16		0x01
+#define FLAGS_SIZE_32		0x02
+#define FLAGS_SIZE_64		0x03
+
+#define FLAGS_SIZE_MASK		0x03
+
 #ifdef CONFIG_MMU
-# define FLAGS_SHARED		0x01
+# define FLAGS_SHARED		0x10
 #else
 /*
  * NOMMU does not have per process address space. Let the compiler optimize
@@ -25,8 +33,57 @@
  */
 # define FLAGS_SHARED		0x00
 #endif
-#define FLAGS_CLOCKRT		0x02
-#define FLAGS_HAS_TIMEOUT	0x04
+#define FLAGS_CLOCKRT		0x20
+#define FLAGS_HAS_TIMEOUT	0x40
+#define FLAGS_NUMA		0x80
+
+/* FUTEX_ to FLAGS_ */
+static inline unsigned int futex_to_flags(unsigned int op)
+{
+	unsigned int flags = FLAGS_SIZE_32;
+
+	if (!(op & FUTEX_PRIVATE_FLAG))
+		flags |= FLAGS_SHARED;
+
+	if (op & FUTEX_CLOCK_REALTIME)
+		flags |= FLAGS_CLOCKRT;
+
+	return flags;
+}
+
+/* FUTEX2_ to FLAGS_ */
+static inline unsigned int futex2_to_flags(unsigned int flags2)
+{
+	unsigned int flags = flags2 & FUTEX2_SIZE_MASK;
+
+	if (!(flags2 & FUTEX2_PRIVATE))
+		flags |= FLAGS_SHARED;
+
+	if (flags2 & FUTEX2_NUMA)
+		flags |= FLAGS_NUMA;
+
+	return flags;
+}
+
+static inline unsigned int futex_size(unsigned int flags)
+{
+	return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
+static inline bool futex_flags_valid(unsigned int flags)
+{
+	/* Only 64bit futexes for 64bit code */
+	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
+		if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64)
+			return false;
+	}
+
+	/* Only 32bit futexes are implemented -- for now */
+	if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
+		return false;
+
+	return true;
+}
 
 #ifdef CONFIG_FAIL_FUTEX
 extern bool should_fail_futex(bool fshared);
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 953f0a49de3a..948ac247c1c6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#include <linux/compat.h>
 #include <linux/syscalls.h>
 #include <linux/time_namespace.h>
 
@@ -85,15 +84,12 @@ err_unlock:
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
+	unsigned int flags = futex_to_flags(op);
 	int cmd = op & FUTEX_CMD_MASK;
-	unsigned int flags = 0;
 
-	if (!(op & FUTEX_PRIVATE_FLAG))
-		flags |= FLAGS_SHARED;
-
-	if (op & FUTEX_CLOCK_REALTIME) {
-		flags |= FLAGS_CLOCKRT;
-		if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
+	if (flags & FLAGS_CLOCKRT) {
+		if (cmd != FUTEX_WAIT_BITSET &&
+		    cmd != FUTEX_WAIT_REQUEUE_PI &&
 		    cmd != FUTEX_LOCK_PI2)
 			return -ENOSYS;
 	}
@@ -201,21 +197,19 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 	unsigned int i;
 
 	for (i = 0; i < nr_futexes; i++) {
+		unsigned int flags;
+
 		if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
 			return -EFAULT;
 
 		if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
 			return -EINVAL;
 
-		if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) {
-			if ((aux.flags & FUTEX2_SIZE_MASK) == FUTEX2_SIZE_U64)
-				return -EINVAL;
-		}
-
-		if ((aux.flags & FUTEX2_SIZE_MASK) != FUTEX2_SIZE_U32)
+		flags = futex2_to_flags(aux.flags);
+		if (!futex_flags_valid(flags))
 			return -EINVAL;
 
-		futexv[i].w.flags = aux.flags;
+		futexv[i].w.flags = flags;
 		futexv[i].w.val = aux.val;
 		futexv[i].w.uaddr = aux.uaddr;
 		futexv[i].q = futex_q_init;
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ba01b9408203..fa9757766103 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -419,11 +419,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo
 	 */
 retry:
 	for (i = 0; i < count; i++) {
-		if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
+		if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
 			continue;
 
 		ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
-				    !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
+				    vs[i].w.flags & FLAGS_SHARED,
 				    &vs[i].q.key, FUTEX_READ);
 
 		if (unlikely(ret))
-- 
cgit v1.2.3


From 698eb826383616ce0e817d2384da6413d1439fb6 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:09 +0200
Subject: futex: Validate futex value against futex size

Ensure the futex value fits in the given futex size. Since this adds a
constraint to an existing syscall, it might possibly change behaviour.

Currently the value would be truncated to a u32 and any high bits
would get silently lost.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230921105247.828934099@noisy.programming.kicks-ass.net
---
 kernel/futex/futex.h    | 10 ++++++++++
 kernel/futex/syscalls.c |  3 +++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 68fc052dc09b..a3f1fceafcbe 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -85,6 +85,16 @@ static inline bool futex_flags_valid(unsigned int flags)
 	return true;
 }
 
+static inline bool futex_validate_input(unsigned int flags, u64 val)
+{
+	int bits = 8 * futex_size(flags);
+
+	if (bits < 64 && (val >> bits))
+		return false;
+
+	return true;
+}
+
 #ifdef CONFIG_FAIL_FUTEX
 extern bool should_fail_futex(bool fshared);
 #else
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 948ac247c1c6..2339f9ccee7f 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -209,6 +209,9 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 		if (!futex_flags_valid(flags))
 			return -EINVAL;
 
+		if (!futex_validate_input(flags, aux.val))
+			return -EINVAL;
+
 		futexv[i].w.flags = flags;
 		futexv[i].w.val = aux.val;
 		futexv[i].w.uaddr = aux.uaddr;
-- 
cgit v1.2.3


From 9f6c532f59b20580acf8ede9409c9b8dce6e74e1 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:10 +0200
Subject: futex: Add sys_futex_wake()

To complement sys_futex_waitv() add sys_futex_wake(). This syscall
implements what was previously known as FUTEX_WAKE_BITSET except it
uses 'unsigned long' for the bitmask and takes FUTEX2 flags.

The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105247.936205525@noisy.programming.kicks-ass.net
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd.h             |  2 +-
 arch/arm64/include/asm/unistd32.h           |  2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 include/linux/syscalls.h                    |  3 +++
 include/uapi/asm-generic/unistd.h           |  4 +++-
 kernel/futex/syscalls.c                     | 30 +++++++++++++++++++++++++++++
 kernel/sys_ni.c                             |  1 +
 22 files changed, 56 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ad37569d0507..3b86519d68e4 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -492,3 +492,4 @@
 560	common	set_mempolicy_home_node		sys_ni_syscall
 561	common	cachestat			sys_cachestat
 562	common	fchmodat2			sys_fchmodat2
+563	common	futex_wake			sys_futex_wake
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index c572d6c3dee0..714abeb1e6fa 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -466,3 +466,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index bd77253b62e0..63a8a9c4abc1 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		453
+#define __NR_compat_syscalls		455
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 78b68311ec81..68974683737b 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -911,6 +911,8 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
 __SYSCALL(__NR_cachestat, sys_cachestat)
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_futex_wake 454
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 83d8609aec03..cd50247508e6 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -373,3 +373,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 259ceb125367..21eb35c693e1 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -452,3 +452,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3798c2637fd..3a4e8513a8e1 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -458,3 +458,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 152034b8e0a0..6883ea3b830d 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -391,3 +391,4 @@
 450	n32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n32	cachestat			sys_cachestat
 452	n32	fchmodat2			sys_fchmodat2
+454	n32	futex_wake			sys_futex_wake
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cb5e757f6621..48bc0fb4e3dc 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -367,3 +367,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	n64	cachestat			sys_cachestat
 452	n64	fchmodat2			sys_fchmodat2
+454	n64	futex_wake			sys_futex_wake
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1a646813afdc..a92625f5bad8 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -440,3 +440,4 @@
 450	o32	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	o32	cachestat			sys_cachestat
 452	o32	fchmodat2			sys_fchmodat2
+454	o32	futex_wake			sys_futex_wake
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index e97c175b56f9..57faa9786ffe 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -451,3 +451,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 20e50586e8a2..e6c6ed6b30ee 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -539,3 +539,4 @@
 450 	nospu	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0122cc156952..754720154dc1 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -455,3 +455,4 @@
 450  common	set_mempolicy_home_node	sys_set_mempolicy_home_node	sys_set_mempolicy_home_node
 451  common	cachestat		sys_cachestat			sys_cachestat
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
+454  common	futex_wake		sys_futex_wake			sys_futex_wake
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index e90d585c4d3e..902a997e7ec6 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -455,3 +455,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4ed06c71c43f..8a1f887c8be6 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -498,3 +498,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..9e81323979b0 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -457,3 +457,4 @@
 450	i386	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
+454	i386	futex_wake		sys_futex_wake
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..d10a6003a7c9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -375,6 +375,7 @@
 451	common	cachestat		sys_cachestat
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
+454	common	futex_wake		sys_futex_wake
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index fc1a4f3c81d9..4e511bfd4b8f 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -423,3 +423,4 @@
 450	common	set_mempolicy_home_node		sys_set_mempolicy_home_node
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
+454	common	futex_wake			sys_futex_wake
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..e174ed86da1d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -549,6 +549,9 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 				unsigned int nr_futexes, unsigned int flags,
 				struct __kernel_timespec __user *timeout, clockid_t clockid);
+
+asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..f5454e6f4c6f 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -822,9 +822,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 
 #define __NR_fchmodat2 452
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
+#define __NR_futex_wake 454
+__SYSCALL(__NR_futex_wake, sys_futex_wake)
 
 #undef __NR_syscalls
-#define __NR_syscalls 453
+#define __NR_syscalls 455
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 2339f9ccee7f..7049a52ef68e 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -306,6 +306,36 @@ destroy_timer:
 	return ret;
 }
 
+/*
+ * sys_futex_wake - Wake a number of futexes
+ * @uaddr:	Address of the futex(es) to wake
+ * @mask:	bitmask
+ * @nr:		Number of the futexes to wake
+ * @flags:	FUTEX2 flags
+ *
+ * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_wake,
+		void __user *, uaddr,
+		unsigned long, mask,
+		int, nr,
+		unsigned int, flags)
+{
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(flags, mask))
+		return -EINVAL;
+
+	return futex_wake(uaddr, flags, nr, mask);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..983c0583c627 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ COND_SYSCALL_COMPAT(set_robust_list);
 COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
+COND_SYSCALL(futex_wake);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
-- 
cgit v1.2.3


From 43adf844951084c266f172561f84c5f8120dd60b Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:11 +0200
Subject: futex: FLAGS_STRICT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current semantics for futex_wake() are a bit loose, specifically
asking for 0 futexes to be woken actually gets you 1.

Adding a !nr check to sys_futex_wake() makes that it would return 0
for unaligned futex words, because that check comes in the shared
futex_wake() function. Adding the !nr check there, would affect the
legacy sys_futex() semantics.

Hence frob a flag :-(

Suggested-by: André Almeida <andrealmeid@igalia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230921105248.048643656@noisy.programming.kicks-ass.net
---
 kernel/futex/futex.h    | 21 +++++++++++----------
 kernel/futex/syscalls.c |  2 +-
 kernel/futex/waitwake.c |  3 +++
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index a3f1fceafcbe..0e7821a944a2 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -17,25 +17,26 @@
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
-#define FLAGS_SIZE_8		0x00
-#define FLAGS_SIZE_16		0x01
-#define FLAGS_SIZE_32		0x02
-#define FLAGS_SIZE_64		0x03
+#define FLAGS_SIZE_8		0x0000
+#define FLAGS_SIZE_16		0x0001
+#define FLAGS_SIZE_32		0x0002
+#define FLAGS_SIZE_64		0x0003
 
-#define FLAGS_SIZE_MASK		0x03
+#define FLAGS_SIZE_MASK		0x0003
 
 #ifdef CONFIG_MMU
-# define FLAGS_SHARED		0x10
+# define FLAGS_SHARED		0x0010
 #else
 /*
  * NOMMU does not have per process address space. Let the compiler optimize
  * code away.
  */
-# define FLAGS_SHARED		0x00
+# define FLAGS_SHARED		0x0000
 #endif
-#define FLAGS_CLOCKRT		0x20
-#define FLAGS_HAS_TIMEOUT	0x40
-#define FLAGS_NUMA		0x80
+#define FLAGS_CLOCKRT		0x0020
+#define FLAGS_HAS_TIMEOUT	0x0040
+#define FLAGS_NUMA		0x0080
+#define FLAGS_STRICT		0x0100
 
 /* FUTEX_ to FLAGS_ */
 static inline unsigned int futex_to_flags(unsigned int op)
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 7049a52ef68e..47398926765e 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -333,7 +333,7 @@ SYSCALL_DEFINE4(futex_wake,
 	if (!futex_validate_input(flags, mask))
 		return -EINVAL;
 
-	return futex_wake(uaddr, flags, nr, mask);
+	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index fa9757766103..ceb05b876597 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -155,6 +155,9 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	if (unlikely(ret != 0))
 		return ret;
 
+	if ((flags & FLAGS_STRICT) && !nr_wake)
+		return 0;
+
 	hb = futex_hash(&key);
 
 	/* Make sure we really have tasks to wakeup */
-- 
cgit v1.2.3


From cb8c4312afca1b2dc64107e7e7cea81911055612 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:12 +0200
Subject: futex: Add sys_futex_wait()

To complement sys_futex_waitv()/wake(), add sys_futex_wait(). This
syscall implements what was previously known as FUTEX_WAIT_BITSET
except it uses 'unsigned long' for the value and bitmask arguments,
takes timespec and clockid_t arguments for the absolute timeout and
uses FUTEX2 flags.

The 'unsigned long' allows FUTEX2_SIZE_U64 on 64bit platforms.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105248.164324363@noisy.programming.kicks-ass.net
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   2 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 include/linux/syscalls.h                    |   4 +
 include/uapi/asm-generic/unistd.h           |   4 +-
 kernel/futex/futex.h                        |   3 +
 kernel/futex/syscalls.c                     | 120 +++++++++++++++++++++-------
 kernel/futex/waitwake.c                     |  61 ++++++++------
 kernel/sys_ni.c                             |   1 +
 24 files changed, 156 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 3b86519d68e4..c49f12fd264e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -493,3 +493,4 @@
 561	common	cachestat			sys_cachestat
 562	common	fchmodat2			sys_fchmodat2
 563	common	futex_wake			sys_futex_wake
+564	common	futex_wait			sys_futex_wait
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 714abeb1e6fa..a6cf56277327 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -467,3 +467,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 63a8a9c4abc1..f33190f17ebb 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		455
+#define __NR_compat_syscalls		456
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 68974683737b..6e7d37282ba1 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -913,6 +913,8 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_futex_wake 454
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
+#define __NR_futex_wait 455
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index cd50247508e6..4043f0c55170 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -374,3 +374,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 21eb35c693e1..24841674acc5 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 3a4e8513a8e1..f03927ab0220 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -459,3 +459,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 6883ea3b830d..dbb5edfb667b 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -392,3 +392,4 @@
 451	n32	cachestat			sys_cachestat
 452	n32	fchmodat2			sys_fchmodat2
 454	n32	futex_wake			sys_futex_wake
+455	n32	futex_wait			sys_futex_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 48bc0fb4e3dc..faff8dfd2983 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -368,3 +368,4 @@
 451	n64	cachestat			sys_cachestat
 452	n64	fchmodat2			sys_fchmodat2
 454	n64	futex_wake			sys_futex_wake
+455	n64	futex_wait			sys_futex_wait
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index a92625f5bad8..542f75605b3e 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -441,3 +441,4 @@
 451	o32	cachestat			sys_cachestat
 452	o32	fchmodat2			sys_fchmodat2
 454	o32	futex_wake			sys_futex_wake
+455	o32	futex_wait			sys_futex_wait
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 57faa9786ffe..8e50e89551f7 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -452,3 +452,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index e6c6ed6b30ee..ad33a9993a6a 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -540,3 +540,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 754720154dc1..418853fd2a6b 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
 451  common	cachestat		sys_cachestat			sys_cachestat
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
 454  common	futex_wake		sys_futex_wake			sys_futex_wake
+455  common	futex_wait		sys_futex_wait			sys_futex_wait
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 902a997e7ec6..8ef9557d2779 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -456,3 +456,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 8a1f887c8be6..df59a9d5f109 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -499,3 +499,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9e81323979b0..0f6616822bd5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -458,3 +458,4 @@
 451	i386	cachestat		sys_cachestat
 452	i386	fchmodat2		sys_fchmodat2
 454	i386	futex_wake		sys_futex_wake
+455	i386	futex_wait		sys_futex_wait
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index d10a6003a7c9..ddf6288823ad 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -376,6 +376,7 @@
 452	common	fchmodat2		sys_fchmodat2
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	futex_wake		sys_futex_wake
+455	common	futex_wait		sys_futex_wait
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 4e511bfd4b8f..ac278dbce2ee 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -424,3 +424,4 @@
 451	common	cachestat			sys_cachestat
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
+455	common	futex_wait			sys_futex_wait
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e174ed86da1d..11f3fdd1ee03 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -552,6 +552,10 @@ asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
 
 asmlinkage long sys_futex_wake(void __user *uaddr, unsigned long mask, int nr, unsigned int flags);
 
+asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned long mask,
+			       unsigned int flags, struct __kernel_timespec __user *timespec,
+			       clockid_t clockid);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f5454e6f4c6f..f6553bd5d213 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -824,9 +824,11 @@ __SYSCALL(__NR_cachestat, sys_cachestat)
 __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 #define __NR_futex_wake 454
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
+#define __NR_futex_wait 455
+__SYSCALL(__NR_futex_wait, sys_futex_wait)
 
 #undef __NR_syscalls
-#define __NR_syscalls 455
+#define __NR_syscalls 456
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 0e7821a944a2..e74888a7d71d 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -332,6 +332,9 @@ extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
 			 u32 *cmpval, int requeue_pi);
 
+extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+			struct hrtimer_sleeper *to, u32 bitset);
+
 extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 		      ktime_t *abs_time, u32 bitset);
 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 47398926765e..e4c8ec713787 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -221,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 	return 0;
 }
 
+static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
+				clockid_t clockid, struct hrtimer_sleeper *to)
+{
+	int flag_clkid = 0, flag_init = 0;
+	struct timespec64 ts;
+	ktime_t time;
+	int ret;
+
+	if (!timeout)
+		return 0;
+
+	if (clockid == CLOCK_REALTIME) {
+		flag_clkid = FLAGS_CLOCKRT;
+		flag_init = FUTEX_CLOCK_REALTIME;
+	}
+
+	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
+		return -EINVAL;
+
+	if (get_timespec64(&ts, timeout))
+		return -EFAULT;
+
+	/*
+	 * Since there's no opcode for futex_waitv, use
+	 * FUTEX_WAIT_BITSET that uses absolute timeout as well
+	 */
+	ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
+	if (ret)
+		return ret;
+
+	futex_setup_timer(&time, to, flag_clkid, 0);
+	return 0;
+}
+
+static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
+{
+	hrtimer_cancel(&to->timer);
+	destroy_hrtimer_on_stack(&to->timer);
+}
+
 /**
  * sys_futex_waitv - Wait on a list of futexes
  * @waiters:    List of futexes to wait on
@@ -250,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 {
 	struct hrtimer_sleeper to;
 	struct futex_vector *futexv;
-	struct timespec64 ts;
-	ktime_t time;
 	int ret;
 
 	/* This syscall supports no flags for now */
@@ -261,30 +299,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
 		return -EINVAL;
 
-	if (timeout) {
-		int flag_clkid = 0, flag_init = 0;
-
-		if (clockid == CLOCK_REALTIME) {
-			flag_clkid = FLAGS_CLOCKRT;
-			flag_init = FUTEX_CLOCK_REALTIME;
-		}
-
-		if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
-			return -EINVAL;
-
-		if (get_timespec64(&ts, timeout))
-			return -EFAULT;
-
-		/*
-		 * Since there's no opcode for futex_waitv, use
-		 * FUTEX_WAIT_BITSET that uses absolute timeout as well
-		 */
-		ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
-		if (ret)
-			return ret;
-
-		futex_setup_timer(&time, &to, flag_clkid, 0);
-	}
+	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+		return ret;
 
 	futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
 	if (!futexv) {
@@ -299,10 +315,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 	kfree(futexv);
 
 destroy_timer:
-	if (timeout) {
-		hrtimer_cancel(&to.timer);
-		destroy_hrtimer_on_stack(&to.timer);
-	}
+	if (timeout)
+		futex2_destroy_timeout(&to);
 	return ret;
 }
 
@@ -336,6 +350,52 @@ SYSCALL_DEFINE4(futex_wake,
 	return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
 }
 
+/*
+ * sys_futex_wait - Wait on a futex
+ * @uaddr:	Address of the futex to wait on
+ * @val:	Value of @uaddr
+ * @mask:	bitmask
+ * @flags:	FUTEX2 flags
+ * @timeout:	Optional absolute timeout
+ * @clockid:	Clock to be used for the timeout, realtime or monotonic
+ *
+ * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
+ * futex2 familiy of calls.
+ */
+
+SYSCALL_DEFINE6(futex_wait,
+		void __user *, uaddr,
+		unsigned long, val,
+		unsigned long, mask,
+		unsigned int, flags,
+		struct __kernel_timespec __user *, timeout,
+		clockid_t, clockid)
+{
+	struct hrtimer_sleeper to;
+	int ret;
+
+	if (flags & ~FUTEX2_VALID_MASK)
+		return -EINVAL;
+
+	flags = futex2_to_flags(flags);
+	if (!futex_flags_valid(flags))
+		return -EINVAL;
+
+	if (!futex_validate_input(flags, val) ||
+	    !futex_validate_input(flags, mask))
+		return -EINVAL;
+
+	if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
+		return ret;
+
+	ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
+
+	if (timeout)
+		futex2_destroy_timeout(&to);
+
+	return ret;
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index ceb05b876597..b109a0810a2c 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -632,20 +632,18 @@ retry_private:
 	return ret;
 }
 
-int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+		 struct hrtimer_sleeper *to, u32 bitset)
 {
-	struct hrtimer_sleeper timeout, *to;
-	struct restart_block *restart;
-	struct futex_hash_bucket *hb;
 	struct futex_q q = futex_q_init;
+	struct futex_hash_bucket *hb;
 	int ret;
 
 	if (!bitset)
 		return -EINVAL;
+
 	q.bitset = bitset;
 
-	to = futex_setup_timer(abs_time, &timeout, flags,
-			       current->timer_slack_ns);
 retry:
 	/*
 	 * Prepare to wait on uaddr. On success, it holds hb->lock and q
@@ -653,18 +651,17 @@ retry:
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
-		goto out;
+		return ret;
 
 	/* futex_queue and wait for wakeup, timeout, or a signal. */
 	futex_wait_queue(hb, &q, to);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
-	ret = 0;
 	if (!futex_unqueue(&q))
-		goto out;
-	ret = -ETIMEDOUT;
+		return 0;
+
 	if (to && !to->task)
-		goto out;
+		return -ETIMEDOUT;
 
 	/*
 	 * We expect signal_pending(current), but we might be the
@@ -673,24 +670,38 @@ retry:
 	if (!signal_pending(current))
 		goto retry;
 
-	ret = -ERESTARTSYS;
-	if (!abs_time)
-		goto out;
+	return -ERESTARTSYS;
+}
 
-	restart = &current->restart_block;
-	restart->futex.uaddr = uaddr;
-	restart->futex.val = val;
-	restart->futex.time = *abs_time;
-	restart->futex.bitset = bitset;
-	restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
+{
+	struct hrtimer_sleeper timeout, *to;
+	struct restart_block *restart;
+	int ret;
+
+	to = futex_setup_timer(abs_time, &timeout, flags,
+			       current->timer_slack_ns);
+
+	ret = __futex_wait(uaddr, flags, val, to, bitset);
+
+	/* No timeout, nothing to clean up. */
+	if (!to)
+		return ret;
+
+	hrtimer_cancel(&to->timer);
+	destroy_hrtimer_on_stack(&to->timer);
 
-	ret = set_restart_fn(restart, futex_wait_restart);
+	if (ret == -ERESTARTSYS) {
+		restart = &current->restart_block;
+		restart->futex.uaddr = uaddr;
+		restart->futex.val = val;
+		restart->futex.time = *abs_time;
+		restart->futex.bitset = bitset;
+		restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
 
-out:
-	if (to) {
-		hrtimer_cancel(&to->timer);
-		destroy_hrtimer_on_stack(&to->timer);
+		return set_restart_fn(restart, futex_wait_restart);
 	}
+
 	return ret;
 }
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 983c0583c627..13df391194e2 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -88,6 +88,7 @@ COND_SYSCALL(get_robust_list);
 COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
 COND_SYSCALL(futex_wake);
+COND_SYSCALL(futex_wait);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
-- 
cgit v1.2.3


From 3b63a55f498b763aba0886b244df613587a73c46 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:13 +0200
Subject: futex: Propagate flags into get_futex_key()

Instead of only passing FLAGS_SHARED as a boolean, pass down flags as
a whole.

No functional change intended.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230921105248.282857501@noisy.programming.kicks-ass.net
---
 kernel/futex/core.c     |  7 +++++--
 kernel/futex/futex.h    |  2 +-
 kernel/futex/pi.c       |  4 ++--
 kernel/futex/requeue.c  |  6 +++---
 kernel/futex/waitwake.c | 14 +++++++-------
 5 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index d1d7b3c175a4..ade7c731972d 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -193,7 +193,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
+ * @flags:	FLAGS_*
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -217,7 +217,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
 		  enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
@@ -226,6 +226,9 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
 	struct folio *folio;
 	struct address_space *mapping;
 	int err, ro = 0;
+	bool fshared;
+
+	fshared = flags & FLAGS_SHARED;
 
 	/*
 	 * The futex address must be "naturally" aligned.
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index e74888a7d71d..a8ea5ef52424 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -184,7 +184,7 @@ enum futex_access {
 	FUTEX_WRITE
 };
 
-extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
 			 enum futex_access rw);
 
 extern struct hrtimer_sleeper *
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index d636a1bbd7d0..90e5197f4e56 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -933,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 	to = futex_setup_timer(time, &timeout, flags, 0);
 
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1129,7 +1129,7 @@ retry:
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;
 
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;
 
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 4c73e0b81acc..5bf69581a937 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -424,10 +424,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	}
 
 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+	ret = get_futex_key(uaddr2, flags, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
@@ -789,7 +789,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);
 
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index b109a0810a2c..37860f794bf7 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -145,13 +145,13 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
-	int ret;
 	DEFINE_WAKE_Q(wake_q);
+	int ret;
 
 	if (!bitset)
 		return -EINVAL;
 
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -248,10 +248,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);
 
 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		return ret;
 
@@ -426,7 +426,7 @@ retry:
 			continue;
 
 		ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
-				    vs[i].w.flags & FLAGS_SHARED,
+				    vs[i].w.flags,
 				    &vs[i].q.key, FUTEX_READ);
 
 		if (unlikely(ret))
@@ -438,7 +438,7 @@ retry:
 	for (i = 0; i < count; i++) {
 		u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
 		struct futex_q *q = &vs[i].q;
-		u32 val = (u32)vs[i].w.val;
+		u32 val = vs[i].w.val;
 
 		hb = futex_q_lock(q);
 		ret = futex_get_value_locked(&uval, uaddr);
@@ -602,7 +602,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
 
-- 
cgit v1.2.3


From 27b88f3519e72d71c8cead6b835a26c171109c9b Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:14 +0200
Subject: futex: Add flags2 argument to futex_requeue()

In order to support mixed size requeue, add a second flags argument to
the internal futex_requeue() function.

No functional change intended.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230921105248.396780136@noisy.programming.kicks-ass.net
---
 kernel/futex/futex.h    |  5 +++--
 kernel/futex/requeue.c  | 12 +++++++-----
 kernel/futex/syscalls.c |  6 +++---
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index a8ea5ef52424..a06030a1a27b 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -328,8 +328,9 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
 				 val, ktime_t *abs_time, u32 bitset, u32 __user
 				 *uaddr2);
 
-extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
-			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+			 u32 __user *uaddr2, unsigned int flags2,
+			 int nr_wake, int nr_requeue,
 			 u32 *cmpval, int requeue_pi);
 
 extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index 5bf69581a937..a0a79954f506 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -346,8 +346,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 /**
  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
  * @uaddr1:	source futex user address
- * @flags:	futex flags (FLAGS_SHARED, etc.)
+ * @flags1:	futex flags (FLAGS_SHARED, etc.)
  * @uaddr2:	target futex user address
+ * @flags2:	futex flags (FLAGS_SHARED, etc.)
  * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
  * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
  * @cmpval:	@uaddr1 expected value (or %NULL)
@@ -361,7 +362,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
  *  - >=0 - on success, the number of tasks requeued or woken;
  *  -  <0 - on error
  */
-int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
+		  u32 __user *uaddr2, unsigned int flags2,
 		  int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -424,10 +426,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	}
 
 retry:
-	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
-	ret = get_futex_key(uaddr2, flags, &key2,
+	ret = get_futex_key(uaddr2, flags2, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;
@@ -459,7 +461,7 @@ retry_private:
 			if (ret)
 				return ret;
 
-			if (!(flags & FLAGS_SHARED))
+			if (!(flags1 & FLAGS_SHARED))
 				goto retry_private;
 
 			goto retry;
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index e4c8ec713787..dde9b74db9af 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -106,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	case FUTEX_WAKE_BITSET:
 		return futex_wake(uaddr, flags, val, val3);
 	case FUTEX_REQUEUE:
-		return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
+		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
 	case FUTEX_CMP_REQUEUE:
-		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
+		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
 	case FUTEX_WAKE_OP:
 		return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
 	case FUTEX_LOCK_PI:
@@ -125,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
 					     uaddr2);
 	case FUTEX_CMP_REQUEUE_PI:
-		return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+		return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
 	}
 	return -ENOSYS;
 }
-- 
cgit v1.2.3


From 0f4b5f972216782a4acb1ae00dcb55173847c2ff Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Thu, 21 Sep 2023 12:45:15 +0200
Subject: futex: Add sys_futex_requeue()

Finish off the 'simple' futex2 syscall group by adding
sys_futex_requeue(). Unlike sys_futex_{wait,wake}() its arguments are
too numerous to fit into a regular syscall. As such, use struct
futex_waitv to pass the 'source' and 'destination' futexes to the
syscall.

This syscall implements what was previously known as FUTEX_CMP_REQUEUE
and uses {val, uaddr, flags} for source and {uaddr, flags} for
destination.

This design explicitly allows requeueing between different types of
futex by having a different flags word per uaddr.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20230921105248.511860556@noisy.programming.kicks-ass.net
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd.h             |  2 +-
 arch/arm64/include/asm/unistd32.h           |  2 ++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 include/linux/syscalls.h                    |  3 +++
 include/uapi/asm-generic/unistd.h           |  4 ++-
 kernel/futex/syscalls.c                     | 38 +++++++++++++++++++++++++++++
 kernel/sys_ni.c                             |  1 +
 22 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index c49f12fd264e..b1865f9bb31e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -494,3 +494,4 @@
 562	common	fchmodat2			sys_fchmodat2
 563	common	futex_wake			sys_futex_wake
 564	common	futex_wait			sys_futex_wait
+565	common	futex_requeue			sys_futex_requeue
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index a6cf56277327..93d0d46cbb15 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -468,3 +468,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index f33190f17ebb..531effca5f1f 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		456
+#define __NR_compat_syscalls		457
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6e7d37282ba1..c453291154fd 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -915,6 +915,8 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
 #define __NR_futex_wait 455
 __SYSCALL(__NR_futex_wait, sys_futex_wait)
+#define __NR_futex_requeue 456
+__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 4043f0c55170..81375ea78288 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -375,3 +375,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 24841674acc5..f7f997a88bab 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -454,3 +454,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index f03927ab0220..2967ec26b978 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -460,3 +460,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index dbb5edfb667b..383abb1713f4 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -393,3 +393,4 @@
 452	n32	fchmodat2			sys_fchmodat2
 454	n32	futex_wake			sys_futex_wake
 455	n32	futex_wait			sys_futex_wait
+456	n32	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index faff8dfd2983..c9bd09ba905f 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -369,3 +369,4 @@
 452	n64	fchmodat2			sys_fchmodat2
 454	n64	futex_wake			sys_futex_wake
 455	n64	futex_wait			sys_futex_wait
+456	n64	futex_requeue			sys_futex_requeue
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 542f75605b3e..ba5ef6cea97a 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -442,3 +442,4 @@
 452	o32	fchmodat2			sys_fchmodat2
 454	o32	futex_wake			sys_futex_wake
 455	o32	futex_wait			sys_futex_wait
+456	o32	futex_requeue			sys_futex_requeue
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 8e50e89551f7..9f0f6df55361 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -453,3 +453,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ad33a9993a6a..26fc41904266 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -541,3 +541,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 418853fd2a6b..31be90b241f7 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -457,3 +457,4 @@
 452  common	fchmodat2		sys_fchmodat2			sys_fchmodat2
 454  common	futex_wake		sys_futex_wake			sys_futex_wake
 455  common	futex_wait		sys_futex_wait			sys_futex_wait
+456  common	futex_requeue		sys_futex_requeue			sys_futex_requeue
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 8ef9557d2779..4bc5d488ab17 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -457,3 +457,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index df59a9d5f109..8404c8e50394 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -500,3 +500,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 0f6616822bd5..31c48bc2c3d8 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -459,3 +459,4 @@
 452	i386	fchmodat2		sys_fchmodat2
 454	i386	futex_wake		sys_futex_wake
 455	i386	futex_wait		sys_futex_wait
+456	i386	futex_requeue		sys_futex_requeue
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ddf6288823ad..a577bb27c16d 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -377,6 +377,7 @@
 453	64	map_shadow_stack	sys_map_shadow_stack
 454	common	futex_wake		sys_futex_wake
 455	common	futex_wait		sys_futex_wait
+456	common	futex_requeue		sys_futex_requeue
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index ac278dbce2ee..dd71ecce8b86 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -425,3 +425,4 @@
 452	common	fchmodat2			sys_fchmodat2
 454	common	futex_wake			sys_futex_wake
 455	common	futex_wait			sys_futex_wait
+456	common	futex_requeue			sys_futex_requeue
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 11f3fdd1ee03..0901af60d971 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -556,6 +556,9 @@ asmlinkage long sys_futex_wait(void __user *uaddr, unsigned long val, unsigned l
 			       unsigned int flags, struct __kernel_timespec __user *timespec,
 			       clockid_t clockid);
 
+asmlinkage long sys_futex_requeue(struct futex_waitv __user *waiters,
+				  unsigned int flags, int nr_wake, int nr_requeue);
+
 asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
 			      struct __kernel_timespec __user *rmtp);
 asmlinkage long sys_nanosleep_time32(struct old_timespec32 __user *rqtp,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index f6553bd5d213..d9e9cd13e577 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -826,9 +826,11 @@ __SYSCALL(__NR_fchmodat2, sys_fchmodat2)
 __SYSCALL(__NR_futex_wake, sys_futex_wake)
 #define __NR_futex_wait 455
 __SYSCALL(__NR_futex_wait, sys_futex_wait)
+#define __NR_futex_requeue 456
+__SYSCALL(__NR_futex_requeue, sys_futex_requeue)
 
 #undef __NR_syscalls
-#define __NR_syscalls 456
+#define __NR_syscalls 457
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index dde9b74db9af..8200d86d30e1 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -396,6 +396,44 @@ SYSCALL_DEFINE6(futex_wait,
 	return ret;
 }
 
+/*
+ * sys_futex_requeue - Requeue a waiter from one futex to another
+ * @waiters:	array describing the source and destination futex
+ * @flags:	unused
+ * @nr_wake:	number of futexes to wake
+ * @nr_requeue:	number of futexes to requeue
+ *
+ * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
+ * futex2 family of calls.
+ */
+
+SYSCALL_DEFINE4(futex_requeue,
+		struct futex_waitv __user *, waiters,
+		unsigned int, flags,
+		int, nr_wake,
+		int, nr_requeue)
+{
+	struct futex_vector futexes[2];
+	u32 cmpval;
+	int ret;
+
+	if (flags)
+		return -EINVAL;
+
+	if (!waiters)
+		return -EINVAL;
+
+	ret = futex_parse_waitv(futexes, waiters, 2);
+	if (ret)
+		return ret;
+
+	cmpval = futexes[0].w.val;
+
+	return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
+			     u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
+			     nr_wake, nr_requeue, &cmpval, 0);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		struct compat_robust_list_head __user *, head,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 13df391194e2..9db51ea373b0 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -89,6 +89,7 @@ COND_SYSCALL_COMPAT(get_robust_list);
 COND_SYSCALL(futex_waitv);
 COND_SYSCALL(futex_wake);
 COND_SYSCALL(futex_wait);
+COND_SYSCALL(futex_requeue);
 COND_SYSCALL(kexec_load);
 COND_SYSCALL_COMPAT(kexec_load);
 COND_SYSCALL(init_module);
-- 
cgit v1.2.3


From 9d900d4ea352069de8728f11fe4b20051d64cc20 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 11 Jul 2023 10:31:10 -0600
Subject: exit: abstract out should_wake helper for child_wait_callback()

Abstract out the helper that decides if we should wake up following
a wake_up() callback on our internal waitqueue.

No functional changes intended in this patch.

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/exit.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..2809dad69492 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1520,6 +1520,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	return 0;
 }
 
+static bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+	if (!eligible_pid(wo, p))
+		return false;
+
+	if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+		return false;
+
+	return true;
+}
+
 static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
 				int sync, void *key)
 {
@@ -1527,13 +1538,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
 						child_wait);
 	struct task_struct *p = key;
 
-	if (!eligible_pid(wo, p))
-		return 0;
+	if (pid_child_should_wake(wo, p))
+		return default_wake_function(wait, mode, sync, key);
 
-	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
-		return 0;
-
-	return default_wake_function(wait, mode, sync, key);
+	return 0;
 }
 
 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
-- 
cgit v1.2.3


From 06a101ca45b296fe951692620b0bc49abf90c368 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 11 Jul 2023 10:34:37 -0600
Subject: exit: move core of do_wait() into helper

Rather than have a maze of gotos, put the actual logic in __do_wait()
and have do_wait() loop deal with waitqueue setup/teardown and whether
to call __do_wait() again.

No functional changes intended in this patch.

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/exit.c | 51 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2809dad69492..c6fba9ecca27 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1590,16 +1590,10 @@ static int do_wait_pid(struct wait_opts *wo)
 	return 0;
 }
 
-static long do_wait(struct wait_opts *wo)
+static long __do_wait(struct wait_opts *wo)
 {
-	int retval;
-
-	trace_sched_process_wait(wo->wo_pid);
+	long retval;
 
-	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
-	wo->child_wait.private = current;
-	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
 	/*
 	 * If there is nothing that can match our criteria, just get out.
 	 * We will clear ->notask_error to zero if we see any child that
@@ -1611,24 +1605,23 @@ repeat:
 	   (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
 		goto notask;
 
-	set_current_state(TASK_INTERRUPTIBLE);
 	read_lock(&tasklist_lock);
 
 	if (wo->wo_type == PIDTYPE_PID) {
 		retval = do_wait_pid(wo);
 		if (retval)
-			goto end;
+			return retval;
 	} else {
 		struct task_struct *tsk = current;
 
 		do {
 			retval = do_wait_thread(wo, tsk);
 			if (retval)
-				goto end;
+				return retval;
 
 			retval = ptrace_do_wait(wo, tsk);
 			if (retval)
-				goto end;
+				return retval;
 
 			if (wo->wo_flags & __WNOTHREAD)
 				break;
@@ -1638,14 +1631,32 @@ repeat:
 
 notask:
 	retval = wo->notask_error;
-	if (!retval && !(wo->wo_flags & WNOHANG)) {
-		retval = -ERESTARTSYS;
-		if (!signal_pending(current)) {
-			schedule();
-			goto repeat;
-		}
-	}
-end:
+	if (!retval && !(wo->wo_flags & WNOHANG))
+		return -ERESTARTSYS;
+
+	return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+	int retval;
+
+	trace_sched_process_wait(wo->wo_pid);
+
+	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+	wo->child_wait.private = current;
+	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		retval = __do_wait(wo);
+		if (retval != -ERESTARTSYS)
+			break;
+		if (signal_pending(current))
+			break;
+		schedule();
+	} while (1);
+
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 	return retval;
-- 
cgit v1.2.3


From eda7e9d409ce16960d5ed28bedf8a33b2667a93c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 11 Jul 2023 10:38:23 -0600
Subject: exit: add kernel_waitid_prepare() helper

Move the setup logic out of kernel_waitid(), and into a separate helper.

No functional changes intended in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/exit.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c6fba9ecca27..817c22bd7ae0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1662,14 +1662,13 @@ static long do_wait(struct wait_opts *wo)
 	return retval;
 }
 
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
-			  int options, struct rusage *ru)
+static int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+				 struct waitid_info *infop, int options,
+				 struct rusage *ru)
 {
-	struct wait_opts wo;
+	unsigned int f_flags = 0;
 	struct pid *pid = NULL;
 	enum pid_type type;
-	long ret;
-	unsigned int f_flags = 0;
 
 	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
 			__WNOTHREAD|__WCLONE|__WALL))
@@ -1712,19 +1711,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 		return -EINVAL;
 	}
 
-	wo.wo_type	= type;
-	wo.wo_pid	= pid;
-	wo.wo_flags	= options;
-	wo.wo_info	= infop;
-	wo.wo_rusage	= ru;
+	wo->wo_type	= type;
+	wo->wo_pid	= pid;
+	wo->wo_flags	= options;
+	wo->wo_info	= infop;
+	wo->wo_rusage	= ru;
 	if (f_flags & O_NONBLOCK)
-		wo.wo_flags |= WNOHANG;
+		wo->wo_flags |= WNOHANG;
+
+	return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+			  int options, struct rusage *ru)
+{
+	struct wait_opts wo;
+	long ret;
+
+	ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+	if (ret)
+		return ret;
 
 	ret = do_wait(&wo);
-	if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+	if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
 		ret = -EAGAIN;
 
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2e521a2064bf8b26cf178c0f7644a70ed1a512fa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 11 Jul 2023 10:40:31 -0600
Subject: exit: add internal include file with helpers

Move struct wait_opts and waitid_info into kernel/exit.h, and include
function declarations for the recently added helpers. Make them
non-static as well.

This is in preparation for adding a waitid operation through io_uring.
With the abtracted helpers, this is now possible.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/exit.c | 32 +++++++-------------------------
 kernel/exit.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 25 deletions(-)
 create mode 100644 kernel/exit.h

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 817c22bd7ae0..2b4a232f2f68 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,8 @@
 #include <asm/unistd.h>
 #include <asm/mmu_context.h>
 
+#include "exit.h"
+
 /*
  * The default value should be high enough to not crash a system that randomly
  * crashes its kernel from time to time, but low enough to at least not permit
@@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
 	return 0;
 }
 
-struct waitid_info {
-	pid_t pid;
-	uid_t uid;
-	int status;
-	int cause;
-};
-
-struct wait_opts {
-	enum pid_type		wo_type;
-	int			wo_flags;
-	struct pid		*wo_pid;
-
-	struct waitid_info	*wo_info;
-	int			wo_stat;
-	struct rusage		*wo_rusage;
-
-	wait_queue_entry_t		child_wait;
-	int			notask_error;
-};
-
 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
 	return	wo->wo_type == PIDTYPE_MAX ||
@@ -1520,7 +1502,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	return 0;
 }
 
-static bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
 		return false;
@@ -1590,7 +1572,7 @@ static int do_wait_pid(struct wait_opts *wo)
 	return 0;
 }
 
-static long __do_wait(struct wait_opts *wo)
+long __do_wait(struct wait_opts *wo)
 {
 	long retval;
 
@@ -1662,9 +1644,9 @@ static long do_wait(struct wait_opts *wo)
 	return retval;
 }
 
-static int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
-				 struct waitid_info *infop, int options,
-				 struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+			  struct waitid_info *infop, int options,
+			  struct rusage *ru)
 {
 	unsigned int f_flags = 0;
 	struct pid *pid = NULL;
diff --git a/kernel/exit.h b/kernel/exit.h
new file mode 100644
index 000000000000..278faa26a653
--- /dev/null
+++ b/kernel/exit.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef LINUX_WAITID_H
+#define LINUX_WAITID_H
+
+struct waitid_info {
+	pid_t pid;
+	uid_t uid;
+	int status;
+	int cause;
+};
+
+struct wait_opts {
+	enum pid_type		wo_type;
+	int			wo_flags;
+	struct pid		*wo_pid;
+
+	struct waitid_info	*wo_info;
+	int			wo_stat;
+	struct rusage		*wo_rusage;
+
+	wait_queue_entry_t		child_wait;
+	int			notask_error;
+};
+
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
+long __do_wait(struct wait_opts *wo);
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+			  struct waitid_info *infop, int options,
+			  struct rusage *ru);
+#endif
-- 
cgit v1.2.3


From f0c7183008b41e92fa676406d87f18773724b48b Mon Sep 17 00:00:00 2001
From: Brian Geffon <bgeffon@google.com>
Date: Thu, 21 Sep 2023 13:00:45 -0400
Subject: PM: hibernate: Use __get_safe_page() rather than touching the list

We found at least one situation where the safe pages list was empty and
get_buffer() would gladly try to use a NULL pointer.

Signed-off-by: Brian Geffon <bgeffon@google.com>
Fixes: 8357376d3df2 ("[PATCH] swsusp: Improve handling of highmem")
Cc: All applicable <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 87e9f7e2bdc0..acd2e49b9ac8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2545,8 +2545,9 @@ static void *get_highmem_page_buffer(struct page *page,
 		pbe->copy_page = tmp;
 	} else {
 		/* Copy of the page will be stored in normal memory */
-		kaddr = safe_pages_list;
-		safe_pages_list = safe_pages_list->next;
+		kaddr = __get_safe_page(ca->gfp_mask);
+		if (!kaddr)
+			return ERR_PTR(-ENOMEM);
 		pbe->copy_page = virt_to_page(kaddr);
 	}
 	pbe->next = highmem_pblist;
@@ -2750,8 +2751,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 		return ERR_PTR(-ENOMEM);
 	}
 	pbe->orig_address = page_address(page);
-	pbe->address = safe_pages_list;
-	safe_pages_list = safe_pages_list->next;
+	pbe->address = __get_safe_page(ca->gfp_mask);
+	if (!pbe->address)
+		return ERR_PTR(-ENOMEM);
 	pbe->next = restore_pblist;
 	restore_pblist = pbe;
 	return pbe->address;
-- 
cgit v1.2.3


From 577c06af8188d1f6919ef7b62fc1b78fb1b86eb7 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Tue, 19 Sep 2023 12:09:03 +0200
Subject: bpf: Disable zero-extension for BPF_MEMSX

On the architectures that use bpf_jit_needs_zext(), e.g., s390x, the
verifier incorrectly inserts a zero-extension after BPF_MEMSX, leading
to miscompilations like the one below:

      24:       89 1a ff fe 00 00 00 00 "r1 = *(s16 *)(r10 - 2);"       # zext_dst set
   0x3ff7fdb910e:       lgh     %r2,-2(%r13,%r0)                        # load halfword
   0x3ff7fdb9114:       llgfr   %r2,%r2                                 # wrong!
      25:       65 10 00 03 00 00 7f ff if r1 s> 32767 goto +3 <l0_1>   # check_cond_jmp_op()

Disable such zero-extensions. The JITs need to insert sign-extension
themselves, if necessary.

Suggested-by: Puranjay Mohan <puranjay12@gmail.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Puranjay Mohan <puranjay12@gmail.com>
Link: https://lore.kernel.org/r/20230919101336.2223655-2-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 38f8718f1602..eed7350e15f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3114,7 +3114,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 	if (class == BPF_LDX) {
 		if (t != SRC_OP)
-			return BPF_SIZE(code) == BPF_DW;
+			return BPF_SIZE(code) == BPF_DW || BPF_MODE(code) == BPF_MEMSX;
 		/* LDX source must be ptr. */
 		return true;
 	}
-- 
cgit v1.2.3


From 41e845628511878d6e89e2a9249c095e72aab7eb Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 9 Sep 2023 21:19:32 +0200
Subject: cred: add get_cred_many and put_cred_many

Some of the frequent consumers of get_cred and put_cred operate on 2
references on the same creds back-to-back.

Switch them to doing the work in one go instead.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
[PM: removed changelog from commit description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/cred.h | 59 ++++++++++++++++++++++++++++++++++++++++++++--------
 kernel/cred.c        | 26 +++++++++++++----------
 2 files changed, 65 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..56bc432fe49b 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -218,6 +218,20 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 					  cred->cap_inheritable));
 }
 
+/**
+ * get_new_cred_many - Get references on a new set of credentials
+ * @cred: The new credentials to reference
+ * @nr: Number of references to acquire
+ *
+ * Get references on the specified set of new credentials.  The caller must
+ * release all acquired references.
+ */
+static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
+{
+	atomic_add(nr, &cred->usage);
+	return cred;
+}
+
 /**
  * get_new_cred - Get a reference on a new set of credentials
  * @cred: The new credentials to reference
@@ -227,16 +241,16 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred(struct cred *cred)
 {
-	atomic_inc(&cred->usage);
-	return cred;
+	return get_new_cred_many(cred, 1);
 }
 
 /**
- * get_cred - Get a reference on a set of credentials
+ * get_cred_many - Get references on a set of credentials
  * @cred: The credentials to reference
+ * @nr: Number of references to acquire
  *
- * Get a reference on the specified set of credentials.  The caller must
- * release the reference.  If %NULL is passed, it is returned with no action.
+ * Get references on the specified set of credentials.  The caller must release
+ * all acquired reference.  If %NULL is passed, it is returned with no action.
  *
  * This is used to deal with a committed set of credentials.  Although the
  * pointer is const, this will temporarily discard the const and increment the
@@ -244,14 +258,28 @@ static inline struct cred *get_new_cred(struct cred *cred)
  * accidental alteration of a set of credentials that should be considered
  * immutable.
  */
-static inline const struct cred *get_cred(const struct cred *cred)
+static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 {
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
-	return get_new_cred(nonconst_cred);
+	return get_new_cred_many(nonconst_cred, nr);
+}
+
+/*
+ * get_cred - Get a reference on a set of credentials
+ * @cred: The credentials to reference
+ *
+ * Get a reference on the specified set of credentials.  The caller must
+ * release the reference.  If %NULL is passed, it is returned with no action.
+ *
+ * This is used to deal with a committed set of credentials.
+ */
+static inline const struct cred *get_cred(const struct cred *cred)
+{
+	return get_cred_many(cred, 1);
 }
 
 static inline const struct cred *get_cred_rcu(const struct cred *cred)
@@ -269,6 +297,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 /**
  * put_cred - Release a reference to a set of credentials
  * @cred: The credentials to release
+ * @nr: Number of references to release
  *
  * Release a reference to a set of credentials, deleting them when the last ref
  * is released.  If %NULL is passed, nothing is done.
@@ -277,17 +306,29 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
  * on task_struct are attached by const pointers to prevent accidental
  * alteration of otherwise immutable credential sets.
  */
-static inline void put_cred(const struct cred *_cred)
+static inline void put_cred_many(const struct cred *_cred, int nr)
 {
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_dec_and_test(&(cred)->usage))
+		if (atomic_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
 
+/*
+ * put_cred - Release a reference to a set of credentials
+ * @cred: The credentials to release
+ *
+ * Release a reference to a set of credentials, deleting them when the last ref
+ * is released.  If %NULL is passed, nothing is done.
+ */
+static inline void put_cred(const struct cred *cred)
+{
+	put_cred_many(cred, 1);
+}
+
 /**
  * current_cred - Access the current task's subjective credentials
  *
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..9398e534b997 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -162,23 +162,29 @@ EXPORT_SYMBOL(__put_cred);
  */
 void exit_creds(struct task_struct *tsk)
 {
-	struct cred *cred;
+	struct cred *real_cred, *cred;
 
 	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
 	       atomic_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
-	cred = (struct cred *) tsk->real_cred;
+	real_cred = (struct cred *) tsk->real_cred;
 	tsk->real_cred = NULL;
-	validate_creds(cred);
-	alter_cred_subscribers(cred, -1);
-	put_cred(cred);
 
 	cred = (struct cred *) tsk->cred;
 	tsk->cred = NULL;
+
 	validate_creds(cred);
-	alter_cred_subscribers(cred, -1);
-	put_cred(cred);
+	if (real_cred == cred) {
+		alter_cred_subscribers(cred, -2);
+		put_cred_many(cred, 2);
+	} else {
+		validate_creds(real_cred);
+		alter_cred_subscribers(real_cred, -1);
+		put_cred(real_cred);
+		alter_cred_subscribers(cred, -1);
+		put_cred(cred);
+	}
 
 #ifdef CONFIG_KEYS_REQUEST_CACHE
 	key_put(tsk->cached_requested_key);
@@ -355,8 +361,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 		clone_flags & CLONE_THREAD
 	    ) {
-		p->real_cred = get_cred(p->cred);
-		get_cred(p->cred);
+		p->real_cred = get_cred_many(p->cred, 2);
 		alter_cred_subscribers(p->cred, 2);
 		kdebug("share_creds(%p{%d,%d})",
 		       p->cred, atomic_read(&p->cred->usage),
@@ -520,8 +525,7 @@ int commit_creds(struct cred *new)
 		proc_id_connector(task, PROC_EVENT_GID);
 
 	/* release the old obj and subj refs both */
-	put_cred(old);
-	put_cred(old);
+	put_cred_many(old, 2);
 	return 0;
 }
 EXPORT_SYMBOL(commit_creds);
-- 
cgit v1.2.3


From 4812c54dc0498c4b757cbc7f41c1999b5a1c9f67 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Fri, 22 Sep 2023 04:35:59 +0000
Subject: locking/ww_mutex/test: Use prng instead of rng to avoid hangs at
 bootup

Booting w/ qemu without kvm, and with 64 cpus, I noticed we'd
sometimes hung task watchdog splats in get_random_u32_below()
when using the test-ww_mutex stress test.

While entropy exhaustion is no longer an issue, the RNG may be
slower early in boot. The test-ww_mutex code will spawn off
128 threads (2x cpus) and each thread will call
get_random_u32_below() a number of times to generate a random
order of the 16 locks.

This intense use takes time and without kvm, qemu can be slow
enough that we trip the hung task watchdogs.

For this test, we don't need true randomness, just mixed up
orders for testing ww_mutex lock acquisitions, so it changes
the logic to use the prng instead, which takes less time
and avoids the watchdgos.

Feedback would be appreciated!

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230922043616.19282-2-jstultz@google.com
---
 kernel/locking/test-ww_mutex.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 93cca6e69860..9bceba65858a 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -9,7 +9,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
 #include <linux/slab.h>
 #include <linux/ww_mutex.h>
 
@@ -386,6 +386,19 @@ struct stress {
 	int nlocks;
 };
 
+struct rnd_state rng;
+DEFINE_SPINLOCK(rng_lock);
+
+static inline u32 prandom_u32_below(u32 ceil)
+{
+	u32 ret;
+
+	spin_lock(&rng_lock);
+	ret = prandom_u32_state(&rng) % ceil;
+	spin_unlock(&rng_lock);
+	return ret;
+}
+
 static int *get_random_order(int count)
 {
 	int *order;
@@ -399,7 +412,7 @@ static int *get_random_order(int count)
 		order[n] = n;
 
 	for (n = count - 1; n > 1; n--) {
-		r = get_random_u32_below(n + 1);
+		r = prandom_u32_below(n + 1);
 		if (r != n) {
 			tmp = order[n];
 			order[n] = order[r];
@@ -625,6 +638,8 @@ static int __init test_ww_mutex_init(void)
 
 	printk(KERN_INFO "Beginning ww mutex selftests\n");
 
+	prandom_seed_state(&rng, get_random_u64());
+
 	wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0);
 	if (!wq)
 		return -ENOMEM;
-- 
cgit v1.2.3


From bccdd808902f8c677317cec47c306e42b93b849e Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Fri, 22 Sep 2023 04:36:00 +0000
Subject: locking/ww_mutex/test: Fix potential workqueue corruption

In some cases running with the test-ww_mutex code, I was seeing
odd behavior where sometimes it seemed flush_workqueue was
returning before all the work threads were finished.

Often this would cause strange crashes as the mutexes would be
freed while they were being used.

Looking at the code, there is a lifetime problem as the
controlling thread that spawns the work allocates the
"struct stress" structures that are passed to the workqueue
threads. Then when the workqueue threads are finished,
they free the stress struct that was passed to them.

Unfortunately the workqueue work_struct node is in the stress
struct. Which means the work_struct is freed before the work
thread returns and while flush_workqueue is waiting.

It seems like a better idea to have the controlling thread
both allocate and free the stress structures, so that we can
be sure we don't corrupt the workqueue by freeing the structure
prematurely.

So this patch reworks the test to do so, and with this change
I no longer see the early flush_workqueue returns.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230922043616.19282-3-jstultz@google.com
---
 kernel/locking/test-ww_mutex.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 9bceba65858a..358d66150426 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -479,7 +479,6 @@ retry:
 	} while (!time_after(jiffies, stress->timeout));
 
 	kfree(order);
-	kfree(stress);
 }
 
 struct reorder_lock {
@@ -544,7 +543,6 @@ out:
 	list_for_each_entry_safe(ll, ln, &locks, link)
 		kfree(ll);
 	kfree(order);
-	kfree(stress);
 }
 
 static void stress_one_work(struct work_struct *work)
@@ -565,8 +563,6 @@ static void stress_one_work(struct work_struct *work)
 			break;
 		}
 	} while (!time_after(jiffies, stress->timeout));
-
-	kfree(stress);
 }
 
 #define STRESS_INORDER BIT(0)
@@ -577,15 +573,24 @@ static void stress_one_work(struct work_struct *work)
 static int stress(int nlocks, int nthreads, unsigned int flags)
 {
 	struct ww_mutex *locks;
-	int n;
+	struct stress *stress_array;
+	int n, count;
 
 	locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL);
 	if (!locks)
 		return -ENOMEM;
 
+	stress_array = kmalloc_array(nthreads, sizeof(*stress_array),
+				     GFP_KERNEL);
+	if (!stress_array) {
+		kfree(locks);
+		return -ENOMEM;
+	}
+
 	for (n = 0; n < nlocks; n++)
 		ww_mutex_init(&locks[n], &ww_class);
 
+	count = 0;
 	for (n = 0; nthreads; n++) {
 		struct stress *stress;
 		void (*fn)(struct work_struct *work);
@@ -609,9 +614,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
 		if (!fn)
 			continue;
 
-		stress = kmalloc(sizeof(*stress), GFP_KERNEL);
-		if (!stress)
-			break;
+		stress = &stress_array[count++];
 
 		INIT_WORK(&stress->work, fn);
 		stress->locks = locks;
@@ -626,6 +629,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags)
 
 	for (n = 0; n < nlocks; n++)
 		ww_mutex_destroy(&locks[n]);
+	kfree(stress_array);
 	kfree(locks);
 
 	return 0;
-- 
cgit v1.2.3


From cfa92b6d52071aaa8f27d21affdcb14e7448fbc1 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Fri, 22 Sep 2023 04:36:01 +0000
Subject: locking/ww_mutex/test: Make sure we bail out instead of livelock

I've seen what appears to be livelocks in the stress_inorder_work()
function, and looking at the code it is clear we can have a case
where we continually retry acquiring the locks and never check to
see if we have passed the specified timeout.

This patch reworks that function so we always check the timeout
before iterating through the loop again.

I believe others may have hit this previously here:

  https://lore.kernel.org/lkml/895ef450-4fb3-5d29-a6ad-790657106a5a@intel.com/

Reported-by: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230922043616.19282-4-jstultz@google.com
---
 kernel/locking/test-ww_mutex.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 358d66150426..78719e1ef1b1 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -465,17 +465,18 @@ retry:
 			ww_mutex_unlock(&locks[order[n]]);
 
 		if (err == -EDEADLK) {
-			ww_mutex_lock_slow(&locks[order[contended]], &ctx);
-			goto retry;
+			if (!time_after(jiffies, stress->timeout)) {
+				ww_mutex_lock_slow(&locks[order[contended]], &ctx);
+				goto retry;
+			}
 		}
 
+		ww_acquire_fini(&ctx);
 		if (err) {
 			pr_err_once("stress (%s) failed with %d\n",
 				    __func__, err);
 			break;
 		}
-
-		ww_acquire_fini(&ctx);
 	} while (!time_after(jiffies, stress->timeout));
 
 	kfree(order);
-- 
cgit v1.2.3


From dc461c48deda8a2d243fbaf49e276d555eb833d8 Mon Sep 17 00:00:00 2001
From: Liming Wu <liming.wu@jaguarmicro.com>
Date: Fri, 25 Aug 2023 10:35:00 +0800
Subject: sched/debug: Avoid checking in_atomic_preempt_off() twice in
 schedule_debug()

in_atomic_preempt_off() already gets called in schedule_debug() once,
which is the only caller of __schedule_bug().

Skip the second call within __schedule_bug(), it should always be true
at this point.

[ mingo: Clarified the changelog. ]

Signed-off-by: Liming Wu <liming.wu@jaguarmicro.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230825023501.1848-1-liming.wu@jaguarmicro.com
---
 kernel/sched/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 52ceb85b6421..107493469b4e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5899,8 +5899,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
-	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
-	    && in_atomic_preempt_off()) {
+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
 		pr_err("Preemption disabled at:");
 		print_ip_sym(KERN_ERR, preempt_disable_ip);
 	}
-- 
cgit v1.2.3


From 215199e3d9f3dc01a6d10b8229891e6f7f1085e7 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 24 Aug 2023 21:25:55 -0700
Subject: hardening: Provide Kconfig fragments for basic options

Inspired by Salvatore Mesoraca's earlier[1] efforts to provide some
in-tree guidance for kernel hardening Kconfig options, add a new fragment
named "hardening-basic.config" (along with some arch-specific fragments)
that enable a basic set of kernel hardening options that have the least
(or no) performance impact and remove a reasonable set of legacy APIs.

Using this fragment is as simple as running "make hardening.config".

More extreme fragments can be added[2] in the future to cover all the
recognized hardening options, and more per-architecture files can be
added too.

For now, document the fragments directly via comments. Perhaps .rst
documentation can be generated from them in the future (rather than the
other way around).

[1] https://lore.kernel.org/kernel-hardening/1536516257-30871-1-git-send-email-s.mesoraca16@gmail.com/
[2] https://github.com/KSPP/linux/issues/14

Cc: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Cc: x86@kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kbuild@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 MAINTAINERS                           |  2 +
 arch/arm/configs/hardening.config     |  7 +++
 arch/arm64/configs/hardening.config   | 22 ++++++++
 arch/powerpc/configs/hardening.config | 10 ++++
 arch/x86/configs/hardening.config     | 15 ++++++
 kernel/configs/hardening.config       | 98 +++++++++++++++++++++++++++++++++++
 6 files changed, 154 insertions(+)
 create mode 100644 arch/arm/configs/hardening.config
 create mode 100644 arch/arm64/configs/hardening.config
 create mode 100644 arch/powerpc/configs/hardening.config
 create mode 100644 arch/x86/configs/hardening.config
 create mode 100644 kernel/configs/hardening.config

(limited to 'kernel')

diff --git a/MAINTAINERS b/MAINTAINERS
index bf0f54c24f81..737dcc7a2155 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11398,8 +11398,10 @@ S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
 F:	Documentation/ABI/testing/sysfs-kernel-oops_count
 F:	Documentation/ABI/testing/sysfs-kernel-warn_count
+F:	arch/*/configs/hardening.config
 F:	include/linux/overflow.h
 F:	include/linux/randomize_kstack.h
+F:	kernel/configs/hardening.config
 F:	mm/usercopy.c
 K:	\b(add|choose)_random_kstack_offset\b
 K:	\b__check_(object_size|heap_object)\b
diff --git a/arch/arm/configs/hardening.config b/arch/arm/configs/hardening.config
new file mode 100644
index 000000000000..327349ce6377
--- /dev/null
+++ b/arch/arm/configs/hardening.config
@@ -0,0 +1,7 @@
+# Basic kernel hardening options (specific to arm)
+
+# Make sure PXN/PAN emulation is enabled.
+CONFIG_CPU_SW_DOMAIN_PAN=y
+
+# Dangerous; old interfaces and needless additional attack surface.
+# CONFIG_OABI_COMPAT is not set
diff --git a/arch/arm64/configs/hardening.config b/arch/arm64/configs/hardening.config
new file mode 100644
index 000000000000..b0e795208998
--- /dev/null
+++ b/arch/arm64/configs/hardening.config
@@ -0,0 +1,22 @@
+# Basic kernel hardening options (specific to arm64)
+
+# Make sure PAN emulation is enabled.
+CONFIG_ARM64_SW_TTBR0_PAN=y
+
+# Software Shadow Stack or PAC
+CONFIG_SHADOW_CALL_STACK=y
+
+# Pointer authentication (ARMv8.3 and later). If hardware actually supports
+# it, one can turn off CONFIG_STACKPROTECTOR_STRONG with this enabled.
+CONFIG_ARM64_PTR_AUTH=y
+CONFIG_ARM64_PTR_AUTH_KERNEL=y
+
+# Available in ARMv8.5 and later.
+CONFIG_ARM64_BTI=y
+CONFIG_ARM64_BTI_KERNEL=y
+CONFIG_ARM64_MTE=y
+CONFIG_KASAN_HW_TAGS=y
+CONFIG_ARM64_E0PD=y
+
+# Available in ARMv8.7 and later.
+CONFIG_ARM64_EPAN=y
diff --git a/arch/powerpc/configs/hardening.config b/arch/powerpc/configs/hardening.config
new file mode 100644
index 000000000000..4e9bba327e8f
--- /dev/null
+++ b/arch/powerpc/configs/hardening.config
@@ -0,0 +1,10 @@
+# PowerPC specific hardening options
+
+# Block kernel from unexpectedly reading userspace memory.
+CONFIG_PPC_KUAP=y
+
+# Attack surface reduction.
+# CONFIG_SCOM_DEBUGFS is not set
+
+# Disable internal kernel debugger.
+# CONFIG_XMON is not set
diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config
new file mode 100644
index 000000000000..19bb0c7a7669
--- /dev/null
+++ b/arch/x86/configs/hardening.config
@@ -0,0 +1,15 @@
+# Basic kernel hardening options (specific to x86)
+
+# Modern libc no longer needs a fixed-position mapping in userspace, remove
+# it as a possible target.
+CONFIG_LEGACY_VSYSCALL_NONE=y
+
+# Enable chip-specific IOMMU support.
+CONFIG_INTEL_IOMMU=y
+CONFIG_INTEL_IOMMU_DEFAULT_ON=y
+CONFIG_INTEL_IOMMU_SVM=y
+CONFIG_AMD_IOMMU=y
+CONFIG_AMD_IOMMU_V2=y
+
+# Enable CET Shadow Stack for userspace.
+CONFIG_X86_USER_SHADOW_STACK=y
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
new file mode 100644
index 000000000000..95a400f042b1
--- /dev/null
+++ b/kernel/configs/hardening.config
@@ -0,0 +1,98 @@
+# Help: Basic kernel hardening options
+#
+# These are considered the basic kernel hardening, self-protection, and
+# attack surface reduction options. They are expected to have low (or
+# no) performance impact on most workloads, and have a reasonable level
+# of legacy API removals.
+
+# Make sure reporting of various hardening actions is possible.
+CONFIG_BUG=y
+
+# Basic kernel memory permission enforcement.
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_VMAP_STACK=y
+
+# Kernel image and memory ASLR.
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+
+# Randomize allocator freelists, harden metadata.
+CONFIG_SLAB_FREELIST_RANDOM=y
+CONFIG_SLAB_FREELIST_HARDENED=y
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
+CONFIG_RANDOM_KMALLOC_CACHES=y
+
+# Randomize kernel stack offset on syscall entry.
+CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
+
+# Basic stack frame overflow protection.
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+
+# Basic buffer length bounds checking.
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_FORTIFY_SOURCE=y
+
+# Basic array index bounds checking.
+CONFIG_UBSAN=y
+CONFIG_UBSAN_TRAP=y
+CONFIG_UBSAN_BOUNDS=y
+# CONFIG_UBSAN_SHIFT is not set
+# CONFIG_UBSAN_DIV_ZERO
+# CONFIG_UBSAN_UNREACHABLE
+# CONFIG_UBSAN_BOOL
+# CONFIG_UBSAN_ENUM
+# CONFIG_UBSAN_ALIGNMENT
+CONFIG_UBSAN_SANITIZE_ALL=y
+
+# Linked list integrity checking.
+CONFIG_LIST_HARDENED=y
+
+# Initialize all heap variables to zero on allocation.
+CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+
+# Initialize all stack variables to zero on function entry.
+CONFIG_INIT_STACK_ALL_ZERO=y
+
+# Wipe RAM at reboot via EFI. For more details, see:
+# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
+# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
+CONFIG_RESET_ATTACK_MITIGATION=y
+
+# Disable DMA between EFI hand-off and the kernel's IOMMU setup.
+CONFIG_EFI_DISABLE_PCI_DMA=y
+
+# Force IOMMU TLB invalidation so devices will never be able to access stale
+# data content.
+CONFIG_IOMMU_SUPPORT=y
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+
+# Do not allow direct physical memory access to non-device memory.
+CONFIG_STRICT_DEVMEM=y
+CONFIG_IO_STRICT_DEVMEM=y
+
+# Provide userspace with seccomp BPF API for syscall attack surface reduction.
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+
+# Provides some protections against SYN flooding.
+CONFIG_SYN_COOKIES=y
+
+# Attack surface reduction: do not autoload TTY line disciplines.
+# CONFIG_LDISC_AUTOLOAD is not set
+
+# Dangerous; enabling this disables userspace brk ASLR.
+# CONFIG_COMPAT_BRK is not set
+
+# Dangerous; exposes kernel text image layout.
+# CONFIG_PROC_KCORE is not set
+
+# Dangerous; enabling this disables userspace VDSO ASLR.
+# CONFIG_COMPAT_VDSO is not set
+
+# Attack surface reduction: Use the modern PTY interface (devpts) only.
+# CONFIG_LEGACY_PTYS is not set
+
+# Attack surface reduction: Use only modesetting video drivers.
+# CONFIG_DRM_LEGACY is not set
-- 
cgit v1.2.3


From 45d99ea451d0c30bfd4864f0fe485d7dac014902 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Thu, 21 Sep 2023 20:54:25 +0800
Subject: ring-buffer: Fix bytes info in per_cpu buffer stats

The 'bytes' info in file 'per_cpu/cpu<X>/stats' means the number of
bytes in cpu buffer that have not been consumed. However, currently
after consuming data by reading file 'trace_pipe', the 'bytes' info
was not changed as expected.

  # cat per_cpu/cpu0/stats
  entries: 0
  overrun: 0
  commit overrun: 0
  bytes: 568             <--- 'bytes' is problematical !!!
  oldest event ts:  8651.371479
  now ts:  8653.912224
  dropped events: 0
  read events: 8

The root cause is incorrect stat on cpu_buffer->read_bytes. To fix it:
  1. When stat 'read_bytes', account consumed event in rb_advance_reader();
  2. When stat 'entries_bytes', exclude the discarded padding event which
     is smaller than minimum size because it is invisible to reader. Then
     use rb_page_commit() instead of BUF_PAGE_SIZE at where accounting for
     page-based read/remove/overrun.

Also correct the comments of ring_buffer_bytes_cpu() in this patch.

Link: https://lore.kernel.org/linux-trace-kernel/20230921125425.1708423-1-zhengyejian1@huawei.com

Cc: stable@vger.kernel.org
Fixes: c64e148a3be3 ("trace: Add ring buffer stats to measure rate of events")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1651edc48d5..28daf0ce95c5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }
 
+static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
+{
+	return local_read(&bpage->page->commit);
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	free_page((unsigned long)bpage->page);
@@ -2003,7 +2008,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
 			 * Increment overrun to account for the lost events.
 			 */
 			local_add(page_entries, &cpu_buffer->overrun);
-			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+			local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
 			local_inc(&cpu_buffer->pages_lost);
 		}
 
@@ -2367,11 +2372,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
 			       cpu_buffer->reader_page->read);
 }
 
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
-{
-	return local_read(&bpage->page->commit);
-}
-
 static struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
@@ -2517,7 +2517,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 		 * the counters.
 		 */
 		local_add(entries, &cpu_buffer->overrun);
-		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+		local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
 		local_inc(&cpu_buffer->pages_lost);
 
 		/*
@@ -2660,9 +2660,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 	event = __rb_page_index(tail_page, tail);
 
-	/* account for padding bytes */
-	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
-
 	/*
 	 * Save the original length to the meta data.
 	 * This will be used by the reader to add lost event
@@ -2676,7 +2673,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * write counter enough to allow another writer to slip
 	 * in on this page.
 	 * We put in a discarded commit instead, to make sure
-	 * that this space is not used again.
+	 * that this space is not used again, and this space will
+	 * not be accounted into 'entries_bytes'.
 	 *
 	 * If we are less than the minimum size, we don't need to
 	 * worry about it.
@@ -2701,6 +2699,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	/* time delta must be non zero */
 	event->time_delta = 1;
 
+	/* account for padding bytes */
+	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+
 	/* Make sure the padding is visible before the tail_page->write update */
 	smp_wmb();
 
@@ -4215,7 +4216,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
 EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
 
 /**
- * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
  * @buffer: The ring buffer
  * @cpu: The per CPU buffer to read from.
  */
@@ -4723,6 +4724,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	length = rb_event_length(event);
 	cpu_buffer->reader_page->read += length;
+	cpu_buffer->read_bytes += length;
 }
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -5816,7 +5818,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	} else {
 		/* update the entry counter */
 		cpu_buffer->read += rb_page_entries(reader);
-		cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+		cpu_buffer->read_bytes += rb_page_commit(reader);
 
 		/* swap the pages */
 		rb_init_page(bpage);
-- 
cgit v1.2.3


From 30797bce8ef0c73f0c388148ffac92458533b10e Mon Sep 17 00:00:00 2001
From: Josh Don <joshdon@google.com>
Date: Fri, 22 Sep 2023 16:05:34 -0700
Subject: sched/fair: Make cfs_rq->throttled_csd_list available on !SMP

This makes the following patch cleaner by avoiding extra CONFIG_SMP
conditionals on the availability of rq->throttled_csd_list.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230922230535.296350-1-joshdon@google.com
---
 kernel/sched/fair.c  | 4 ----
 kernel/sched/sched.h | 2 --
 2 files changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 75720008fdd2..41c960eca792 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5763,11 +5763,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;
 
-#ifdef CONFIG_SMP
 		/* Already queued for async unthrottle */
 		if (!list_empty(&cfs_rq->throttled_csd_list))
 			goto next;
-#endif
 
 		/* By the above checks, this should never be true */
 		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -6134,9 +6132,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
 	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9260120ed2a5..96f8ab7a0702 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -634,9 +634,7 @@ struct cfs_rq {
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
-#ifdef CONFIG_SMP
 	struct list_head	throttled_csd_list;
-#endif
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
-- 
cgit v1.2.3


From 2f8c62296b6f656bbfd17e9f1fadd7478003a9d9 Mon Sep 17 00:00:00 2001
From: Josh Don <joshdon@google.com>
Date: Fri, 22 Sep 2023 16:05:35 -0700
Subject: sched/fair: Fix warning in bandwidth distribution

We've observed the following warning being hit in
distribute_cfs_runtime():

	SCHED_WARN_ON(cfs_rq->runtime_remaining > 0)

We have the following race:

 - CPU 0: running bandwidth distribution (distribute_cfs_runtime).
   Inspects the local cfs_rq and makes its runtime_remaining positive.
   However, we defer unthrottling the local cfs_rq until after
   considering all remote cfs_rq's.

 - CPU 1: starts running bandwidth distribution from the slack timer. When
   it finds the cfs_rq for CPU 0 on the throttled list, it observers the
   that the cfs_rq is throttled, yet is not on the CSD list, and has a
   positive runtime_remaining, thus triggering the warning in
   distribute_cfs_runtime.

To fix this, we can rework the local unthrottling logic to put the local
cfs_rq on a local list, so that any future bandwidth distributions will
realize that the cfs_rq is about to be unthrottled.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230922230535.296350-2-joshdon@google.com
---
 kernel/sched/fair.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41c960eca792..2973173ad850 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5741,13 +5741,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
 
 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 {
-	struct cfs_rq *local_unthrottle = NULL;
 	int this_cpu = smp_processor_id();
 	u64 runtime, remaining = 1;
 	bool throttled = false;
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *tmp;
 	struct rq_flags rf;
 	struct rq *rq;
+	LIST_HEAD(local_unthrottle);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5782,11 +5782,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
 		/* we check whether we're throttled above */
 		if (cfs_rq->runtime_remaining > 0) {
-			if (cpu_of(rq) != this_cpu ||
-			    SCHED_WARN_ON(local_unthrottle))
+			if (cpu_of(rq) != this_cpu) {
 				unthrottle_cfs_rq_async(cfs_rq);
-			else
-				local_unthrottle = cfs_rq;
+			} else {
+				/*
+				 * We currently only expect to be unthrottling
+				 * a single cfs_rq locally.
+				 */
+				SCHED_WARN_ON(!list_empty(&local_unthrottle));
+				list_add_tail(&cfs_rq->throttled_csd_list,
+					      &local_unthrottle);
+			}
 		} else {
 			throttled = true;
 		}
@@ -5794,15 +5800,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 next:
 		rq_unlock_irqrestore(rq, &rf);
 	}
-	rcu_read_unlock();
 
-	if (local_unthrottle) {
-		rq = cpu_rq(this_cpu);
+	list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+				 throttled_csd_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
 		rq_lock_irqsave(rq, &rf);
-		if (cfs_rq_throttled(local_unthrottle))
-			unthrottle_cfs_rq(local_unthrottle);
+
+		list_del_init(&cfs_rq->throttled_csd_list);
+
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+
 		rq_unlock_irqrestore(rq, &rf);
 	}
+	SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+	rcu_read_unlock();
 
 	return throttled;
 }
-- 
cgit v1.2.3


From 3eafe225995c67f8c179011ec2d6e4c12b32a53d Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Sun, 20 Aug 2023 20:53:17 +0800
Subject: sched/core: Refactor the task_flags check for worker sleeping in
 sched_submit_work()

Simplify the conditional logic for checking worker flags
by splitting the original compound `if` statement into
separate `if` and `else if` clauses.

This modification not only retains the previous functionality,
but also reduces a single `if` check, improving code clarity
and potentially enhancing performance.

Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/ZOIMvURE99ZRAYEj@fedora
---
 kernel/sched/core.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 107493469b4e..84881a582847 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6711,12 +6711,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * If a worker goes to sleep, notify and ask workqueue whether it
 	 * wants to wake up a task to maintain concurrency.
 	 */
-	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
-		if (task_flags & PF_WQ_WORKER)
-			wq_worker_sleeping(tsk);
-		else
-			io_wq_worker_sleeping(tsk);
-	}
+	if (task_flags & PF_WQ_WORKER)
+		wq_worker_sleeping(tsk);
+	else if (task_flags & PF_IO_WORKER)
+		io_wq_worker_sleeping(tsk);
 
 	/*
 	 * spinlock and rwlock must not flush block requests.  This will
-- 
cgit v1.2.3


From d0b654e19a83840e11d6ba87ea0be30e5fae343a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 19 Jul 2023 12:03:20 -0700
Subject: torture: Share torture_random_state with torture_shuffle_tasks()

Both torture_shuffle_tasks() and its caller torture_shuffle()
define a torture_random_state structure.  This is suboptimal given
that torture_shuffle_tasks() runs for a very short period of time.
This commit therefore causes torture_shuffle() to pass a pointer to its
torture_random_state structure down to torture_shuffle_tasks().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/torture.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index b28b05bbef02..68dba4ecab5c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -520,9 +520,8 @@ static void torture_shuffle_task_unregister_all(void)
  * A special case is when shuffle_idle_cpu = -1, in which case we allow
  * the tasks to run on all CPUs.
  */
-static void torture_shuffle_tasks(void)
+static void torture_shuffle_tasks(struct torture_random_state *trp)
 {
-	DEFINE_TORTURE_RANDOM(rand);
 	struct shuffle_task *stp;
 
 	cpumask_setall(shuffle_tmp_mask);
@@ -543,7 +542,7 @@ static void torture_shuffle_tasks(void)
 
 	mutex_lock(&shuffle_task_mutex);
 	list_for_each_entry(stp, &shuffle_task_list, st_l) {
-		if (!random_shuffle || torture_random(&rand) & 0x1)
+		if (!random_shuffle || torture_random(trp) & 0x1)
 			set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
 	}
 	mutex_unlock(&shuffle_task_mutex);
@@ -562,7 +561,7 @@ static int torture_shuffle(void *arg)
 	VERBOSE_TOROUT_STRING("torture_shuffle task started");
 	do {
 		torture_hrtimeout_jiffies(shuffle_interval, &rand);
-		torture_shuffle_tasks();
+		torture_shuffle_tasks(&rand);
 		torture_shutdown_absorb("torture_shuffle");
 	} while (!torture_must_stop());
 	torture_kthread_stopping("torture_shuffle");
@@ -673,7 +672,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void))
 	if (ssecs > 0) {
 		shutdown_time = ktime_add(ktime_get(), ktime_set(ssecs, 0));
 		return torture_create_kthread(torture_shutdown, NULL,
-					     shutdown_task);
+					      shutdown_task);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From a741deac787f0d2d7068638c067db20af9e63752 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 26 Jul 2023 13:57:03 -0700
Subject: torture: Make torture_hrtimeout_ns() take an hrtimer mode parameter

The current torture-test sleeps are waiting for a duration, but there
are situations where it is better to wait for an absolute time, for
example, when ending a stutter interval.  This commit therefore adds
an hrtimer mode parameter to torture_hrtimeout_ns().  Why not also the
other torture_hrtimeout_*() functions?  The theory is that most absolute
times will be in nanoseconds, especially not (say) jiffies.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/torture.h |  3 ++-
 kernel/torture.c        | 13 +++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index bb466eec01e4..017f0f710815 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -81,7 +81,8 @@ static inline void torture_random_init(struct torture_random_state *trsp)
 }
 
 /* Definitions for high-resolution-timer sleeps. */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp);
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp);
 int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp);
 int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp);
 int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp);
diff --git a/kernel/torture.c b/kernel/torture.c
index 68dba4ecab5c..6ba62e5993e7 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep);
  * nanosecond random fuzz.  This function and its friends desynchronize
  * testing from the timer wheel.
  */
-int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp)
+int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode,
+			 struct torture_random_state *trsp)
 {
 	ktime_t hto = baset_ns;
 
 	if (trsp)
 		hto += torture_random(trsp) % fuzzt_ns;
 	set_current_state(TASK_IDLE);
-	return schedule_hrtimeout(&hto, HRTIMER_MODE_REL);
+	return schedule_hrtimeout(&hto, mode);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_ns);
 
@@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state
 {
 	ktime_t baset_ns = baset_us * NSEC_PER_USEC;
 
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_us);
 
@@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state
 		fuzzt_ns = (u32)~0U;
 	else
 		fuzzt_ns = fuzzt_us * NSEC_PER_USEC;
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_ms);
 
@@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp)
 {
 	ktime_t baset_ns = jiffies_to_nsecs(baset_j);
 
-	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp);
+	return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies);
 
@@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *
 		fuzzt_ns = (u32)~0U;
 	else
 		fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC;
-	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp);
+	return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp);
 }
 EXPORT_SYMBOL_GPL(torture_hrtimeout_s);
 
-- 
cgit v1.2.3


From 3853a720f8bce9c5facb593e817f97659cc0512a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 3 Aug 2023 16:40:11 +0200
Subject: rcu: Include torture_sched_setaffinity() declaration

The prototype for torture_sched_setaffinity() will be moved to a
different header, which will need to be included from update.c to avoid
this W=1 warning:

kernel/rcu/update.c:529:6: error: no previous prototype for 'torture_sched_setaffinity' [-Werror=missing-prototypes]
  529 | long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/update.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 19bf6fa3ee6a..9d3c2e6ba667 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -25,6 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/torture.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
 #include <linux/percpu.h>
-- 
cgit v1.2.3


From 0cfecd7d754f2ef5d7e6b56ee656a8544ade920a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 31 Jul 2023 13:10:34 -0700
Subject: torture: Move rcutorture_sched_setaffinity() out of rcutorture

The rcutorture_sched_setaffinity() function is needed by locktorture,
so move its declaration from rcu.h to torture.h and rename it to the
more generic torture_sched_setaffinity() name.

Please note that use of this function is still restricted to torture
tests, and of those, currently only rcutorture and locktorture.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/torture.h | 5 +++++
 kernel/rcu/rcu.h        | 4 ----
 kernel/rcu/rcutorture.c | 2 +-
 kernel/rcu/update.c     | 8 ++++----
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 017f0f710815..c98d0c83d117 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -121,10 +121,15 @@ void _torture_stop_kthread(char *m, struct task_struct **tp);
 #define torture_stop_kthread(n, tp) \
 	_torture_stop_kthread("Stopping " #n " task", &(tp))
 
+/* Scheduler-related definitions. */
 #ifdef CONFIG_PREEMPTION
 #define torture_preempt_schedule() __preempt_schedule()
 #else
 #define torture_preempt_schedule()	do { } while (0)
 #endif
 
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+#endif
+
 #endif /* __LINUX_TORTURE_H */
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 98e13be411af..567bd3d72e39 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -568,10 +568,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 static inline void rcu_gp_set_torture_wait(int duration) { }
 #endif
 
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
-#endif
-
 #ifdef CONFIG_TINY_SRCU
 
 static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ade42d6a9d9b..7e82fb887d09 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -810,7 +810,7 @@ static void synchronize_rcu_trivial(void)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+		torture_sched_setaffinity(current->pid, cpumask_of(cpu));
 		WARN_ON_ONCE(raw_smp_processor_id() != cpu);
 	}
 }
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 9d3c2e6ba667..c534d6806d3d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -525,17 +525,17 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
 	do { } while (0)
 #endif
 
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
 /* Get rcutorture access to sched_setaffinity(). */
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	int ret;
 
 	ret = sched_setaffinity(pid, in_mask);
-	WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+	WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
 #endif
 
 #ifdef CONFIG_RCU_STALL_COMMON
-- 
cgit v1.2.3


From 73e3412424832f519004dd2176226982be35d34e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 27 Jul 2023 20:04:06 -0700
Subject: locktorture: Add readers_bind and writers_bind module parameters

This commit adds readers_bind and writers_bind module parameters to
locktorture in order to skew tests across socket boundaries.  This skewing
is intended to provide additional variable-latency stress on the primitive
under test.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/locking/locktorture.c | 64 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 270c7f80ce84..441866259278 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -56,6 +56,55 @@ module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type,
 		 "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
 
+static cpumask_var_t readers_bind; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t writers_bind; // Bind the writers to the specified set of CPUs.
+
+// Parse a cpumask kernel parameter.  If there are more users later on,
+// this might need to got to a more central location.
+static int param_set_cpumask(const char *val, const struct kernel_param *kp)
+{
+	cpumask_var_t *cm_bind = kp->arg;
+	int ret;
+	char *s;
+
+	if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) {
+		s = "Out of memory";
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	ret = cpulist_parse(val, *cm_bind);
+	if (!ret)
+		return ret;
+	s = "Bad CPU range";
+out_err:
+	pr_warn("%s: %s, all CPUs set\n", kp->name, s);
+	cpumask_setall(*cm_bind);
+	return ret;
+}
+
+// Output a cpumask kernel parameter.
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp)
+{
+	cpumask_var_t *cm_bind = kp->arg;
+
+	return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind));
+}
+
+static bool cpumask_nonempty(cpumask_var_t mask)
+{
+	return cpumask_available(mask) && !cpumask_empty(mask);
+}
+
+static const struct kernel_param_ops lt_bind_ops = {
+	.set = param_set_cpumask,
+	.get = param_get_cpumask,
+};
+
+module_param_cb(readers_bind, &lt_bind_ops, &readers_bind, 0644);
+module_param_cb(writers_bind, &lt_bind_ops, &writers_bind, 0644);
+
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+
 static struct task_struct *stats_task;
 static struct task_struct **writer_tasks;
 static struct task_struct **reader_tasks;
@@ -986,16 +1035,23 @@ static int lock_torture_stats(void *arg)
 	return 0;
 }
 
+
 static inline void
 lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
 				const char *tag)
 {
+	static cpumask_t cpumask_all;
+	cpumask_t *rcmp = cpumask_nonempty(readers_bind) ? readers_bind : &cpumask_all;
+	cpumask_t *wcmp = cpumask_nonempty(writers_bind) ? writers_bind : &cpumask_all;
+
+	cpumask_setall(&cpumask_all);
 	pr_alert("%s" TORTURE_FLAG
-		 "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+		 "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d readers_bind=%*pbl writers_bind=%*pbl\n",
 		 torture_type, tag, cxt.debug_lock ? " [debug]": "",
 		 cxt.nrealwriters_stress, cxt.nrealreaders_stress,
 		 nested_locks, stat_interval, verbose, shuffle_interval,
-		 stutter, shutdown_secs, onoff_interval, onoff_holdoff);
+		 stutter, shutdown_secs, onoff_interval, onoff_holdoff,
+		 cpumask_pr_args(rcmp), cpumask_pr_args(wcmp));
 }
 
 static void lock_torture_cleanup(void)
@@ -1250,6 +1306,8 @@ static int __init lock_torture_init(void)
 						     writer_fifo ? sched_set_fifo : NULL);
 		if (torture_init_error(firsterr))
 			goto unwind;
+		if (cpumask_nonempty(writers_bind))
+			torture_sched_setaffinity(writer_tasks[i]->pid, writers_bind);
 
 	create_reader:
 		if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1259,6 +1317,8 @@ static int __init lock_torture_init(void)
 						  reader_tasks[j]);
 		if (torture_init_error(firsterr))
 			goto unwind;
+		if (cpumask_nonempty(readers_bind))
+			torture_sched_setaffinity(reader_tasks[j]->pid, readers_bind);
 	}
 	if (stat_interval > 0) {
 		firsterr = torture_create_kthread(lock_torture_stats, NULL,
-- 
cgit v1.2.3


From cca42bd8eb1b54a4c9bbf48c79d120e66619a3e4 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 29 Jul 2023 14:27:31 +0000
Subject: rcutorture: Fix stuttering races and other issues

The stuttering code isn't functioning as expected. Ideally, it should
pause the torture threads for a designated period before resuming. Yet,
it fails to halt the test for the correct duration. Additionally, a race
condition exists, potentially causing the stuttering code to pause for
an extended period if the 'spt' variable is non-zero due to the stutter
orchestration thread's inadequate CPU time.

Moreover, over-stuttering can hinder RCU's progress on TREE07 kernels.
This happens as the stuttering code may run within a softirq due to RCU
callbacks. Consequently, ksoftirqd keeps a CPU busy for several seconds,
thus obstructing RCU's progress. This situation triggers a warning
message in the logs:

[ 2169.481783] rcu_torture_writer: rtort_pipe_count: 9

This warning suggests that an RCU torture object, although invisible to
RCU readers, couldn't make it past the pipe array and be freed -- a
strong indication that there weren't enough grace periods during the
stutter interval.

To address these issues, this patch sets the "stutter end" time to an
absolute point in the future set by the main stutter thread. This is
then used for waiting in stutter_wait(). While the stutter thread still
defines this absolute time, the waiters' waiting logic doesn't rely on
the stutter thread receiving sufficient CPU time to halt the stuttering
as the halting is now self-controlled.

Cc: stable@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/torture.c | 45 ++++++++++++---------------------------------
 1 file changed, 12 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index 6ba62e5993e7..fd353f98162f 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -720,7 +720,7 @@ static void torture_shutdown_cleanup(void)
  * suddenly applied to or removed from the system.
  */
 static struct task_struct *stutter_task;
-static int stutter_pause_test;
+static ktime_t stutter_till_abs_time;
 static int stutter;
 static int stutter_gap;
 
@@ -730,30 +730,16 @@ static int stutter_gap;
  */
 bool stutter_wait(const char *title)
 {
-	unsigned int i = 0;
 	bool ret = false;
-	int spt;
+	ktime_t till_ns;
 
 	cond_resched_tasks_rcu_qs();
-	spt = READ_ONCE(stutter_pause_test);
-	for (; spt; spt = READ_ONCE(stutter_pause_test)) {
-		if (!ret && !rt_task(current)) {
-			sched_set_normal(current, MAX_NICE);
-			ret = true;
-		}
-		if (spt == 1) {
-			torture_hrtimeout_jiffies(1, NULL);
-		} else if (spt == 2) {
-			while (READ_ONCE(stutter_pause_test)) {
-				if (!(i++ & 0xffff))
-					torture_hrtimeout_us(10, 0, NULL);
-				cond_resched();
-			}
-		} else {
-			torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL);
-		}
-		torture_shutdown_absorb(title);
+	till_ns = READ_ONCE(stutter_till_abs_time);
+	if (till_ns && ktime_before(ktime_get(), till_ns)) {
+		torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL);
+		ret = true;
 	}
+	torture_shutdown_absorb(title);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stutter_wait);
@@ -764,23 +750,16 @@ EXPORT_SYMBOL_GPL(stutter_wait);
  */
 static int torture_stutter(void *arg)
 {
-	DEFINE_TORTURE_RANDOM(rand);
-	int wtime;
+	ktime_t till_ns;
 
 	VERBOSE_TOROUT_STRING("torture_stutter task started");
 	do {
 		if (!torture_must_stop() && stutter > 1) {
-			wtime = stutter;
-			if (stutter > 2) {
-				WRITE_ONCE(stutter_pause_test, 1);
-				wtime = stutter - 3;
-				torture_hrtimeout_jiffies(wtime, &rand);
-				wtime = 2;
-			}
-			WRITE_ONCE(stutter_pause_test, 2);
-			torture_hrtimeout_jiffies(wtime, NULL);
+			till_ns = ktime_add_ns(ktime_get(),
+					       jiffies_to_nsecs(stutter));
+			WRITE_ONCE(stutter_till_abs_time, till_ns);
+			torture_hrtimeout_jiffies(stutter - 1, NULL);
 		}
-		WRITE_ONCE(stutter_pause_test, 0);
 		if (!torture_must_stop())
 			torture_hrtimeout_jiffies(stutter_gap, NULL);
 		torture_shutdown_absorb("torture_stutter");
-- 
cgit v1.2.3


From 31742a56c676b6014fde99a0cf07d3d12b822e9a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Aug 2023 15:30:31 -0700
Subject: locktorture: Alphabetize torture_param() entries

There are getting to be too many module parameters for a random list to be
comfortable, so this commit alphabetizes the list.  Strictly code motion.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/locking/locktorture.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 441866259278..57ee16cf879d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,21 +33,21 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
 
-torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
-torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
 torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
+torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads");
 torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
 torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, rt_boost, 2,
+		   "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
+torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
 torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable");
 torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
 torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, rt_boost, 2,
-		   "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types.");
-torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
 torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
-torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
 /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
 #define MAX_NESTED_LOCKS 8
 
-- 
cgit v1.2.3


From 84cee9e72e15b65921f05d41788d66a79c1baa25 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Aug 2023 15:42:03 -0700
Subject: locktorture: Consolidate "if" statements in lock_torture_writer()

There is a pair of adjacent "if" statements with identical conditions in
the lock_torture_writer() function.  This commit therefore combines them.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/locking/locktorture.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 57ee16cf879d..c8c322e69a90 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -889,11 +889,10 @@ static int lock_torture_writer(void *arg)
 			lock_is_write_held = true;
 			if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
 				lwsp->n_lock_fail++; /* rare, but... */
-
 			lwsp->n_lock_acquired++;
-		}
-		if (!skip_main_lock) {
+
 			cxt.cur_ops->write_delay(&rand);
+
 			lock_is_write_held = false;
 			WRITE_ONCE(last_lock_release, jiffies);
 			cxt.cur_ops->writeunlock(tid);
-- 
cgit v1.2.3


From e3bdaefbccbd27b1829414604456611928a5e291 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 2 Aug 2023 16:32:06 -0700
Subject: locktorture: Add acq_writer_lim to complain about long acquistion
 times

This commit adds a locktorture.acq_writer_lim module parameter that
specifies the maximum number of jiffies that is expected to be consumed
by write-side lock acquisition.  If this limit is exceeded, a WARN_ONCE()
causes a splat.  Note that this limit applies to the main lock acquisition
only, not to any nested acquisitions.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/locking/locktorture.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index c8c322e69a90..296815ef67ae 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -33,6 +33,7 @@
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
 
+torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
 torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
 torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
 torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
@@ -852,11 +853,13 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
  */
 static int lock_torture_writer(void *arg)
 {
+	unsigned long j;
+	unsigned long j1;
+	u32 lockset_mask;
 	struct lock_stress_stats *lwsp = arg;
-	int tid = lwsp - cxt.lwsa;
 	DEFINE_TORTURE_RANDOM(rand);
-	u32 lockset_mask;
 	bool skip_main_lock;
+	int tid = lwsp - cxt.lwsa;
 
 	VERBOSE_TOROUT_STRING("lock_torture_writer task started");
 	if (!rt_task(current))
@@ -883,12 +886,20 @@ static int lock_torture_writer(void *arg)
 			cxt.cur_ops->nested_lock(tid, lockset_mask);
 
 		if (!skip_main_lock) {
+			if (acq_writer_lim > 0)
+				j = jiffies;
 			cxt.cur_ops->writelock(tid);
 			if (WARN_ON_ONCE(lock_is_write_held))
 				lwsp->n_lock_fail++;
 			lock_is_write_held = true;
 			if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
 				lwsp->n_lock_fail++; /* rare, but... */
+			if (acq_writer_lim > 0) {
+				j1 = jiffies;
+				WARN_ONCE(time_after(j1, j + acq_writer_lim),
+					  "%s: Lock acquisition took %lu jiffies.\n",
+					  __func__, j1 - j);
+			}
 			lwsp->n_lock_acquired++;
 
 			cxt.cur_ops->write_delay(&rand);
-- 
cgit v1.2.3


From 394473d876eaee0338a50249d5f807d4ae95e33c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 16 Aug 2023 12:24:44 -0700
Subject: torture: Print out torture module parameters

The kernel/torture.c module now has several module parameters, so this
commit causes them to be printed out.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/torture.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index fd353f98162f..c72ab2d251f4 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -791,6 +791,13 @@ static void torture_stutter_cleanup(void)
 	stutter_task = NULL;
 }
 
+static void
+torture_print_module_parms(void)
+{
+	pr_alert("torture module --- %s:  disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n",
+		 torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle);
+}
+
 /*
  * Initialize torture module.  Please note that this is -not- invoked via
  * the usual module_init() mechanism, but rather by an explicit call from
@@ -813,6 +820,7 @@ bool torture_init_begin(char *ttype, int v)
 	torture_type = ttype;
 	verbose = v;
 	fullstop = FULLSTOP_DONTSTOP;
+	torture_print_module_parms();
 	return true;
 }
 EXPORT_SYMBOL_GPL(torture_init_begin);
-- 
cgit v1.2.3


From 00c24c9cfa7895d6e712bea6f7109911cfdd54d0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 21 Aug 2023 16:42:21 -0700
Subject: locktorture: Add new module parameters to
 lock_torture_print_module_parms()

This commit adds new module parameters to lock_torture_print_module_parms,
and alphabetizes things while in the area.  This change makes locktorture
test results more useful and self-contained.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/locking/locktorture.c | 64 ++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 296815ef67ae..d2a3a8cc1902 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -47,8 +47,8 @@ torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=d
 torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
 torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
+torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority");
 /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
 #define MAX_NESTED_LOCKS 8
 
@@ -166,12 +166,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused)
 
 static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -244,15 +241,14 @@ __acquires(torture_spinlock)
 static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
 	unsigned long j;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) {
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) {
 		j = jiffies;
-		mdelay(longdelay_ms);
+		mdelay(long_hold);
 		pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j);
 	}
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us)))
@@ -370,14 +366,12 @@ __acquires(torture_rwlock)
 static void torture_rwlock_write_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	else
 		udelay(shortdelay_us);
 }
@@ -398,14 +392,12 @@ __acquires(torture_rwlock)
 static void torture_rwlock_read_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 10;
-	const unsigned long longdelay_ms = 100;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	else
 		udelay(shortdelay_us);
 }
@@ -503,12 +495,9 @@ __acquires(torture_mutex)
 
 static void torture_mutex_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms * 5);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold * 5);
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -676,15 +665,13 @@ __acquires(torture_rtmutex)
 static void torture_rtmutex_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
 
 	/*
 	 * We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold);
 	if (!(torture_random(trsp) %
 	      (cxt.nrealwriters_stress * 200 * shortdelay_us)))
 		udelay(shortdelay_us);
@@ -741,12 +728,9 @@ __acquires(torture_rwsem)
 
 static void torture_rwsem_write_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms * 10);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold)))
+		mdelay(long_hold * 10);
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -766,14 +750,11 @@ __acquires(torture_rwsem)
 
 static void torture_rwsem_read_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_ms = 100;
-
 	/* We want a long delay occasionally to force massive contention.  */
-	if (!(torture_random(trsp) %
-	      (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
-		mdelay(longdelay_ms * 2);
+	if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold)))
+		mdelay(long_hold * 2);
 	else
-		mdelay(longdelay_ms / 2);
+		mdelay(long_hold / 2);
 	if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
 		torture_preempt_schedule();  /* Allow test to be preempted. */
 }
@@ -1056,11 +1037,12 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
 
 	cpumask_setall(&cpumask_all);
 	pr_alert("%s" TORTURE_FLAG
-		 "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d readers_bind=%*pbl writers_bind=%*pbl\n",
+		 "--- %s%s: acq_writer_lim=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n",
 		 torture_type, tag, cxt.debug_lock ? " [debug]": "",
-		 cxt.nrealwriters_stress, cxt.nrealreaders_stress,
-		 nested_locks, stat_interval, verbose, shuffle_interval,
-		 stutter, shutdown_secs, onoff_interval, onoff_holdoff,
+		 acq_writer_lim, long_hold, nested_locks, cxt.nrealreaders_stress,
+		 cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
+		 rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
+		 verbose, writer_fifo,
 		 cpumask_pr_args(rcmp), cpumask_pr_args(wcmp));
 }
 
-- 
cgit v1.2.3


From 7f993623e9ebcd633c0f760991e5078b95a37db3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 21 Aug 2023 19:36:10 -0700
Subject: locktorture: Add call_rcu_chains module parameter

When running locktorture on large systems, there will normally be
enough RCU activity to ensure that there is a grace period in flight
at all times.  However, on smaller systems, RCU might well be idle the
majority of the time.  This situation can be inconvenient in cases where
the RCU CPU stall warning is part of the debugging process.

This commit therefore adds an call_rcu_chains module parameter to
locktorture, allowing the user to specify the desired number of
self-propagating call_rcu() chains.  For good measure, immediately
before invoking call_rcu(), the self-propagating RCU callback invokes
start_poll_synchronize_rcu() to force the immediate start of a grace
period, with the call_rcu() forcing another to start shortly thereafter.

Booting with locktorture.call_rcu_chains=2 increases the probability
of a stuck locking primitive resulting in an RCU CPU stall warning from
about 25% to nearly 100%.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 +++
 kernel/locking/locktorture.c                    | 62 ++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0a1731a0f0ef..300e2c30986c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2913,6 +2913,13 @@
 			to extract confidential information from the kernel
 			are also disabled.
 
+	locktorture.call_rcu_chains= [KNL]
+			Specify the number of self-propagating call_rcu()
+			chains to set up.  These are used to ensure that
+			there is a high probability of an RCU grace period
+			in progress at any given time.	Defaults to 0,
+			which disables these call_rcu() chains.
+
 	locktorture.nreaders_stress= [KNL]
 			Set the number of locking read-acquisition kthreads.
 			Defaults to being automatically set based on the
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index d2a3a8cc1902..01d56e6c44d7 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -34,6 +34,7 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
 
 torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies).");
+torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable).");
 torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable");
 torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
 torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads");
@@ -119,6 +120,12 @@ struct lock_stress_stats {
 	long n_lock_acquired;
 };
 
+struct call_rcu_chain {
+	struct rcu_head crc_rh;
+	bool crc_stop;
+};
+struct call_rcu_chain *call_rcu_chain;
+
 /* Forward reference. */
 static void lock_torture_cleanup(void);
 
@@ -1037,15 +1044,60 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
 
 	cpumask_setall(&cpumask_all);
 	pr_alert("%s" TORTURE_FLAG
-		 "--- %s%s: acq_writer_lim=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n",
+		 "--- %s%s: acq_writer_lim=%d call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n",
 		 torture_type, tag, cxt.debug_lock ? " [debug]": "",
-		 acq_writer_lim, long_hold, nested_locks, cxt.nrealreaders_stress,
+		 acq_writer_lim, call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
 		 cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
 		 rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
 		 verbose, writer_fifo,
 		 cpumask_pr_args(rcmp), cpumask_pr_args(wcmp));
 }
 
+// If requested, maintain call_rcu() chains to keep a grace period always
+// in flight.  These increase the probability of getting an RCU CPU stall
+// warning and associated diagnostics when a locking primitive stalls.
+
+static void call_rcu_chain_cb(struct rcu_head *rhp)
+{
+	struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh);
+
+	if (!smp_load_acquire(&crcp->crc_stop)) {
+		(void)start_poll_synchronize_rcu(); // Start one grace period...
+		call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another.
+	}
+}
+
+// Start the requested number of call_rcu() chains.
+static int call_rcu_chain_init(void)
+{
+	int i;
+
+	if (call_rcu_chains <= 0)
+		return 0;
+	call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL);
+	if (!call_rcu_chains)
+		return -ENOMEM;
+	for (i = 0; i < call_rcu_chains; i++) {
+		call_rcu_chain[i].crc_stop = false;
+		call_rcu(&call_rcu_chain[i].crc_rh, call_rcu_chain_cb);
+	}
+	return 0;
+}
+
+// Stop all of the call_rcu() chains.
+static void call_rcu_chain_cleanup(void)
+{
+	int i;
+
+	if (!call_rcu_chain)
+		return;
+	for (i = 0; i < call_rcu_chains; i++)
+		smp_store_release(&call_rcu_chain[i].crc_stop, true);
+	rcu_barrier();
+	kfree(call_rcu_chain);
+	call_rcu_chain = NULL;
+}
+
 static void lock_torture_cleanup(void)
 {
 	int i;
@@ -1096,6 +1148,8 @@ static void lock_torture_cleanup(void)
 	kfree(cxt.lrsa);
 	cxt.lrsa = NULL;
 
+	call_rcu_chain_cleanup();
+
 end:
 	if (cxt.init_called) {
 		if (cxt.cur_ops->exit)
@@ -1225,6 +1279,10 @@ static int __init lock_torture_init(void)
 		}
 	}
 
+	firsterr = call_rcu_chain_init();
+	if (torture_init_error(firsterr))
+		goto unwind;
+
 	lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
 
 	/* Prepare torture context. */
-- 
cgit v1.2.3


From 2273799c292b033a20b0897e1ce2e2dee32304ef Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 22 Aug 2023 08:48:16 -0700
Subject: locktorture: Rename readers_bind/writers_bind to
 bind_readers/bind_writers

This commit renames the readers_bind and writers_bind module parameters
to bind_readers and bind_writers, respectively.  This provides added
clarity via the imperative mode and better organizes the documentation.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 16 +++++++-------
 kernel/locking/locktorture.c                    | 28 ++++++++++++-------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 47f8b1cef4fd..1d539c6d9d1c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2918,6 +2918,14 @@
 			acquisition.  Acquisitions exceeding this limit
 			will result in a splat once they do complete.
 
+	locktorture.bind_readers= [KNL]
+			Specify the list of CPUs to which the readers are
+			to be bound.
+
+	locktorture.bind_writers= [KNL]
+			Specify the list of CPUs to which the writers are
+			to be bound.
+
 	locktorture.call_rcu_chains= [KNL]
 			Specify the number of self-propagating call_rcu()
 			chains to set up.  These are used to ensure that
@@ -2952,10 +2960,6 @@
 			Set time (s) between CPU-hotplug operations, or
 			zero to disable CPU-hotplug testing.
 
-	locktorture.readers_bind= [KNL]
-			Specify the list of CPUs to which the readers are
-			to be bound.
-
 	locktorture.rt_boost= [KNL]
 			Do periodic testing of real-time lock priority
 			boosting.  Select 0 to disable, 1 to boost
@@ -3000,10 +3004,6 @@
 	locktorture.verbose= [KNL]
 			Enable additional printk() statements.
 
-	locktorture.writers_bind= [KNL]
-			Specify the list of CPUs to which the writers are
-			to be bound.
-
 	locktorture.writer_fifo= [KNL]
 			Run the write-side locktorture kthreads at
 			sched_set_fifo() real-time priority.
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 01d56e6c44d7..a3abcd136f56 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -58,8 +58,8 @@ module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type,
 		 "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
 
-static cpumask_var_t readers_bind; // Bind the readers to the specified set of CPUs.
-static cpumask_var_t writers_bind; // Bind the writers to the specified set of CPUs.
+static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs.
+static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs.
 
 // Parse a cpumask kernel parameter.  If there are more users later on,
 // this might need to got to a more central location.
@@ -102,8 +102,8 @@ static const struct kernel_param_ops lt_bind_ops = {
 	.get = param_get_cpumask,
 };
 
-module_param_cb(readers_bind, &lt_bind_ops, &readers_bind, 0644);
-module_param_cb(writers_bind, &lt_bind_ops, &writers_bind, 0644);
+module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
+module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
 
 long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
 
@@ -1039,18 +1039,18 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
 				const char *tag)
 {
 	static cpumask_t cpumask_all;
-	cpumask_t *rcmp = cpumask_nonempty(readers_bind) ? readers_bind : &cpumask_all;
-	cpumask_t *wcmp = cpumask_nonempty(writers_bind) ? writers_bind : &cpumask_all;
+	cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all;
+	cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? bind_writers : &cpumask_all;
 
 	cpumask_setall(&cpumask_all);
 	pr_alert("%s" TORTURE_FLAG
-		 "--- %s%s: acq_writer_lim=%d call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d readers_bind=%*pbl writers_bind=%*pbl\n",
+		 "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n",
 		 torture_type, tag, cxt.debug_lock ? " [debug]": "",
-		 acq_writer_lim, call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
+		 acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp),
+		 call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress,
 		 cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost,
 		 rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter,
-		 verbose, writer_fifo,
-		 cpumask_pr_args(rcmp), cpumask_pr_args(wcmp));
+		 verbose, writer_fifo);
 }
 
 // If requested, maintain call_rcu() chains to keep a grace period always
@@ -1356,8 +1356,8 @@ static int __init lock_torture_init(void)
 						     writer_fifo ? sched_set_fifo : NULL);
 		if (torture_init_error(firsterr))
 			goto unwind;
-		if (cpumask_nonempty(writers_bind))
-			torture_sched_setaffinity(writer_tasks[i]->pid, writers_bind);
+		if (cpumask_nonempty(bind_writers))
+			torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers);
 
 	create_reader:
 		if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1367,8 +1367,8 @@ static int __init lock_torture_init(void)
 						  reader_tasks[j]);
 		if (torture_init_error(firsterr))
 			goto unwind;
-		if (cpumask_nonempty(readers_bind))
-			torture_sched_setaffinity(reader_tasks[j]->pid, readers_bind);
+		if (cpumask_nonempty(bind_readers))
+			torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers);
 	}
 	if (stat_interval > 0) {
 		firsterr = torture_create_kthread(lock_torture_stats, NULL,
-- 
cgit v1.2.3


From 66bcb1321b104cce9c767087656e16883f17dfde Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Wed, 16 Aug 2023 20:49:12 +0000
Subject: rcutorture: Replace schedule_timeout*() 1-jiffy waits with HZ/20

In the past, spinning on schedule_timeout* with a wait of 1 jiffy has
hung the kernel. See for example d52d3a2bf408 ("torture: Fix hang during
kthread shutdown phase").

This issue recently recurred in torture's stutter code.  The result is
that the function instantly returns and never goes to sleep, preempting
whatever might otherwise make useful forward progress.

To prevent future issues, apply the commit-d52d3a2bf408 fix throughout
rcutorture, moving from a 1-jiffy wait to a 50-millisecond wait.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcutorture.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7e82fb887d09..8136fec0310b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1149,7 +1149,7 @@ static int rcu_torture_boost(void *arg)
 				mutex_unlock(&boost_mutex);
 				break;
 			}
-			schedule_timeout_uninterruptible(1);
+			schedule_timeout_uninterruptible(HZ / 20);
 		}
 
 		/* Go do the stutter. */
@@ -1160,7 +1160,7 @@ checkwait:	if (stutter_wait("rcu_torture_boost"))
 	/* Clean up and exit. */
 	while (!kthread_should_stop()) {
 		torture_shutdown_absorb("rcu_torture_boost");
-		schedule_timeout_uninterruptible(1);
+		schedule_timeout_uninterruptible(HZ / 20);
 	}
 	torture_kthread_stopping("rcu_torture_boost");
 	return 0;
@@ -1183,7 +1183,7 @@ rcu_torture_fqs(void *arg)
 		fqs_resume_time = jiffies + fqs_stutter * HZ;
 		while (time_before(jiffies, fqs_resume_time) &&
 		       !kthread_should_stop()) {
-			schedule_timeout_interruptible(1);
+			schedule_timeout_interruptible(HZ / 20);
 		}
 		fqs_burst_remaining = fqs_duration;
 		while (fqs_burst_remaining > 0 &&
@@ -2899,7 +2899,7 @@ static int rcu_torture_fwd_prog(void *args)
 			WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
 		} else {
 			while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
-				schedule_timeout_interruptible(1);
+				schedule_timeout_interruptible(HZ / 20);
 			oldseq = READ_ONCE(rcu_fwd_seq);
 		}
 		pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
@@ -3200,7 +3200,7 @@ static int rcu_torture_read_exit_child(void *trsp_in)
 	set_user_nice(current, MAX_NICE);
 	// Minimize time between reading and exiting.
 	while (!kthread_should_stop())
-		schedule_timeout_uninterruptible(1);
+		schedule_timeout_uninterruptible(HZ / 20);
 	(void)rcu_torture_one_read(trsp, -1);
 	return 0;
 }
@@ -3248,7 +3248,7 @@ static int rcu_torture_read_exit(void *unused)
 	smp_mb(); // Store before wakeup.
 	wake_up(&read_exit_wq);
 	while (!torture_must_stop())
-		schedule_timeout_uninterruptible(1);
+		schedule_timeout_uninterruptible(HZ / 20);
 	torture_kthread_stopping("rcu_torture_read_exit");
 	return 0;
 }
-- 
cgit v1.2.3


From 771a92b85a388b751bb9473300ea24c53f54387e Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Thu, 24 Aug 2023 16:42:06 +0800
Subject: rcutorture: Traverse possible cpu to set maxcpu in rcu_nocb_toggle()

Currently, the maxcpu is set by traversing online CPUs, however, if the
rcutorture.onoff_holdoff is set zero and onoff_interval is set non-zero,
and the some CPUs with larger cpuid has been offline before setting
maxcpu, for these CPUs, even if they are online again, also cannot
be offload or deoffload.  This can result in rcutorture attempting to
(de-)offload CPUs that have never been online, but the (de-)offload code
handles this.

This commit therefore use for_each_possible_cpu() instead of
for_each_online_cpu() in rcu_nocb_toggle().

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8136fec0310b..b17ad45cd67e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2126,7 +2126,7 @@ static int rcu_nocb_toggle(void *arg)
 	VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
 	while (!rcu_inkernel_boot_has_ended())
 		schedule_timeout_interruptible(HZ / 10);
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		maxcpu = cpu;
 	WARN_ON(maxcpu < 0);
 	if (toggle_interval > ULONG_MAX)
-- 
cgit v1.2.3


From 612f769edd06a6e42f7cd72425488e68ddaeef0a Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Fri, 11 Aug 2023 12:20:44 +0100
Subject: sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
that has an empty pushable_tasks list, which means nothing useful will be
done in the IPI other than queue the work for the next CPU on the rto_mask.

rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
but the conditions for that irq_work to be queued (and for a CPU to be
added to the rto_mask) rely on rq_rt->nr_migratory instead.

nr_migratory is increased whenever an RT task entity is enqueued and it has
nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
rt_rq's current task. This means a rt_rq can have a migratible current, N
non-migratible queued tasks, and be flagged as overloaded / have its CPU
set in the rto_mask, despite having an empty pushable_tasks list.

Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.

Note that the case where the current task is pushed away to make way for a
migration-disabled task remains unchanged: the migration-disabled task has
to be in the pushable_tasks list in the first place, which means it has
nr_cpus_allowed > 1.

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/20230811112044.3302588-1-vschneid@redhat.com
---
 kernel/sched/debug.c |  3 ---
 kernel/sched/rt.c    | 70 ++++++++--------------------------------------------
 kernel/sched/sched.h |  2 --
 3 files changed, 10 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5e34a8cb2c76..c4253bd2dfb0 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
 
 	PU(rt_nr_running);
-#ifdef CONFIG_SMP
-	PU(rt_nr_migratory);
-#endif
 	P(rt_throttled);
 	PN(rt_time);
 	PN(rt_runtime);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3e442fa3f6bc..3b627ab586fb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -143,7 +143,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
 #if defined CONFIG_SMP
 	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
 	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
-	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
 #endif /* CONFIG_SMP */
@@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq)
 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
-	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
-		if (!rt_rq->overloaded) {
-			rt_set_overload(rq_of_rt_rq(rt_rq));
-			rt_rq->overloaded = 1;
-		}
-	} else if (rt_rq->overloaded) {
-		rt_clear_overload(rq_of_rt_rq(rt_rq));
-		rt_rq->overloaded = 0;
-	}
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-	struct task_struct *p;
-
-	if (!rt_entity_is_task(rt_se))
-		return;
-
-	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
-	rt_rq->rt_nr_total++;
-	if (p->nr_cpus_allowed > 1)
-		rt_rq->rt_nr_migratory++;
-
-	update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-	struct task_struct *p;
-
-	if (!rt_entity_is_task(rt_se))
-		return;
-
-	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
-	rt_rq->rt_nr_total--;
-	if (p->nr_cpus_allowed > 1)
-		rt_rq->rt_nr_migratory--;
-
-	update_rt_migration(rt_rq);
-}
-
 static inline int has_pushable_tasks(struct rq *rq)
 {
 	return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +390,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 	/* Update the highest prio pushable task */
 	if (p->prio < rq->rt.highest_prio.next)
 		rq->rt.highest_prio.next = p->prio;
+
+	if (!rq->rt.overloaded) {
+		rt_set_overload(rq);
+		rq->rt.overloaded = 1;
+	}
 }
 
 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +408,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 		rq->rt.highest_prio.next = p->prio;
 	} else {
 		rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+		if (rq->rt.overloaded) {
+			rt_clear_overload(rq);
+			rq->rt.overloaded = 0;
+		}
 	}
 }
 
@@ -464,16 +426,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 }
 
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
 static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
@@ -1281,7 +1233,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
 	inc_rt_prio(rt_rq, prio);
-	inc_rt_migration(rt_se, rt_rq);
 	inc_rt_group(rt_se, rt_rq);
 }
 
@@ -1294,7 +1245,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
 	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
-	dec_rt_migration(rt_se, rt_rq);
 	dec_rt_group(rt_se, rt_rq);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 96f8ab7a0702..41d760df458f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -663,8 +663,6 @@ struct rt_rq {
 	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
-	unsigned int		rt_nr_migratory;
-	unsigned int		rt_nr_total;
 	int			overloaded;
 	struct plist_head	pushable_tasks;
 
-- 
cgit v1.2.3


From f915fcb38553eb9150a918348d932fd292de71dc Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:37 +0200
Subject: bpf: Count stats for kprobe_multi programs

Adding support to gather missed stats for kprobe_multi
programs due to bpf_prog_active protection.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-2-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 868008f56fec..54827d04c9a6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2710,6 +2710,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 	int err;
 
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+		bpf_prog_inc_misses_counter(link->link.prog);
 		err = 0;
 		goto out;
 	}
-- 
cgit v1.2.3


From e2b2cd592adbd303bcc02451d32fedd511000fb0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:38 +0200
Subject: bpf: Add missed value to kprobe_multi link info

Add missed value to kprobe_multi link info to hold the stats of missed
kprobe_multi probe.

The missed counter gets incremented when fprobe fails the recursion
check or there's no rethook available for return probe. In either
case the attached bpf program is not executed.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-3-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       | 1 +
 kernel/trace/bpf_trace.c       | 1 +
 tools/include/uapi/linux/bpf.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5f13db15a3c7..1da5b1bcce71 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6532,6 +6532,7 @@ struct bpf_link_info {
 			__aligned_u64 addrs;
 			__u32 count; /* in/out: kprobe_multi function count */
 			__u32 flags;
+			__u64 missed;
 		} kprobe_multi;
 		struct {
 			__u32 type; /* enum bpf_perf_event_type */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 54827d04c9a6..6aec6e7d612a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2614,6 +2614,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
 	info->kprobe_multi.count = kmulti_link->cnt;
 	info->kprobe_multi.flags = kmulti_link->flags;
+	info->kprobe_multi.missed = kmulti_link->fp.nmissed;
 
 	if (!uaddrs)
 		return 0;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5f13db15a3c7..1da5b1bcce71 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6532,6 +6532,7 @@ struct bpf_link_info {
 			__aligned_u64 addrs;
 			__u32 count; /* in/out: kprobe_multi function count */
 			__u32 flags;
+			__u64 missed;
 		} kprobe_multi;
 		struct {
 			__u32 type; /* enum bpf_perf_event_type */
-- 
cgit v1.2.3


From 3acf8ace68230e9558cf916847f1cc9f208abdf1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:39 +0200
Subject: bpf: Add missed value to kprobe perf link info

Add missed value to kprobe attached through perf link info to
hold the stats of missed kprobe handler execution.

The kprobe's missed counter gets incremented when kprobe handler
is not executed due to another kprobe running on the same cpu.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-4-jolsa@kernel.org
---
 include/linux/trace_events.h   |  6 ++++--
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 14 ++++++++------
 kernel/trace/bpf_trace.c       |  5 +++--
 kernel/trace/trace_kprobe.c    | 14 +++++++++++---
 tools/include/uapi/linux/bpf.h |  1 +
 6 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..5eb88a66eb68 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -761,7 +761,8 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr);
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed);
 int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
@@ -801,7 +802,7 @@ static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static inline int bpf_get_perf_event_info(const struct perf_event *event,
 					  u32 *prog_id, u32 *fd_type,
 					  const char **buf, u64 *probe_offset,
-					  u64 *probe_addr)
+					  u64 *probe_addr, unsigned long *missed)
 {
 	return -EOPNOTSUPP;
 }
@@ -877,6 +878,7 @@ extern void perf_kprobe_destroy(struct perf_event *event);
 extern int bpf_get_kprobe_info(const struct perf_event *event,
 			       u32 *fd_type, const char **symbol,
 			       u64 *probe_offset, u64 *probe_addr,
+			       unsigned long *missed,
 			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1da5b1bcce71..70bfa997e896 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6548,6 +6548,7 @@ struct bpf_link_info {
 					__u32 name_len;
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
+					__u64 missed;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85c1d908f70f..6b5280f14a53 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3374,7 +3374,7 @@ static void bpf_perf_link_dealloc(struct bpf_link *link)
 static int bpf_perf_link_fill_common(const struct perf_event *event,
 				     char __user *uname, u32 ulen,
 				     u64 *probe_offset, u64 *probe_addr,
-				     u32 *fd_type)
+				     u32 *fd_type, unsigned long *missed)
 {
 	const char *buf;
 	u32 prog_id;
@@ -3385,7 +3385,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
 		return -EINVAL;
 
 	err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf,
-				      probe_offset, probe_addr);
+				      probe_offset, probe_addr, missed);
 	if (err)
 		return err;
 	if (!uname)
@@ -3408,6 +3408,7 @@ static int bpf_perf_link_fill_common(const struct perf_event *event,
 static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 				     struct bpf_link_info *info)
 {
+	unsigned long missed;
 	char __user *uname;
 	u64 addr, offset;
 	u32 ulen, type;
@@ -3416,7 +3417,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
 	ulen = info->perf_event.kprobe.name_len;
 	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
-					&type);
+					&type, &missed);
 	if (err)
 		return err;
 	if (type == BPF_FD_TYPE_KRETPROBE)
@@ -3425,6 +3426,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
 		info->perf_event.type = BPF_PERF_EVENT_KPROBE;
 
 	info->perf_event.kprobe.offset = offset;
+	info->perf_event.kprobe.missed = missed;
 	if (!kallsyms_show_value(current_cred()))
 		addr = 0;
 	info->perf_event.kprobe.addr = addr;
@@ -3444,7 +3446,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
 	ulen = info->perf_event.uprobe.name_len;
 	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
-					&type);
+					&type, NULL);
 	if (err)
 		return err;
 
@@ -3480,7 +3482,7 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
 	uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
 	ulen = info->perf_event.tracepoint.name_len;
 	info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
-	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL);
+	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
 }
 
 static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
@@ -4813,7 +4815,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 
 		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
 					      &buf, &probe_offset,
-					      &probe_addr);
+					      &probe_addr, NULL);
 		if (!err)
 			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
 						     fd_type, buf,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6aec6e7d612a..f6a7d2524949 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2384,7 +2384,8 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 			    u32 *fd_type, const char **buf,
-			    u64 *probe_offset, u64 *probe_addr)
+			    u64 *probe_offset, u64 *probe_addr,
+			    unsigned long *missed)
 {
 	bool is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
@@ -2419,7 +2420,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
 #ifdef CONFIG_KPROBE_EVENTS
 		if (flags & TRACE_EVENT_FL_KPROBE)
 			err = bpf_get_kprobe_info(event, fd_type, buf,
-						  probe_offset, probe_addr,
+						  probe_offset, probe_addr, missed,
 						  event->attr.type == PERF_TYPE_TRACEPOINT);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3d7a180a8427..961a78ffd6d2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1189,6 +1189,12 @@ static const struct file_operations kprobe_events_ops = {
 	.write		= probes_write,
 };
 
+static unsigned long trace_kprobe_missed(struct trace_kprobe *tk)
+{
+	return trace_kprobe_is_return(tk) ?
+		tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+}
+
 /* Probes profiling interfaces */
 static int probes_profile_seq_show(struct seq_file *m, void *v)
 {
@@ -1200,8 +1206,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 		return 0;
 
 	tk = to_trace_kprobe(ev);
-	nmissed = trace_kprobe_is_return(tk) ?
-		tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
+	nmissed = trace_kprobe_missed(tk);
 	seq_printf(m, "  %-44s %15lu %15lu\n",
 		   trace_probe_name(&tk->tp),
 		   trace_kprobe_nhit(tk),
@@ -1547,7 +1552,8 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
 
 int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
 			const char **symbol, u64 *probe_offset,
-			u64 *probe_addr, bool perf_type_tracepoint)
+			u64 *probe_addr, unsigned long *missed,
+			bool perf_type_tracepoint)
 {
 	const char *pevent = trace_event_name(event->tp_event);
 	const char *group = event->tp_event->class->system;
@@ -1566,6 +1572,8 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
 	*probe_addr = kallsyms_show_value(current_cred()) ?
 		      (unsigned long)tk->rp.kp.addr : 0;
 	*symbol = tk->symbol;
+	if (missed)
+		*missed = trace_kprobe_missed(tk);
 	return 0;
 }
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1da5b1bcce71..70bfa997e896 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6548,6 +6548,7 @@ struct bpf_link_info {
 					__u32 name_len;
 					__u32 offset; /* offset from func_name */
 					__u64 addr;
+					__u64 missed;
 				} kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */
 				struct {
 					__aligned_u64 tp_name;   /* in/out */
-- 
cgit v1.2.3


From dd8657894c11b03c6eb0fd53fe9d7fec2072d18b Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 20 Sep 2023 23:31:40 +0200
Subject: bpf: Count missed stats in trace_call_bpf

Increase misses stats in case bpf array execution is skipped
because of recursion check in trace_call_bpf.

Adding bpf_prog_inc_misses_counters that increase misses
counts for all bpf programs in bpf_prog_array.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Reviewed-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20230920213145.1941596-5-jolsa@kernel.org
---
 include/linux/bpf.h      | 16 ++++++++++++++++
 kernel/trace/bpf_trace.c |  3 +++
 2 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 30063a760b5a..a82efd34b741 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2922,6 +2922,22 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
+static __always_inline void
+bpf_prog_inc_misses_counters(const struct bpf_prog_array *array)
+{
+	const struct bpf_prog_array_item *item;
+	struct bpf_prog *prog;
+
+	if (unlikely(!array))
+		return;
+
+	item = &array->items[0];
+	while ((prog = READ_ONCE(item->prog))) {
+		bpf_prog_inc_misses_counter(prog);
+		item++;
+	}
+}
+
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 void bpf_sk_reuseport_detach(struct sock *sk);
 int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6a7d2524949..df697c74d519 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -117,6 +117,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 		 * and don't send kprobe event into ring-buffer,
 		 * so return zero here
 		 */
+		rcu_read_lock();
+		bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+		rcu_read_unlock();
 		ret = 0;
 		goto out;
 	}
-- 
cgit v1.2.3


From d8d5b7bf6f2105883bbd91bbd4d5b67e4e3dff71 Mon Sep 17 00:00:00 2001
From: Denis Arefev <arefev@swemel.ru>
Date: Mon, 4 Sep 2023 15:21:14 +0300
Subject: srcu: Fix srcu_struct node grpmask overflow on 64-bit systems

The value of a bitwise expression 1 << (cpu - sdp->mynode->grplo)
is subject to overflow due to a failure to cast operands to a larger
data type before performing the bitwise operation.

The maximum result of this subtraction is defined by the RCU_FANOUT_LEAF
Kconfig option, which on 64-bit systems defaults to 16 (resulting in a
maximum shift of 15), but which can be set up as high as 64 (resulting
in a maximum shift of 63).  A value of 31 can result in sign extension,
resulting in 0xffffffff80000000 instead of the desired 0x80000000.
A value of 32 or greater triggers undefined behavior per the C standard.

This bug has not been known to cause issues because almost all kernels
take the default CONFIG_RCU_FANOUT_LEAF=16.  Furthermore, as long as a
given compiler gives a deterministic non-zero result for 1<<N for N>=32,
the code correctly invokes all SRCU callbacks, albeit wasting CPU time
along the way.

This commit therefore substitutes the correct 1UL for the buggy 1.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Signed-off-by: Denis Arefev <arefev@swemel.ru>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: David Laight <David.Laight@aculab.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/srcutree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 833a8f848a90..5602042856b1 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
 				snp->grplo = cpu;
 			snp->grphi = cpu;
 		}
-		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
+		sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
 	}
 	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
 	return true;
@@ -835,7 +835,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
 	int cpu;
 
 	for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
-		if (!(mask & (1 << (cpu - snp->grplo))))
+		if (!(mask & (1UL << (cpu - snp->grplo))))
 			continue;
 		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
 	}
-- 
cgit v1.2.3


From d08970df1980476f27936e24d452550f3e9e92e1 Mon Sep 17 00:00:00 2001
From: Brian Geffon <bgeffon@google.com>
Date: Fri, 22 Sep 2023 12:07:04 -0400
Subject: PM: hibernate: Clean up sync_read handling in snapshot_write_next()

In snapshot_write_next(), sync_read is set and unset in three different
spots unnecessiarly. As a result there is a subtle bug where the first
page after the meta data has been loaded unconditionally sets sync_read
to 0. If this first PFN was actually a highmem page, then the returned
buffer will be the global "buffer," and the page needs to be loaded
synchronously.

That is, I'm not sure we can always assume the following to be safe:

	handle->buffer = get_buffer(&orig_bm, &ca);
	handle->sync_read = 0;

Because get_buffer() can call get_highmem_page_buffer() which can
return 'buffer'.

The easiest way to address this is just set sync_read before
snapshot_write_next() returns if handle->buffer == buffer.

Signed-off-by: Brian Geffon <bgeffon@google.com>
Fixes: 8357376d3df2 ("[PATCH] swsusp: Improve handling of highmem")
Cc: All applicable <stable@vger.kernel.org>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index acd2e49b9ac8..7f0e6870318b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2785,8 +2785,6 @@ next:
 	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages)
 		return 0;
 
-	handle->sync_read = 1;
-
 	if (!handle->cur) {
 		if (!buffer)
 			/* This makes the buffer be freed by swsusp_free() */
@@ -2829,7 +2827,6 @@ next:
 			memory_bm_position_reset(&zero_bm);
 			restore_pblist = NULL;
 			handle->buffer = get_buffer(&orig_bm, &ca);
-			handle->sync_read = 0;
 			if (IS_ERR(handle->buffer))
 				return PTR_ERR(handle->buffer);
 		}
@@ -2839,9 +2836,8 @@ next:
 		handle->buffer = get_buffer(&orig_bm, &ca);
 		if (IS_ERR(handle->buffer))
 			return PTR_ERR(handle->buffer);
-		if (handle->buffer != buffer)
-			handle->sync_read = 0;
 	}
+	handle->sync_read = (handle->buffer == buffer);
 	handle->cur++;
 
 	/* Zero pages were not included in the image, memset it and move on. */
-- 
cgit v1.2.3


From 7bf770f74e850c0b8d21ac143b3b2bc18813a6eb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 25 Sep 2023 09:40:58 +0200
Subject: PM: hibernate: fix the kerneldoc comment for swsusp_check() and
 swsusp_close()

The comments for both swsusp_check() and swsusp_close() don't actually
describe what they are doing.

Just removing the comments would probably better, but as the file is
full of useless kerneldoc comments for non-exported symbols this fits
in better with the style.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 74edbce2320b..a64af0a552f9 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1513,7 +1513,7 @@ end:
 static void *swsusp_holder;
 
 /**
- * swsusp_check - Check for swsusp signature in the resume device
+ * swsusp_check - Open the resume device and check for the swsusp signature.
  * @exclusive: Open the resume device exclusively.
  */
 
@@ -1564,7 +1564,7 @@ put:
 }
 
 /**
- * swsusp_close - close swap device.
+ * swsusp_close - close resume device.
  * @exclusive: Close the resume device which is exclusively opened.
  */
 
-- 
cgit v1.2.3


From 2d5780bbef8dbe6375d481cbea212606a80e4453 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr@tesarici.cz>
Date: Tue, 26 Sep 2023 20:55:56 +0200
Subject: swiotlb: fix the check whether a device has used software IO TLB

When CONFIG_SWIOTLB_DYNAMIC=y, devices which do not use the software IO TLB
can avoid swiotlb lookup. A flag is added by commit 1395706a1490 ("swiotlb:
search the software IO TLB only if the device makes use of it"), the flag
is correctly set, but it is then never checked. Add the actual check here.

Note that this code is an alternative to the default pool check, not an
additional check, because:

1. swiotlb_find_pool() also searches the default pool;
2. if dma_uses_io_tlb is false, the default swiotlb pool is not used.

Tested in a KVM guest against a QEMU RAM-backed SATA disk over virtio and
*not* using software IO TLB, this patch increases IOPS by approx 2% for
4-way parallel I/O.

The write memory barrier in swiotlb_dyn_alloc() is not needed, because a
newly allocated pool must always be observed by swiotlb_find_slots() before
an address from that pool is passed to is_swiotlb_buffer().

Correctness was verified using the following litmus test:

C swiotlb-new-pool

(*
 * Result: Never
 *
 * Check that a newly allocated pool is always visible when the
 *  corresponding swiotlb buffer is visible.
 *)

{
	mem_pools = default;
}

P0(int **mem_pools, int *pool)
{
	/* add_mem_pool() */
	WRITE_ONCE(*pool, 999);
	rcu_assign_pointer(*mem_pools, pool);
}

P1(int **mem_pools, int *flag, int *buf)
{
	/* swiotlb_find_slots() */
	int *r0;
	int r1;

	rcu_read_lock();
	r0 = READ_ONCE(*mem_pools);
	r1 = READ_ONCE(*r0);
	rcu_read_unlock();

	if (r1) {
		WRITE_ONCE(*flag, 1);
		smp_mb();
	}

	/* device driver (presumed) */
	WRITE_ONCE(*buf, r1);
}

P2(int **mem_pools, int *flag, int *buf)
{
	/* device driver (presumed) */
	int r0 = READ_ONCE(*buf);

	/* is_swiotlb_buffer() */
	int r1;
	int *r2;
	int r3;

	smp_rmb();
	r1 = READ_ONCE(*flag);
	if (r1) {
		/* swiotlb_find_pool() */
		rcu_read_lock();
		r2 = READ_ONCE(*mem_pools);
		r3 = READ_ONCE(*r2);
		rcu_read_unlock();
	}
}

exists (2:r0<>0 /\ 2:r3=0) (* Not found. *)

Fixes: 1395706a1490 ("swiotlb: search the software IO TLB only if the device makes use of it")
Reported-by: Jonathan Corbet <corbet@lwn.net>
Closes: https://lore.kernel.org/linux-iommu/87a5uz3ob8.fsf@meer.lwn.net/
Signed-off-by: Petr Tesarik <petr@tesarici.cz>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/swiotlb.h | 23 ++++++++++++++++-------
 kernel/dma/swiotlb.c    | 26 ++++++++++++++++++++------
 2 files changed, 36 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b4536626f8ff..ecde0312dd52 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -172,14 +172,23 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 	if (!mem)
 		return false;
 
-	if (IS_ENABLED(CONFIG_SWIOTLB_DYNAMIC)) {
-		/* Pairs with smp_wmb() in swiotlb_find_slots() and
-		 * swiotlb_dyn_alloc(), which modify the RCU lists.
-		 */
-		smp_rmb();
-		return swiotlb_find_pool(dev, paddr);
-	}
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+	/*
+	 * All SWIOTLB buffer addresses must have been returned by
+	 * swiotlb_tbl_map_single() and passed to a device driver.
+	 * If a SWIOTLB address is checked on another CPU, then it was
+	 * presumably loaded by the device driver from an unspecified private
+	 * data structure. Make sure that this load is ordered before reading
+	 * dev->dma_uses_io_tlb here and mem->pools in swiotlb_find_pool().
+	 *
+	 * This barrier pairs with smp_mb() in swiotlb_find_slots().
+	 */
+	smp_rmb();
+	return READ_ONCE(dev->dma_uses_io_tlb) &&
+		swiotlb_find_pool(dev, paddr);
+#else
 	return paddr >= mem->defpool.start && paddr < mem->defpool.end;
+#endif
 }
 
 static inline bool is_swiotlb_force_bounce(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 85dd94323b98..01637677736f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -728,9 +728,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
 	}
 
 	add_mem_pool(mem, pool);
-
-	/* Pairs with smp_rmb() in is_swiotlb_buffer(). */
-	smp_wmb();
 }
 
 /**
@@ -1151,9 +1148,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags);
 
 found:
-	dev->dma_uses_io_tlb = true;
-	/* Pairs with smp_rmb() in is_swiotlb_buffer() */
-	smp_wmb();
+	WRITE_ONCE(dev->dma_uses_io_tlb, true);
+
+	/*
+	 * The general barrier orders reads and writes against a presumed store
+	 * of the SWIOTLB buffer address by a device driver (to a driver private
+	 * data structure). It serves two purposes.
+	 *
+	 * First, the store to dev->dma_uses_io_tlb must be ordered before the
+	 * presumed store. This guarantees that the returned buffer address
+	 * cannot be passed to another CPU before updating dev->dma_uses_io_tlb.
+	 *
+	 * Second, the load from mem->pools must be ordered before the same
+	 * presumed store. This guarantees that the returned buffer address
+	 * cannot be observed by another CPU before an update of the RCU list
+	 * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy
+	 * atomicity).
+	 *
+	 * See also the comment in is_swiotlb_buffer().
+	 */
+	smp_mb();
 
 	*retpool = pool;
 	return index;
-- 
cgit v1.2.3


From dba428a678c7263afce06b1f765efa0e054278e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 12 Sep 2023 12:44:02 +0200
Subject: tick/nohz: Rename the tick handlers to more self-explanatory names

The current names of the tick handlers don't tell much about what different
between them. Use names that better reflect their role and resolution.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230912104406.312185-2-frederic@kernel.org
---
 kernel/time/tick-sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 87015e9deacc..b66dd0ff1153 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1366,7 +1366,7 @@ void tick_nohz_idle_exit(void)
 /*
  * The nohz low res interrupt handler
  */
-static void tick_nohz_handler(struct clock_event_device *dev)
+static void tick_nohz_lowres_handler(struct clock_event_device *dev)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 	struct pt_regs *regs = get_irq_regs();
@@ -1412,7 +1412,7 @@ static void tick_nohz_switch_to_nohz(void)
 	if (!tick_nohz_enabled)
 		return;
 
-	if (tick_switch_to_oneshot(tick_nohz_handler))
+	if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
 		return;
 
 	/*
@@ -1475,7 +1475,7 @@ void tick_irq_enter(void)
  * We rearm the timer until we get disabled by the idle code.
  * Called with interrupts disabled.
  */
-static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer)
 {
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
@@ -1524,7 +1524,7 @@ void tick_setup_sched_timer(void)
 	 * Emulate tick processing via per-CPU hrtimers:
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
-	ts->sched_timer.function = tick_sched_timer;
+	ts->sched_timer.function = tick_nohz_highres_handler;
 
 	/* Get the next period (per-CPU) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
-- 
cgit v1.2.3


From 822deeed3a6a3fdf0cd899d3b403ecbb12fb6c7a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 12 Sep 2023 12:44:03 +0200
Subject: tick/nohz: Update obsolete comments

Some comments are obsolete enough to assume that IRQ exit restarts the
tick in idle or RCU is turned on at the same time as the tick, among
other details.

Update them and add more.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230912104406.312185-3-frederic@kernel.org
---
 kernel/time/tick-sched.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b66dd0ff1153..95a8d1d118a2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1175,12 +1175,23 @@ void tick_nohz_idle_enter(void)
 }
 
 /**
- * tick_nohz_irq_exit - update next tick event from interrupt exit
+ * tick_nohz_irq_exit - Notify the tick about IRQ exit
  *
- * When an interrupt fires while we are idle and it doesn't cause
- * a reschedule, it may still add, modify or delete a timer, enqueue
- * an RCU callback, etc...
- * So we need to re-calculate and reprogram the next tick event.
+ * A timer may have been added/modified/deleted either by the current IRQ,
+ * or by another place using this IRQ as a notification. This IRQ may have
+ * also updated the RCU callback list. These events may require a
+ * re-evaluation of the next tick. Depending on the context:
+ *
+ * 1) If the CPU is idle and no resched is pending, just proceed with idle
+ *    time accounting. The next tick will be re-evaluated on the next idle
+ *    loop iteration.
+ *
+ * 2) If the CPU is nohz_full:
+ *
+ *    2.1) If there is any tick dependency, restart the tick if stopped.
+ *
+ *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
+ *         stop/update it accordingly.
  */
 void tick_nohz_irq_exit(void)
 {
@@ -1330,11 +1341,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
 }
 
 /**
- * tick_nohz_idle_exit - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - Update the tick upon idle task exit
+ *
+ * When the idle task exits, update the tick depending on the
+ * following situations:
+ *
+ * 1) If the CPU is not in nohz_full mode (most cases), then
+ *    restart the tick.
+ *
+ * 2) If the CPU is in nohz_full mode (corner case):
+ *   2.1) If the tick can be kept stopped (no tick dependencies)
+ *        then re-eavaluate the next tick and try to keep it stopped
+ *        as long as possible.
+ *   2.2) If the tick has dependencies, restart the tick.
  *
- * Restart the idle tick when the CPU is woken up from idle
- * This also exit the RCU extended quiescent state. The CPU
- * can use RCU again after this function is called.
  */
 void tick_nohz_idle_exit(void)
 {
@@ -1364,7 +1384,13 @@ void tick_nohz_idle_exit(void)
 }
 
 /*
- * The nohz low res interrupt handler
+ * In low-resolution mode, the tick handler must be implemented directly
+ * at the clockevent level. hrtimer can't be used instead because its
+ * infrastructure actually relies on the tick itself as a backend in
+ * low-resolution mode (see hrtimer_run_queues()).
+ *
+ * This low-resolution handler still makes use of some hrtimer APIs meanwhile
+ * for commodity with expiration calculation and forwarding.
  */
 static void tick_nohz_lowres_handler(struct clock_event_device *dev)
 {
-- 
cgit v1.2.3


From 4f7f4409af289715f44685f250e380ce2cbffc7e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 12 Sep 2023 12:44:04 +0200
Subject: tick/nohz: Don't shutdown the lowres tick from itself

In lowres dynticks mode, just like in highres dynticks mode, when there
is no tick to program in the future, the tick eventually gets
deactivated either:

  * From the idle loop if in idle mode.
  * From the IRQ exit if in full dynticks mode.

Therefore there is no need to deactivate it from the tick itself. This
just just brings more overhead in the idle tick path for no reason.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Link: https://lore.kernel.org/r/20230912104406.312185-4-frederic@kernel.org
---
 kernel/time/tick-sched.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 95a8d1d118a2..8e9a9dcf60d5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1403,18 +1403,16 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev)
 	tick_sched_do_timer(ts, now);
 	tick_sched_handle(ts, regs);
 
-	if (unlikely(ts->tick_stopped)) {
-		/*
-		 * The clockevent device is not reprogrammed, so change the
-		 * clock event device to ONESHOT_STOPPED to avoid spurious
-		 * interrupts on devices which might not be truly one shot.
-		 */
-		tick_program_event(KTIME_MAX, 1);
-		return;
+	/*
+	 * In dynticks mode, tick reprogram is deferred:
+	 * - to the idle task if in dynticks-idle
+	 * - to IRQ exit if in full-dynticks.
+	 */
+	if (likely(!ts->tick_stopped)) {
+		hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 	}
 
-	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
-	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
 static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
@@ -1519,7 +1517,11 @@ static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer)
 	else
 		ts->next_tick = 0;
 
-	/* No need to reprogram if we are in idle or full dynticks mode */
+	/*
+	 * In dynticks mode, tick reprogram is deferred:
+	 * - to the idle task if in dynticks-idle
+	 * - to IRQ exit if in full-dynticks.
+	 */
 	if (unlikely(ts->tick_stopped))
 		return HRTIMER_NORESTART;
 
-- 
cgit v1.2.3


From fc09027786c900368de98d03d40af058bcb01ad9 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 23 Sep 2023 01:14:08 +0000
Subject: sched/rt: Fix live lock between select_fallback_rq() and RT push

During RCU-boost testing with the TREE03 rcutorture config, I found that
after a few hours, the machine locks up.

On tracing, I found that there is a live lock happening between 2 CPUs.
One CPU has an RT task running, while another CPU is being offlined
which also has an RT task running.  During this offlining, all threads
are migrated. The migration thread is repeatedly scheduled to migrate
actively running tasks on the CPU being offlined. This results in a live
lock because select_fallback_rq() keeps picking the CPU that an RT task
is already running on only to get pushed back to the CPU being offlined.

It is anyway pointless to pick CPUs for pushing tasks to if they are
being offlined only to get migrated away to somewhere else. This could
also add unwanted latency to this task.

Fix these issues by not selecting CPUs in RT if they are not 'active'
for scheduling, using the cpu_active_mask. Other parts in core.c already
use cpu_active_mask to prevent tasks from being put on CPUs going
offline.

With this fix I ran the tests for days and could not reproduce the
hang. Without the patch, I hit it in a few hours.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230923011409.3522762-1-joel@joelfernandes.org
---
 kernel/sched/cpupri.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index a286e726eb4b..42c40cfdf836 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
 
 	if (lowest_mask) {
 		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+		cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
 
 		/*
 		 * We have to ensure that we have at least one bit
-- 
cgit v1.2.3


From 5fe7765997b139e2d922b58359dea181efe618f9 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <vschneid@redhat.com>
Date: Thu, 28 Sep 2023 17:02:51 +0200
Subject: sched/deadline: Make dl_rq->pushable_dl_tasks update drive
 dl_rq->overloaded

dl_rq->dl_nr_migratory is increased whenever a DL entity is enqueued and it has
nr_cpus_allowed > 1. Unlike the pushable_dl_tasks tree, dl_rq->dl_nr_migratory
includes a dl_rq's current task. This means a dl_rq can have a migratable
current, N non-migratable queued tasks, and be flagged as overloaded and have
its CPU set in the dlo_mask, despite having an empty pushable_tasks tree.

Make an dl_rq's overload logic be driven by {enqueue,dequeue}_pushable_dl_task(),
in other words make DL RQs only be flagged as overloaded if they have at
least one runnable-but-not-current migratable task.

 o push_dl_task() is unaffected, as it is a no-op if there are no pushable
   tasks.

 o pull_dl_task() now no longer scans runqueues whose sole migratable task is
   their current one, which it can't do anything about anyway.
   It may also now pull tasks to a DL RQ with dl_nr_running > 1 if only its
   current task is migratable.

Since dl_rq->dl_nr_migratory becomes unused, remove it.

RT had the exact same mechanism (rt_rq->rt_nr_migratory) which was dropped
in favour of relying on rt_rq->pushable_tasks, see:

  612f769edd06 ("sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask")

Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/20230928150251.463109-1-vschneid@redhat.com
---
 kernel/sched/deadline.c | 57 ++++++++++++-------------------------------------
 kernel/sched/debug.c    |  1 -
 kernel/sched/sched.h    |  1 -
 3 files changed, 14 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb1996a674db..d98408a274e5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
 	/* zero means no -deadline tasks */
 	dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
 
-	dl_rq->dl_nr_migratory = 0;
 	dl_rq->overloaded = 0;
 	dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
 #else
@@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq)
 	cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
 }
 
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
-	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
-		if (!dl_rq->overloaded) {
-			dl_set_overload(rq_of_dl_rq(dl_rq));
-			dl_rq->overloaded = 1;
-		}
-	} else if (dl_rq->overloaded) {
-		dl_clear_overload(rq_of_dl_rq(dl_rq));
-		dl_rq->overloaded = 0;
-	}
-}
-
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-	struct task_struct *p = dl_task_of(dl_se);
-
-	if (p->nr_cpus_allowed > 1)
-		dl_rq->dl_nr_migratory++;
-
-	update_dl_migration(dl_rq);
-}
-
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-	struct task_struct *p = dl_task_of(dl_se);
-
-	if (p->nr_cpus_allowed > 1)
-		dl_rq->dl_nr_migratory--;
-
-	update_dl_migration(dl_rq);
-}
-
 #define __node_2_pdl(node) \
 	rb_entry((node), struct task_struct, pushable_dl_tasks)
 
@@ -594,6 +560,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
 	return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
 }
 
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+	return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+}
+
 /*
  * The list of pushable -deadline task is not a plist, like in
  * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
@@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 				 __pushable_less);
 	if (leftmost)
 		rq->dl.earliest_dl.next = p->dl.deadline;
+
+	if (!rq->dl.overloaded) {
+		dl_set_overload(rq);
+		rq->dl.overloaded = 1;
+	}
 }
 
 static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
 		dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
 
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
 
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
-	return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+	if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+		dl_clear_overload(rq);
+		rq->dl.overloaded = 0;
+	}
 }
 
 static int push_dl_task(struct rq *rq);
@@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	add_nr_running(rq_of_dl_rq(dl_rq), 1);
 
 	inc_dl_deadline(dl_rq, deadline);
-	inc_dl_migration(dl_se, dl_rq);
 }
 
 static inline
@@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
 
 	dec_dl_deadline(dl_rq, dl_se->deadline);
-	dec_dl_migration(dl_se, dl_rq);
 }
 
 static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
@@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq)
 	struct rq *later_rq;
 	int ret = 0;
 
-	if (!rq->dl.overloaded)
-		return 0;
-
 	next_task = pick_next_pushable_dl_task(rq);
 	if (!next_task)
 		return 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c4253bd2dfb0..4580a450700e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -745,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 
 	PU(dl_nr_running);
 #ifdef CONFIG_SMP
-	PU(dl_nr_migratory);
 	dl_bw = &cpu_rq(cpu)->rd->dl_bw;
 #else
 	dl_bw = &dl_rq->dl_bw;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 41d760df458f..649eb9ec0657 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -707,7 +707,6 @@ struct dl_rq {
 		u64		next;
 	} earliest_dl;
 
-	unsigned int		dl_nr_migratory;
 	int			overloaded;
 
 	/*
-- 
cgit v1.2.3


From 6b00a40147653c8ea748e8f4396510f252763364 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 17 Sep 2023 00:29:53 +0100
Subject: sched/uclamp: Set max_spare_cap_cpu even if max_spare_cap is 0

When uclamp_max is being used, the util of the task could be higher than
the spare capacity of the CPU, but due to uclamp_max value we force-fit
it there.

The way the condition for checking for max_spare_cap in
find_energy_efficient_cpu() was constructed; it ignored any CPU that has
its spare_cap less than or _equal_ to max_spare_cap. Since we initialize
max_spare_cap to 0; this lead to never setting max_spare_cap_cpu and
hence ending up never performing compute_energy() for this cluster and
missing an opportunity for a better energy efficient placement to honour
uclamp_max setting.

	max_spare_cap = 0;
	cpu_cap = capacity_of(cpu) - cpu_util(p);  // 0 if cpu_util(p) is high

	...

	util_fits_cpu(...);		// will return true if uclamp_max forces it to fit

	...

	// this logic will fail to update max_spare_cap_cpu if cpu_cap is 0
	if (cpu_cap > max_spare_cap) {
		max_spare_cap = cpu_cap;
		max_spare_cap_cpu = cpu;
	}

prev_spare_cap suffers from a similar problem.

Fix the logic by converting the variables into long and treating -1
value as 'not populated' instead of 0 which is a viable and correct
spare capacity value. We need to be careful signed comparison is used
when comparing with cpu_cap in one of the conditions.

Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions")
Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230916232955.2099394-2-qyousef@layalina.io
---
 kernel/sched/fair.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2973173ad850..4ce949bb0213 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7703,11 +7703,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	for (; pd; pd = pd->next) {
 		unsigned long util_min = p_util_min, util_max = p_util_max;
 		unsigned long cpu_cap, cpu_thermal_cap, util;
-		unsigned long cur_delta, max_spare_cap = 0;
+		long prev_spare_cap = -1, max_spare_cap = -1;
 		unsigned long rq_util_min, rq_util_max;
-		unsigned long prev_spare_cap = 0;
+		unsigned long cur_delta, base_energy;
 		int max_spare_cap_cpu = -1;
-		unsigned long base_energy;
 		int fits, max_fits = -1;
 
 		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7770,7 +7769,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 				prev_spare_cap = cpu_cap;
 				prev_fits = fits;
 			} else if ((fits > max_fits) ||
-				   ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+				   ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
 				/*
 				 * Find the CPU with the maximum spare capacity
 				 * among the remaining CPUs in the performance
@@ -7782,7 +7781,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			}
 		}
 
-		if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
+		if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
 			continue;
 
 		eenv_pd_busy_time(&eenv, cpus, p);
@@ -7790,7 +7789,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		base_energy = compute_energy(&eenv, pd, cpus, p, -1);
 
 		/* Evaluate the energy impact of using prev_cpu. */
-		if (prev_spare_cap > 0) {
+		if (prev_spare_cap > -1) {
 			prev_delta = compute_energy(&eenv, pd, cpus, p,
 						    prev_cpu);
 			/* CPU utilization has changed */
-- 
cgit v1.2.3


From 23c9519def98ee0fa97ea5871535e9b136f522fc Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 17 Sep 2023 00:29:54 +0100
Subject: sched/uclamp: Ignore (util == 0) optimization in feec() when
 p_util_max = 0

find_energy_efficient_cpu() bails out early if effective util of the
task is 0 as the delta at this point will be zero and there's nothing
for EAS to do. When uclamp is being used, this could lead to wrong
decisions when uclamp_max is set to 0. In this case the task is capped
to performance point 0, but it is actually running and consuming energy
and we can benefit from EAS energy calculations.

Rework the condition so that it bails out when both util and uclamp_min
are 0.

We can do that without needing to use uclamp_task_util(); remove it.

Fixes: d81304bc6193 ("sched/uclamp: Cater for uclamp in find_energy_efficient_cpu()'s early exit condition")
Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230916232955.2099394-3-qyousef@layalina.io
---
 kernel/sched/fair.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ce949bb0213..284b0abe7f9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4558,22 +4558,6 @@ static inline unsigned long task_util_est(struct task_struct *p)
 	return max(task_util(p), _task_util_est(p));
 }
 
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-					     unsigned long uclamp_min,
-					     unsigned long uclamp_max)
-{
-	return clamp(task_util_est(p), uclamp_min, uclamp_max);
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-					     unsigned long uclamp_min,
-					     unsigned long uclamp_max)
-{
-	return task_util_est(p);
-}
-#endif
-
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 				    struct task_struct *p)
 {
@@ -7695,7 +7679,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	target = prev_cpu;
 
 	sync_entity_load_avg(&p->se);
-	if (!uclamp_task_util(p, p_util_min, p_util_max))
+	if (!task_util_est(p) && p_util_min == 0)
 		goto unlock;
 
 	eenv_task_busy_time(&eenv, p, prev_cpu);
-- 
cgit v1.2.3


From 15874a3d27e6405e9d17595f83bd3ca1b6cab16d Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 17 Sep 2023 00:29:55 +0100
Subject: sched/debug: Add new tracepoint to track compute energy computation

It was useful to track feec() placement decision and debug the spare
capacity and optimization issues vs uclamp_max.

Signed-off-by: Qais Yousef (Google) <qyousef@layalina.io>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230916232955.2099394-4-qyousef@layalina.io
---
 include/trace/events/sched.h | 5 +++++
 kernel/sched/core.c          | 1 +
 kernel/sched/fair.c          | 7 ++++++-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..a13d5d06be9d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -735,6 +735,11 @@ DECLARE_TRACE(sched_update_nr_running_tp,
 	TP_PROTO(struct rq *rq, int change),
 	TP_ARGS(rq, change));
 
+DECLARE_TRACE(sched_compute_energy_tp,
+	TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
+		 unsigned long max_util, unsigned long busy_time),
+	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84881a582847..324980e3d2e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -114,6 +114,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 284b0abe7f9b..e2a69af8be36 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7600,11 +7600,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
 {
 	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
 	unsigned long busy_time = eenv->pd_busy_time;
+	unsigned long energy;
 
 	if (dst_cpu >= 0)
 		busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
 
-	return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+	energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+	trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+	return energy;
 }
 
 /*
-- 
cgit v1.2.3


From 3b0781595431acafe3db6596e12deb46975d91dd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 21 Jul 2023 08:41:27 -0600
Subject: futex: move FUTEX2_VALID_MASK to futex.h

We need this for validating the futex2 flags outside of the normal
futex syscalls.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/futex/futex.h    | 2 ++
 kernel/futex/syscalls.c | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index a06030a1a27b..a173a9d501e1 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -52,6 +52,8 @@ static inline unsigned int futex_to_flags(unsigned int op)
 	return flags;
 }
 
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
+
 /* FUTEX2_ to FLAGS_ */
 static inline unsigned int futex2_to_flags(unsigned int flags2)
 {
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 8200d86d30e1..2b5cafdfdc50 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -179,8 +179,6 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
-#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
-
 /**
  * futex_parse_waitv - Parse a waitv array from userspace
  * @futexv:	Kernel side list of waiters to be filled
-- 
cgit v1.2.3


From 12a4be50aff30ee8f2c6a64020c82a4e997e8d6c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Jun 2023 11:56:06 -0600
Subject: futex: factor out the futex wake handling

In preparation for having another waker that isn't futex_wake_mark(),
add a wake handler in futex_q. No extra data is associated with the
handler outside of struct futex_q itself. futex_wake_mark() is defined as
the standard wakeup helper, now set through futex_q_init like other
defaults.

Normal sync futex waiting relies on wake_q holding tasks that should
be woken up. This is what futex_wake_mark() does, it'll unqueue the
futex and add the associated task to the wake queue. For async usage of
futex waiting, rather than having tasks sleeping on the futex, we'll
need to deal with a futex wake differently. For the planned io_uring
case, that means posting a completion event for the task in question.
Having a definable wake handler can help support that use case.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/futex/futex.h    | 5 +++++
 kernel/futex/requeue.c  | 3 ++-
 kernel/futex/waitwake.c | 6 +++---
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index a173a9d501e1..547f509b2c87 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -139,11 +139,15 @@ struct futex_pi_state {
 	union futex_key key;
 } __randomize_layout;
 
+struct futex_q;
+typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
+
 /**
  * struct futex_q - The hashed futex queue entry, one per waiting task
  * @list:		priority-sorted list of tasks waiting on this futex
  * @task:		the task waiting on the futex
  * @lock_ptr:		the hash bucket lock
+ * @wake:		the wake handler for this queue
  * @key:		the key the futex is hashed on
  * @pi_state:		optional priority inheritance state
  * @rt_waiter:		rt_waiter storage for use with requeue_pi
@@ -168,6 +172,7 @@ struct futex_q {
 
 	struct task_struct *task;
 	spinlock_t *lock_ptr;
+	futex_wake_fn *wake;
 	union futex_key key;
 	struct futex_pi_state *pi_state;
 	struct rt_mutex_waiter *rt_waiter;
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index a0a79954f506..9dc789399a1a 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -58,6 +58,7 @@ enum {
 
 const struct futex_q futex_q_init = {
 	/* list gets initialized in futex_queue()*/
+	.wake		= futex_wake_mark,
 	.key		= FUTEX_KEY_INIT,
 	.bitset		= FUTEX_BITSET_MATCH_ANY,
 	.requeue_state	= ATOMIC_INIT(Q_REQUEUE_PI_NONE),
@@ -593,7 +594,7 @@ retry_private:
 		/* Plain futexes just wake or requeue and are done */
 		if (!requeue_pi) {
 			if (++task_count <= nr_wake)
-				futex_wake_mark(&wake_q, this);
+				this->wake(&wake_q, this);
 			else
 				requeue_futex(this, hb1, hb2, &key2);
 			continue;
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 37860f794bf7..35c6a637a4bb 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -177,7 +177,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 			if (!(this->bitset & bitset))
 				continue;
 
-			futex_wake_mark(&wake_q, this);
+			this->wake(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
@@ -292,7 +292,7 @@ retry_private:
 				ret = -EINVAL;
 				goto out_unlock;
 			}
-			futex_wake_mark(&wake_q, this);
+			this->wake(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
@@ -306,7 +306,7 @@ retry_private:
 					ret = -EINVAL;
 					goto out_unlock;
 				}
-				futex_wake_mark(&wake_q, this);
+				this->wake(&wake_q, this);
 				if (++op_ret >= nr_wake2)
 					break;
 			}
-- 
cgit v1.2.3


From e52c43403c9b839a30a9cfc4b75109581389d764 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 12 Jul 2023 09:14:52 -0600
Subject: futex: abstract out a __futex_wake_mark() helper

Move the unqueue and lock_ptr clear into a helper that futex_wake_mark()
calls. Add it to the public functions as well, in preparation for using
it outside the core futex code.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/futex/futex.h    |  1 +
 kernel/futex/waitwake.c | 33 ++++++++++++++++++++++-----------
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 547f509b2c87..33835b81e0c3 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -219,6 +219,7 @@ extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			    struct futex_q *q, struct futex_hash_bucket **hb);
 extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
 				   struct hrtimer_sleeper *timeout);
+extern bool __futex_wake_mark(struct futex_q *q);
 extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
 
 extern int fault_in_user_writeable(u32 __user *uaddr);
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 35c6a637a4bb..6fcf5f723719 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -106,20 +106,11 @@
  * double_lock_hb() and double_unlock_hb(), respectively.
  */
 
-/*
- * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed. Callers
- * must ensure to later call wake_up_q() for the actual
- * wakeups to occur.
- */
-void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+bool __futex_wake_mark(struct futex_q *q)
 {
-	struct task_struct *p = q->task;
-
 	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
-		return;
+		return false;
 
-	get_task_struct(p);
 	__futex_unqueue(q);
 	/*
 	 * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -130,6 +121,26 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
 	 */
 	smp_store_release(&q->lock_ptr, NULL);
 
+	return true;
+}
+
+/*
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
+ */
+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
+{
+	struct task_struct *p = q->task;
+
+	get_task_struct(p);
+
+	if (!__futex_wake_mark(q)) {
+		put_task_struct(p);
+		return;
+	}
+
 	/*
 	 * Queue the task for later wakeup for after we've released
 	 * the hb->lock.
-- 
cgit v1.2.3


From 8af1692616d993c93a080865a7f19506733aa462 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 13 Jun 2023 15:44:42 -0600
Subject: futex: add wake_data to struct futex_q

With handling multiple futex_q for waitv, we cannot easily go from the
futex_q to data related to that request or queue. Add a wake_data
argument that belongs to the wake handler assigned.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/futex/futex.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 33835b81e0c3..76f6c2e0f539 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -148,6 +148,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);
  * @task:		the task waiting on the futex
  * @lock_ptr:		the hash bucket lock
  * @wake:		the wake handler for this queue
+ * @wake_data:		data associated with the wake handler
  * @key:		the key the futex is hashed on
  * @pi_state:		optional priority inheritance state
  * @rt_waiter:		rt_waiter storage for use with requeue_pi
@@ -173,6 +174,7 @@ struct futex_q {
 	struct task_struct *task;
 	spinlock_t *lock_ptr;
 	futex_wake_fn *wake;
+	void *wake_data;
 	union futex_key key;
 	struct futex_pi_state *pi_state;
 	struct rt_mutex_waiter *rt_waiter;
-- 
cgit v1.2.3


From 5177c0cb306a8628bafbf1e6b7aa7e1b7436b8dc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 13 Jun 2023 08:31:58 -0600
Subject: futex: make futex_parse_waitv() available as a helper

To make it more generically useful, augment it with allowing the caller
to pass in the wake handler and wake data. Convert the futex_waitv()
syscall, passing in the default handlers.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/futex/futex.h    |  5 +++++
 kernel/futex/syscalls.c | 16 +++++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 76f6c2e0f539..6b6a6b3da103 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -361,6 +361,11 @@ struct futex_vector {
 	struct futex_q q;
 };
 
+extern int futex_parse_waitv(struct futex_vector *futexv,
+			     struct futex_waitv __user *uwaitv,
+			     unsigned int nr_futexes, futex_wake_fn *wake,
+			     void *wake_data);
+
 extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
 			       struct hrtimer_sleeper *to);
 
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 2b5cafdfdc50..4b6da9116aa6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -184,12 +184,15 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
  * @futexv:	Kernel side list of waiters to be filled
  * @uwaitv:     Userspace list to be parsed
  * @nr_futexes: Length of futexv
+ * @wake:	Wake to call when futex is woken
+ * @wake_data:	Data for the wake handler
  *
  * Return: Error code on failure, 0 on success
  */
-static int futex_parse_waitv(struct futex_vector *futexv,
-			     struct futex_waitv __user *uwaitv,
-			     unsigned int nr_futexes)
+int futex_parse_waitv(struct futex_vector *futexv,
+		      struct futex_waitv __user *uwaitv,
+		      unsigned int nr_futexes, futex_wake_fn *wake,
+		      void *wake_data)
 {
 	struct futex_waitv aux;
 	unsigned int i;
@@ -214,6 +217,8 @@ static int futex_parse_waitv(struct futex_vector *futexv,
 		futexv[i].w.val = aux.val;
 		futexv[i].w.uaddr = aux.uaddr;
 		futexv[i].q = futex_q_init;
+		futexv[i].q.wake = wake;
+		futexv[i].q.wake_data = wake_data;
 	}
 
 	return 0;
@@ -306,7 +311,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
 		goto destroy_timer;
 	}
 
-	ret = futex_parse_waitv(futexv, waiters, nr_futexes);
+	ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
+				NULL);
 	if (!ret)
 		ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
 
@@ -421,7 +427,7 @@ SYSCALL_DEFINE4(futex_requeue,
 	if (!waiters)
 		return -EINVAL;
 
-	ret = futex_parse_waitv(futexes, waiters, 2);
+	ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From e9a56c9325ef28d5481712e85dd5d3f8b2a68e88 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 13 Jun 2023 08:34:08 -0600
Subject: futex: make the vectored futex operations available

Rename unqueue_multiple() as futex_unqueue_multiple(), and make both
that and futex_wait_multiple_setup() available for external users. This
is in preparation for wiring up vectored waits in io_uring.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/futex/futex.h    |  5 +++++
 kernel/futex/waitwake.c | 10 +++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 6b6a6b3da103..8b195d06f4e8 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -366,6 +366,11 @@ extern int futex_parse_waitv(struct futex_vector *futexv,
 			     unsigned int nr_futexes, futex_wake_fn *wake,
 			     void *wake_data);
 
+extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
+				     int *woken);
+
+extern int futex_unqueue_multiple(struct futex_vector *v, int count);
+
 extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
 			       struct hrtimer_sleeper *to);
 
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 6fcf5f723719..61b112897a84 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -372,7 +372,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
 }
 
 /**
- * unqueue_multiple - Remove various futexes from their hash bucket
+ * futex_unqueue_multiple - Remove various futexes from their hash bucket
  * @v:	   The list of futexes to unqueue
  * @count: Number of futexes in the list
  *
@@ -382,7 +382,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
  *  - >=0 - Index of the last futex that was awoken;
  *  - -1  - No futex was awoken
  */
-static int unqueue_multiple(struct futex_vector *v, int count)
+int futex_unqueue_multiple(struct futex_vector *v, int count)
 {
 	int ret = -1, i;
 
@@ -410,7 +410,7 @@ static int unqueue_multiple(struct futex_vector *v, int count)
  *  -  0 - Success
  *  - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
  */
-static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
+int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
 {
 	struct futex_hash_bucket *hb;
 	bool retry = false;
@@ -472,7 +472,7 @@ retry:
 		 * was woken, we don't return error and return this index to
 		 * userspace
 		 */
-		*woken = unqueue_multiple(vs, i);
+		*woken = futex_unqueue_multiple(vs, i);
 		if (*woken >= 0)
 			return 1;
 
@@ -557,7 +557,7 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
 
 		__set_current_state(TASK_RUNNING);
 
-		ret = unqueue_multiple(vs, count);
+		ret = futex_unqueue_multiple(vs, count);
 		if (ret >= 0)
 			return ret;
 
-- 
cgit v1.2.3


From d77008421afda6208b1256c9b218457acd174ca6 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Thu, 17 Aug 2023 21:14:57 -0700
Subject: groups: Convert group_info.usage to refcount_t

atomic_t variables are currently used to implement reference counters
with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows and
underflows. This is important since overflows and underflows can lead
to use-after-free situation and be exploitable.

The variable group_info.usage is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

**Important note for maintainers:

Some functions from refcount_t API defined in refcount.h have different
memory ordering guarantees than their atomic counterparts. Please check
Documentation/core-api/refcount-vs-atomic.rst for more information.

Normally the differences should not matter since refcount_t provides
enough guarantees to satisfy the refcounting use cases, but in some
rare cases it might matter. Please double check that you don't have
some undocumented memory guarantees for this variable usage.

For the group_info.usage it might make a difference in following places:
 - put_group_info(): decrement in refcount_dec_and_test() only
   provides RELEASE ordering and ACQUIRE ordering on success vs. fully
   ordered atomic counterpart

Suggested-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Link: https://lore.kernel.org/r/20230818041456.gonna.009-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/cred.h | 7 ++++---
 kernel/cred.c        | 2 +-
 kernel/groups.c      | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..92f8d772da6f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/key.h>
 #include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/uidgid.h>
 #include <linux/sched.h>
 #include <linux/sched/user.h>
@@ -23,7 +24,7 @@ struct inode;
  * COW Supplementary groups list
  */
 struct group_info {
-	atomic_t	usage;
+	refcount_t	usage;
 	int		ngroups;
 	kgid_t		gid[];
 } __randomize_layout;
@@ -39,7 +40,7 @@ struct group_info {
  */
 static inline struct group_info *get_group_info(struct group_info *gi)
 {
-	atomic_inc(&gi->usage);
+	refcount_inc(&gi->usage);
 	return gi;
 }
 
@@ -49,7 +50,7 @@ static inline struct group_info *get_group_info(struct group_info *gi)
  */
 #define put_group_info(group_info)			\
 do {							\
-	if (atomic_dec_and_test(&(group_info)->usage))	\
+	if (refcount_dec_and_test(&(group_info)->usage))	\
 		groups_free(group_info);		\
 } while (0)
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..4dc0b27b5462 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -36,7 +36,7 @@ do {									\
 static struct kmem_cache *cred_jar;
 
 /* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
 
 /*
  * The initial credentials for the initial task
diff --git a/kernel/groups.c b/kernel/groups.c
index 9aaed2a31073..9b43da22647d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize)
 	if (!gi)
 		return NULL;
 
-	atomic_set(&gi->usage, 1);
+	refcount_set(&gi->usage, 1);
 	gi->ngroups = gidsetsize;
 	return gi;
 }
-- 
cgit v1.2.3


From 6c774377359923e4bb46c6f26381edd9189389ed Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 28 Sep 2023 11:07:01 +0200
Subject: tick/nohz: Update comments some more

Inspired by recent enhancements to comments in kernel/time/tick-sched.c,
go through the entire file and fix/unify its comments:

 - Fix over a dozen typos, spelling mistakes & cases of bad grammar.

 - Re-phrase sentences that I needed to read three times to understand.

    [ I used the following arbitrary rule-of-thumb:
       - if I had to read a comment twice, it was usually my fault,
       - if I had to read it a third time, it was the comment's fault. ]

 - Comma updates:

    - Add commas where needed

    - Remove commas where not needed

    - In cases where a comma is optional, choose one variant and try to
      standardize it over similar sentences in the file.

 - Standardize on standalone 'NOHZ' spelling in free-flowing comments:

      s/nohz/NOHZ
      s/no idle tick/NOHZ

   Still keep 'dynticks' as a popular synonym.

 - Standardize on referring to variable names within free-flowing
   comments with the "'var'" nomenclature, and function names as
   "function_name()".

 - Standardize on '64-bit' and '32-bit':
     s/32bit/32-bit
     s/64bit/64-bit

 - Standardize on 'IRQ work':
     s/irq work/IRQ work

 - A few other tidyups I probably missed to list.

No change in functionality intended - other than one small change to
a syslog output string.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/ZRVCNeMcSQcXS36N@gmail.com
---
 kernel/time/tick-sched.c | 150 +++++++++++++++++++++++------------------------
 1 file changed, 74 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8e9a9dcf60d5..be77b021e5d6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -4,7 +4,7 @@
  *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
  *
- *  No idle tick implementation for low and high resolution timers
+ *  NOHZ implementation for low and high resolution timers
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  */
@@ -45,7 +45,7 @@ struct tick_sched *tick_get_tick_sched(int cpu)
 
 #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
 /*
- * The time, when the last jiffy update happened. Write access must hold
+ * The time when the last jiffy update happened. Write access must hold
  * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
  * consistent view of jiffies and last_jiffies_update.
  */
@@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now)
 	ktime_t delta, nextp;
 
 	/*
-	 * 64bit can do a quick check without holding jiffies lock and
+	 * 64-bit can do a quick check without holding the jiffies lock and
 	 * without looking at the sequence count. The smp_load_acquire()
 	 * pairs with the update done later in this function.
 	 *
-	 * 32bit cannot do that because the store of tick_next_period
-	 * consists of two 32bit stores and the first store could move it
-	 * to a random point in the future.
+	 * 32-bit cannot do that because the store of 'tick_next_period'
+	 * consists of two 32-bit stores, and the first store could be
+	 * moved by the CPU to a random point in the future.
 	 */
 	if (IS_ENABLED(CONFIG_64BIT)) {
 		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
@@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 		unsigned int seq;
 
 		/*
-		 * Avoid contention on jiffies_lock and protect the quick
+		 * Avoid contention on 'jiffies_lock' and protect the quick
 		 * check with the sequence count.
 		 */
 		do {
@@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 	/* Quick check failed, i.e. update is required. */
 	raw_spin_lock(&jiffies_lock);
 	/*
-	 * Reevaluate with the lock held. Another CPU might have done the
+	 * Re-evaluate with the lock held. Another CPU might have done the
 	 * update already.
 	 */
 	if (ktime_before(now, tick_next_period)) {
@@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now)
 						   TICK_NSEC);
 	}
 
-	/* Advance jiffies to complete the jiffies_seq protected job */
+	/* Advance jiffies to complete the 'jiffies_seq' protected job */
 	jiffies_64 += ticks;
 
-	/*
-	 * Keep the tick_next_period variable up to date.
-	 */
+	/* Keep the tick_next_period variable up to date */
 	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
 
 	if (IS_ENABLED(CONFIG_64BIT)) {
 		/*
 		 * Pairs with smp_load_acquire() in the lockless quick
-		 * check above and ensures that the update to jiffies_64 is
-		 * not reordered vs. the store to tick_next_period, neither
+		 * check above, and ensures that the update to 'jiffies_64' is
+		 * not reordered vs. the store to 'tick_next_period', neither
 		 * by the compiler nor by the CPU.
 		 */
 		smp_store_release(&tick_next_period, nextp);
 	} else {
 		/*
-		 * A plain store is good enough on 32bit as the quick check
+		 * A plain store is good enough on 32-bit, as the quick check
 		 * above is protected by the sequence count.
 		 */
 		tick_next_period = nextp;
@@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now)
 
 	/*
 	 * Release the sequence count. calc_global_load() below is not
-	 * protected by it, but jiffies_lock needs to be held to prevent
+	 * protected by it, but 'jiffies_lock' needs to be held to prevent
 	 * concurrent invocations.
 	 */
 	write_seqcount_end(&jiffies_seq);
@@ -160,7 +158,8 @@ static ktime_t tick_init_jiffy_update(void)
 
 	raw_spin_lock(&jiffies_lock);
 	write_seqcount_begin(&jiffies_seq);
-	/* Did we start the jiffies update yet ? */
+
+	/* Have we started the jiffies update yet ? */
 	if (last_jiffies_update == 0) {
 		u32 rem;
 
@@ -175,8 +174,10 @@ static ktime_t tick_init_jiffy_update(void)
 		last_jiffies_update = tick_next_period;
 	}
 	period = last_jiffies_update;
+
 	write_seqcount_end(&jiffies_seq);
 	raw_spin_unlock(&jiffies_lock);
+
 	return period;
 }
 
@@ -192,10 +193,10 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	 * concurrency: This happens only when the CPU in charge went
 	 * into a long sleep. If two CPUs happen to assign themselves to
 	 * this duty, then the jiffies update is still serialized by
-	 * jiffies_lock.
+	 * 'jiffies_lock'.
 	 *
 	 * If nohz_full is enabled, this should not happen because the
-	 * tick_do_timer_cpu never relinquishes.
+	 * 'tick_do_timer_cpu' CPU never relinquishes.
 	 */
 	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
 #ifdef CONFIG_NO_HZ_FULL
@@ -205,12 +206,12 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 	}
 #endif
 
-	/* Check, if the jiffies need an update */
+	/* Check if jiffies need an update */
 	if (tick_do_timer_cpu == cpu)
 		tick_do_update_jiffies64(now);
 
 	/*
-	 * If jiffies update stalled for too long (timekeeper in stop_machine()
+	 * If the jiffies update stalled for too long (timekeeper in stop_machine()
 	 * or VMEXIT'ed for several msecs), force an update.
 	 */
 	if (ts->last_tick_jiffies != jiffies) {
@@ -234,10 +235,10 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 	/*
 	 * When we are idle and the tick is stopped, we have to touch
 	 * the watchdog as we might not schedule for a really long
-	 * time. This happens on complete idle SMP systems while
+	 * time. This happens on completely idle SMP systems while
 	 * waiting on the login prompt. We also increment the "start of
 	 * idle" jiffy stamp so the idle accounting adjustment we do
-	 * when we go busy again does not account too much ticks.
+	 * when we go busy again does not account too many ticks.
 	 */
 	if (ts->tick_stopped) {
 		touch_softlockup_watchdog_sched();
@@ -362,7 +363,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk)
 
 	/*
 	 * If the task is not running, run_posix_cpu_timers()
-	 * has nothing to elapse, IPI can then be spared.
+	 * has nothing to elapse, and an IPI can then be optimized out.
 	 *
 	 * activate_task()                      STORE p->tick_dep_mask
 	 *   STORE p->on_rq
@@ -425,7 +426,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
 
 /*
  * Set a global tick dependency. Used by perf events that rely on freq and
- * by unstable clock.
+ * unstable clocks.
  */
 void tick_nohz_dep_set(enum tick_dep_bits bit)
 {
@@ -439,7 +440,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
 
 /*
  * Set per-CPU tick dependency. Used by scheduler and perf events in order to
- * manage events throttling.
+ * manage event-throttling.
  */
 void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 {
@@ -455,7 +456,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 		if (cpu == smp_processor_id()) {
 			tick_nohz_full_kick();
 		} else {
-			/* Remote irq work not NMI-safe */
+			/* Remote IRQ work not NMI-safe */
 			if (!WARN_ON_ONCE(in_nmi()))
 				tick_nohz_full_kick_cpu(cpu);
 		}
@@ -473,7 +474,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
 
 /*
- * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
  * in order to elapse per task timers.
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
@@ -546,7 +547,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
 {
 	/*
-	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+	 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
 	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
 	 * CPUs. It must remain online when nohz full is enabled.
 	 */
@@ -568,12 +569,12 @@ void __init tick_nohz_init(void)
 		return;
 
 	/*
-	 * Full dynticks uses irq work to drive the tick rescheduling on safe
-	 * locking contexts. But then we need irq work to raise its own
-	 * interrupts to avoid circular dependency on the tick
+	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
+	 * locking contexts. But then we need IRQ work to raise its own
+	 * interrupts to avoid circular dependency on the tick.
 	 */
 	if (!arch_irq_work_has_interrupt()) {
-		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
+		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
 		cpumask_clear(tick_nohz_full_mask);
 		tick_nohz_full_running = false;
 		return;
@@ -643,7 +644,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu)
  * In case the sched_tick was stopped on this CPU, we have to check if jiffies
  * must be updated. Otherwise an interrupt handler could use a stale jiffy
  * value. We do this unconditionally on any CPU, as we don't know whether the
- * CPU, which has the update task assigned is in a long sleep.
+ * CPU, which has the update task assigned, is in a long sleep.
  */
 static void tick_nohz_update_jiffies(ktime_t now)
 {
@@ -726,7 +727,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
  * counters if NULL.
  *
  * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds. Note this is partially broken due to
+ * CPU, in microseconds. Note that this is partially broken due to
  * the counter of iowait tasks that can be remotely updated without
  * any synchronization. Therefore it is possible to observe backward
  * values within two consecutive reads.
@@ -787,7 +788,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 	}
 
 	/*
-	 * Reset to make sure next tick stop doesn't get fooled by past
+	 * Reset to make sure the next tick stop doesn't get fooled by past
 	 * cached clock deadline.
 	 */
 	ts->next_tick = 0;
@@ -816,11 +817,11 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 	/*
 	 * Keep the periodic tick, when RCU, architecture or irq_work
 	 * requests it.
-	 * Aside of that check whether the local timer softirq is
-	 * pending. If so its a bad idea to call get_next_timer_interrupt()
+	 * Aside of that, check whether the local timer softirq is
+	 * pending. If so, its a bad idea to call get_next_timer_interrupt(),
 	 * because there is an already expired timer, so it will request
 	 * immediate expiry, which rearms the hardware timer with a
-	 * minimal delta which brings us back to this place
+	 * minimal delta, which brings us back to this place
 	 * immediately. Lather, rinse and repeat...
 	 */
 	if (rcu_needs_cpu() || arch_needs_cpu() ||
@@ -861,7 +862,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 
 	/*
 	 * If this CPU is the one which had the do_timer() duty last, we limit
-	 * the sleep time to the timekeeping max_deferment value.
+	 * the sleep time to the timekeeping 'max_deferment' value.
 	 * Otherwise we can sleep as long as we want.
 	 */
 	delta = timekeeping_max_deferment();
@@ -895,8 +896,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	 * If this CPU is the one which updates jiffies, then give up
 	 * the assignment and let it be taken by the CPU which runs
 	 * the tick timer next, which might be this CPU as well. If we
-	 * don't drop this here the jiffies might be stale and
-	 * do_timer() never invoked. Keep track of the fact that it
+	 * don't drop this here, the jiffies might be stale and
+	 * do_timer() never gets invoked. Keep track of the fact that it
 	 * was the one which had the do_timer() duty last.
 	 */
 	if (cpu == tick_do_timer_cpu) {
@@ -906,7 +907,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 		ts->do_timer_last = 0;
 	}
 
-	/* Skip reprogram of event if its not changed */
+	/* Skip reprogram of event if it's not changed */
 	if (ts->tick_stopped && (expires == ts->next_tick)) {
 		/* Sanity check: make sure clockevent is actually programmed */
 		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
@@ -919,11 +920,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	}
 
 	/*
-	 * nohz_stop_sched_tick can be called several times before
-	 * the nohz_restart_sched_tick is called. This happens when
+	 * nohz_stop_sched_tick() can be called several times before
+	 * nohz_restart_sched_tick() is called. This happens when
 	 * interrupts arrive which do not cause a reschedule. In the
 	 * first call we save the current tick time, so we can restart
-	 * the scheduler tick in nohz_restart_sched_tick.
+	 * the scheduler tick in nohz_restart_sched_tick().
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
@@ -985,9 +986,8 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 
 	calc_load_nohz_stop();
 	touch_softlockup_watchdog_sched();
-	/*
-	 * Cancel the scheduled timer and restore the tick
-	 */
+
+	/* Cancel the scheduled timer and restore the tick: */
 	ts->tick_stopped  = 0;
 	tick_nohz_restart(ts, now);
 }
@@ -1019,11 +1019,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 /*
  * A pending softirq outside an IRQ (or softirq disabled section) context
  * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
- * reach here due to the need_resched() early check in can_stop_idle_tick().
+ * reach this code due to the need_resched() early check in can_stop_idle_tick().
  *
  * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
  * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
- * triggering the below since wakep_softirqd() is ignored.
+ * triggering the code below, since wakep_softirqd() is ignored.
  *
  */
 static bool report_idle_softirq(void)
@@ -1044,7 +1044,7 @@ static bool report_idle_softirq(void)
 	if (ratelimit >= 10)
 		return false;
 
-	/* On RT, softirqs handling may be waiting on some lock */
+	/* On RT, softirq handling may be waiting on some lock */
 	if (local_bh_blocked())
 		return false;
 
@@ -1061,8 +1061,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	 * If this CPU is offline and it is the one which updates
 	 * jiffies, then give up the assignment and let it be taken by
 	 * the CPU which runs the tick timer next. If we don't drop
-	 * this here the jiffies might be stale and do_timer() never
-	 * invoked.
+	 * this here, the jiffies might be stale and do_timer() never
+	 * gets invoked.
 	 */
 	if (unlikely(!cpu_online(cpu))) {
 		if (cpu == tick_do_timer_cpu)
@@ -1219,7 +1219,7 @@ bool tick_nohz_idle_got_tick(void)
 
 /**
  * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
- * or the tick, whatever that expires first. Note that, if the tick has been
+ * or the tick, whichever expires first. Note that, if the tick has been
  * stopped, it returns the next hrtimer.
  *
  * Called from power state control code with interrupts disabled
@@ -1263,7 +1263,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 		return *delta_next;
 
 	/*
-	 * If the next highres timer to expire is earlier than next_event, the
+	 * If the next highres timer to expire is earlier than 'next_event', the
 	 * idle governor needs to know that.
 	 */
 	next_event = min_t(u64, next_event,
@@ -1307,9 +1307,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts,
 	if (vtime_accounting_enabled_this_cpu())
 		return;
 	/*
-	 * We stopped the tick in idle. Update process times would miss the
-	 * time we slept as update_process_times does only a 1 tick
-	 * accounting. Enforce that this is accounted to idle !
+	 * We stopped the tick in idle. update_process_times() would miss the
+	 * time we slept, as it does only a 1 tick accounting.
+	 * Enforce that this is accounted to idle !
 	 */
 	ticks = jiffies - ts->idle_jiffies;
 	/*
@@ -1351,7 +1351,7 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
  *
  * 2) If the CPU is in nohz_full mode (corner case):
  *   2.1) If the tick can be kept stopped (no tick dependencies)
- *        then re-eavaluate the next tick and try to keep it stopped
+ *        then re-evaluate the next tick and try to keep it stopped
  *        as long as possible.
  *   2.2) If the tick has dependencies, restart the tick.
  *
@@ -1385,12 +1385,12 @@ void tick_nohz_idle_exit(void)
 
 /*
  * In low-resolution mode, the tick handler must be implemented directly
- * at the clockevent level. hrtimer can't be used instead because its
+ * at the clockevent level. hrtimer can't be used instead, because its
  * infrastructure actually relies on the tick itself as a backend in
  * low-resolution mode (see hrtimer_run_queues()).
  *
  * This low-resolution handler still makes use of some hrtimer APIs meanwhile
- * for commodity with expiration calculation and forwarding.
+ * for convenience with expiration calculation and forwarding.
  */
 static void tick_nohz_lowres_handler(struct clock_event_device *dev)
 {
@@ -1426,7 +1426,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
 }
 
 /**
- * tick_nohz_switch_to_nohz - switch to nohz mode
+ * tick_nohz_switch_to_nohz - switch to NOHZ mode
  */
 static void tick_nohz_switch_to_nohz(void)
 {
@@ -1440,8 +1440,8 @@ static void tick_nohz_switch_to_nohz(void)
 		return;
 
 	/*
-	 * Recycle the hrtimer in ts, so we can share the
-	 * hrtimer_forward with the highres code.
+	 * Recycle the hrtimer in 'ts', so we can share the
+	 * hrtimer_forward_now() function with the highres code.
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	/* Get the next period */
@@ -1464,7 +1464,7 @@ static inline void tick_nohz_irq_enter(void)
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
 	/*
-	 * If all CPUs are idle. We may need to update a stale jiffies value.
+	 * If all CPUs are idle we may need to update a stale jiffies value.
 	 * Note nohz_full is a special case: a timekeeper is guaranteed to stay
 	 * alive but it might be busy looping with interrupts disabled in some
 	 * rare case (typically stop machine). So we must make sure we have a
@@ -1483,7 +1483,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /*
- * Called from irq_enter to notify about the possible interruption of idle()
+ * Called from irq_enter() to notify about the possible interruption of idle()
  */
 void tick_irq_enter(void)
 {
@@ -1509,8 +1509,8 @@ static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer)
 	tick_sched_do_timer(ts, now);
 
 	/*
-	 * Do not call, when we are not in irq context and have
-	 * no valid regs pointer
+	 * Do not call when we are not in IRQ context and have
+	 * no valid 'regs' pointer
 	 */
 	if (regs)
 		tick_sched_handle(ts, regs);
@@ -1548,16 +1548,14 @@ void tick_setup_sched_timer(void)
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 	ktime_t now = ktime_get();
 
-	/*
-	 * Emulate tick processing via per-CPU hrtimers:
-	 */
+	/* Emulate tick processing via per-CPU hrtimers: */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	ts->sched_timer.function = tick_nohz_highres_handler;
 
 	/* Get the next period (per-CPU) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
 
-	/* Offset the tick to avert jiffies_lock contention. */
+	/* Offset the tick to avert 'jiffies_lock' contention. */
 	if (sched_skew_tick) {
 		u64 offset = TICK_NSEC >> 1;
 		do_div(offset, num_possible_cpus());
@@ -1607,10 +1605,10 @@ void tick_oneshot_notify(void)
 }
 
 /*
- * Check, if a change happened, which makes oneshot possible.
+ * Check if a change happened, which makes oneshot possible.
  *
- * Called cyclic from the hrtimer softirq (driven by the timer
- * softirq) allow_nohz signals, that we can switch into low-res nohz
+ * Called cyclically from the hrtimer softirq (driven by the timer
+ * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
  * mode, because high resolution timers are disabled (either compile
  * or runtime). Called with interrupts disabled.
  */
-- 
cgit v1.2.3


From f9b0e1088bbf35933e25c839b75094039059b3be Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 29 Sep 2023 22:41:20 +0200
Subject: bpf, mprog: Fix maximum program check on mprog attachment

After Paul's recent improvement to syzkaller to improve coverage for
bpf_mprog and tcx, it hit a splat that the program limit was surpassed.
What happened is that the maximum number of progs got added, followed
by another prog add request which adds with BPF_F_BEFORE flag relative
to the last program in the array. The idx >= bpf_mprog_max() check in
bpf_mprog_attach() still passes because the index is below the maximum
but the maximum will be surpassed. We need to add a check upfront for
insertions to catch this situation.

Fixes: 053c8e1f235d ("bpf: Add generic attach/detach/query API for multi-progs")
Reported-by: syzbot+baa44e3dbbe48e05c1ad@syzkaller.appspotmail.com
Reported-by: syzbot+b97d20ed568ce0951a06@syzkaller.appspotmail.com
Reported-by: syzbot+2558ca3567a77b7af4e3@syzkaller.appspotmail.com
Co-developed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: syzbot+baa44e3dbbe48e05c1ad@syzkaller.appspotmail.com
Tested-by: syzbot+b97d20ed568ce0951a06@syzkaller.appspotmail.com
Link: https://github.com/google/syzkaller/pull/4207
Link: https://lore.kernel.org/bpf/20230929204121.20305-1-daniel@iogearbox.net
---
 kernel/bpf/mprog.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
index 32d2c4829eb8..007d98c799e2 100644
--- a/kernel/bpf/mprog.c
+++ b/kernel/bpf/mprog.c
@@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry,
 			goto out;
 		}
 		idx = tidx;
+	} else if (bpf_mprog_total(entry) == bpf_mprog_max()) {
+		ret = -ERANGE;
+		goto out;
 	}
 	if (flags & BPF_F_BEFORE) {
 		tidx = bpf_mprog_pos_before(entry, &rtuple);
-- 
cgit v1.2.3


From e2a8f20dd8e9df695f736e51cd9115ae55be92d1 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 26 Sep 2023 20:09:05 +0800
Subject: Crash: add lock to serialize crash hotplug handling

Eric reported that handling corresponding crash hotplug event can be
failed easily when many memory hotplug event are notified in a short
period.  They failed because failing to take __kexec_lock.

=======
[   78.714569] Fallback order for Node 0: 0
[   78.714575] Built 1 zonelists, mobility grouping on.  Total pages: 1817886
[   78.717133] Policy zone: Normal
[   78.724423] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[   78.727207] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[   80.056643] PEFILE: Unsigned PE binary
=======

The memory hotplug events are notified very quickly and very many, while
the handling of crash hotplug is much slower relatively.  So the atomic
variable __kexec_lock and kexec_trylock() can't guarantee the
serialization of crash hotplug handling.

Here, add a new mutex lock __crash_hotplug_lock to serialize crash hotplug
handling specifically.  This doesn't impact the usage of __kexec_lock.

Link: https://lkml.kernel.org/r/20230926120905.392903-1-bhe@redhat.com
Fixes: 247262756121 ("crash: add generic infrastructure for crash hotplug support")
Signed-off-by: Baoquan He <bhe@redhat.com>
Tested-by: Eric DeVolder <eric.devolder@oracle.com>
Reviewed-by: Eric DeVolder <eric.devolder@oracle.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 03a7932cde0a..2f675ef045d4 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -739,6 +739,17 @@ subsys_initcall(crash_notes_memory_init);
 #undef pr_fmt
 #define pr_fmt(fmt) "crash hp: " fmt
 
+/*
+ * Different than kexec/kdump loading/unloading/jumping/shrinking which
+ * usually rarely happen, there will be many crash hotplug events notified
+ * during one short period, e.g one memory board is hot added and memory
+ * regions are online. So mutex lock  __crash_hotplug_lock is used to
+ * serialize the crash hotplug handling specifically.
+ */
+DEFINE_MUTEX(__crash_hotplug_lock);
+#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
+#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
+
 /*
  * This routine utilized when the crash_hotplug sysfs node is read.
  * It reflects the kernel's ability/permission to update the crash
@@ -748,9 +759,11 @@ int crash_check_update_elfcorehdr(void)
 {
 	int rc = 0;
 
+	crash_hotplug_lock();
 	/* Obtain lock while reading crash information */
 	if (!kexec_trylock()) {
 		pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+		crash_hotplug_unlock();
 		return 0;
 	}
 	if (kexec_crash_image) {
@@ -761,6 +774,7 @@ int crash_check_update_elfcorehdr(void)
 	}
 	/* Release lock now that update complete */
 	kexec_unlock();
+	crash_hotplug_unlock();
 
 	return rc;
 }
@@ -783,9 +797,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
 {
 	struct kimage *image;
 
+	crash_hotplug_lock();
 	/* Obtain lock while changing crash information */
 	if (!kexec_trylock()) {
 		pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n");
+		crash_hotplug_unlock();
 		return;
 	}
 
@@ -852,6 +868,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
 out:
 	/* Release lock now that update complete */
 	kexec_unlock();
+	crash_hotplug_unlock();
 }
 
 static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
-- 
cgit v1.2.3


From 9077fc228f09c9f975c498c55f5d2e882cd0da59 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Thu, 28 Sep 2023 18:15:58 +0800
Subject: bpf: Use kmalloc_size_roundup() to adjust size_index

Commit d52b59315bf5 ("bpf: Adjust size_index according to the value of
KMALLOC_MIN_SIZE") uses KMALLOC_MIN_SIZE to adjust size_index, but as
reported by Nathan, the adjustment is not enough, because
__kmalloc_minalign() also decides the minimal alignment of slab object
as shown in new_kmalloc_cache() and its value may be greater than
KMALLOC_MIN_SIZE (e.g., 64 bytes vs 8 bytes under a riscv QEMU VM).

Instead of invoking __kmalloc_minalign() in bpf subsystem to find the
maximal alignment, just using kmalloc_size_roundup() directly to get the
corresponding slab object size for each allocation size. If these two
sizes are unmatched, adjust size_index to select a bpf_mem_cache with
unit_size equal to the object_size of the underlying slab cache for the
allocation size.

Fixes: 822fb26bdb55 ("bpf: Add a hint to allocated objects.")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/bpf/20230914181407.GA1000274@dev-arch.thelio-3990X/
Signed-off-by: Hou Tao <houtao1@huawei.com>
Tested-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Link: https://lore.kernel.org/r/20230928101558.2594068-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 44 +++++++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index cf1941516643..d93ddac283d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -965,37 +965,31 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
-/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
 static __init int bpf_mem_cache_adjust_size(void)
 {
-	unsigned int size, index;
+	unsigned int size;
 
-	/* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
-	 * up-to 256-bytes.
+	/* Adjusting the indexes in size_index() according to the object_size
+	 * of underlying slab cache, so bpf_mem_alloc() will select a
+	 * bpf_mem_cache with unit_size equal to the object_size of
+	 * the underlying slab cache.
+	 *
+	 * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
+	 * 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
 	 */
-	size = KMALLOC_MIN_SIZE;
-	if (size <= 192)
-		index = size_index[(size - 1) / 8];
-	else
-		index = fls(size - 1) - 1;
-	for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
-		size_index[(size - 1) / 8] = index;
+	for (size = 192; size >= 8; size -= 8) {
+		unsigned int kmalloc_size, index;
 
-	/* The minimal alignment is 64-bytes, so disable 96-bytes cache and
-	 * use 128-bytes cache instead.
-	 */
-	if (KMALLOC_MIN_SIZE >= 64) {
-		index = size_index[(128 - 1) / 8];
-		for (size = 64 + 8; size <= 96; size += 8)
-			size_index[(size - 1) / 8] = index;
-	}
+		kmalloc_size = kmalloc_size_roundup(size);
+		if (kmalloc_size == size)
+			continue;
 
-	/* The minimal alignment is 128-bytes, so disable 192-bytes cache and
-	 * use 256-bytes cache instead.
-	 */
-	if (KMALLOC_MIN_SIZE >= 128) {
-		index = fls(256 - 1) - 1;
-		for (size = 128 + 8; size <= 192; size += 8)
+		if (kmalloc_size <= 192)
+			index = size_index[(kmalloc_size - 1) / 8];
+		else
+			index = fls(kmalloc_size - 1) - 1;
+		/* Only overwrite if necessary */
+		if (size_index[(size - 1) / 8] != index)
 			size_index[(size - 1) / 8] = index;
 	}
 
-- 
cgit v1.2.3


From 1e0cb399c7653462d9dadf8ab9425337c355d358 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 29 Sep 2023 18:01:13 -0400
Subject: ring-buffer: Update "shortest_full" in polling

It was discovered that the ring buffer polling was incorrectly stating
that read would not block, but that's because polling did not take into
account that reads will block if the "buffer-percent" was set. Instead,
the ring buffer polling would say reads would not block if there was any
data in the ring buffer. This was incorrect behavior from a user space
point of view. This was fixed by commit 42fb0a1e84ff by having the polling
code check if the ring buffer had more data than what the user specified
"buffer percent" had.

The problem now is that the polling code did not register itself to the
writer that it wanted to wait for a specific "full" value of the ring
buffer. The result was that the writer would wake the polling waiter
whenever there was a new event. The polling waiter would then wake up, see
that there's not enough data in the ring buffer to notify user space and
then go back to sleep. The next event would wake it up again.

Before the polling fix was added, the code would wake up around 100 times
for a hackbench 30 benchmark. After the "fix", due to the constant waking
of the writer, it would wake up over 11,0000 times! It would never leave
the kernel, so the user space behavior was still "correct", but this
definitely is not the desired effect.

To fix this, have the polling code add what it's waiting for to the
"shortest_full" variable, to tell the writer not to wake it up if the
buffer is not as full as it expects to be.

Note, after this fix, it appears that the waiter is now woken up around 2x
the times it was before (~200). This is a tremendous improvement from the
11,000 times, but I will need to spend some time to see why polling is
more aggressive in its wakeups than the read blocking code.

Link: https://lore.kernel.org/linux-trace-kernel/20230929180113.01c2cae3@rorschach.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Fixes: 42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark")
Reported-by: Julia Lawall <julia.lawall@inria.fr>
Tested-by: Julia Lawall <julia.lawall@inria.fr>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28daf0ce95c5..515cafdb18d9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1137,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
 	if (full) {
 		poll_wait(filp, &work->full_waiters, poll_table);
 		work->full_waiters_pending = true;
+		if (!cpu_buffer->shortest_full ||
+		    cpu_buffer->shortest_full > full)
+			cpu_buffer->shortest_full = full;
 	} else {
 		poll_wait(filp, &work->waiters, poll_table);
 		work->waiters_pending = true;
-- 
cgit v1.2.3


From 23cce5f25491968b23fb9c399bbfb25f13870cd9 Mon Sep 17 00:00:00 2001
From: Clément Léger <cleger@rivosinc.com>
Date: Fri, 29 Sep 2023 21:16:37 +0200
Subject: tracing: relax trace_event_eval_update() execution with
 cond_resched()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When kernel is compiled without preemption, the eval_map_work_func()
(which calls trace_event_eval_update()) will not be preempted up to its
complete execution. This can actually cause a problem since if another
CPU call stop_machine(), the call will have to wait for the
eval_map_work_func() function to finish executing in the workqueue
before being able to be scheduled. This problem was observe on a SMP
system at boot time, when the CPU calling the initcalls executed
clocksource_done_booting() which in the end calls stop_machine(). We
observed a 1 second delay because one CPU was executing
eval_map_work_func() and was not preempted by the stop_machine() task.

Adding a call to cond_resched() in trace_event_eval_update() allows
other tasks to be executed and thus continue working asynchronously
like before without blocking any pending task at boot time.

Link: https://lore.kernel.org/linux-trace-kernel/20230929191637.416931-1-cleger@rivosinc.com

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Clément Léger <cleger@rivosinc.com>
Tested-by: Atish Patra <atishp@rivosinc.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 91951d038ba4..f49d6ddb6342 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2770,6 +2770,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 				update_event_fields(call, map[i]);
 			}
 		}
+		cond_resched();
 	}
 	up_write(&trace_event_sem);
 }
-- 
cgit v1.2.3


From 2de9ee94054263940122aee8720e902b30c27930 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Mon, 25 Sep 2023 23:08:28 +0000
Subject: tracing/user_events: Align set_bit() address for all archs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All architectures should use a long aligned address passed to set_bit().
User processes can pass either a 32-bit or 64-bit sized value to be
updated when tracing is enabled when on a 64-bit kernel. Both cases are
ensured to be naturally aligned, however, that is not enough. The
address must be long aligned without affecting checks on the value
within the user process which require different adjustments for the bit
for little and big endian CPUs.

Add a compat flag to user_event_enabler that indicates when a 32-bit
value is being used on a 64-bit kernel. Long align addresses and correct
the bit to be used by set_bit() to account for this alignment. Ensure
compat flags are copied during forks and used during deletion clears.

Link: https://lore.kernel.org/linux-trace-kernel/20230925230829.341-2-beaub@linux.microsoft.com
Link: https://lore.kernel.org/linux-trace-kernel/20230914131102.179100-1-cleger@rivosinc.com/

Cc: stable@vger.kernel.org
Fixes: 7235759084a4 ("tracing/user_events: Use remote writes for event enablement")
Reported-by: Clément Léger <cleger@rivosinc.com>
Suggested-by: Clément Léger <cleger@rivosinc.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 58 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 6f046650e527..b87f41187c6a 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -127,8 +127,13 @@ struct user_event_enabler {
 /* Bit 7 is for freeing status of enablement */
 #define ENABLE_VAL_FREEING_BIT 7
 
-/* Only duplicate the bit value */
-#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
+/* Bit 8 is for marking 32-bit on 64-bit */
+#define ENABLE_VAL_32_ON_64_BIT 8
+
+#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT)
+
+/* Only duplicate the bit and compat values */
+#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK)
 
 #define ENABLE_BITOPS(e) (&(e)->values)
 
@@ -174,6 +179,30 @@ struct user_event_validator {
 	int			flags;
 };
 
+static inline void align_addr_bit(unsigned long *addr, int *bit,
+				  unsigned long *flags)
+{
+	if (IS_ALIGNED(*addr, sizeof(long))) {
+#ifdef __BIG_ENDIAN
+		/* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */
+		if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags))
+			*bit += 32;
+#endif
+		return;
+	}
+
+	*addr = ALIGN_DOWN(*addr, sizeof(long));
+
+	/*
+	 * We only support 32 and 64 bit values. The only time we need
+	 * to align is a 32 bit value on a 64 bit kernel, which on LE
+	 * is always 32 bits, and on BE requires no change when unaligned.
+	 */
+#ifdef __LITTLE_ENDIAN
+	*bit += 32;
+#endif
+}
+
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
 				   void *tpdata, bool *faulted);
 
@@ -482,6 +511,7 @@ static int user_event_enabler_write(struct user_event_mm *mm,
 	unsigned long *ptr;
 	struct page *page;
 	void *kaddr;
+	int bit = ENABLE_BIT(enabler);
 	int ret;
 
 	lockdep_assert_held(&event_mutex);
@@ -497,6 +527,8 @@ static int user_event_enabler_write(struct user_event_mm *mm,
 		     test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
 		return -EBUSY;
 
+	align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler));
+
 	ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
 				    &page, NULL);
 
@@ -515,9 +547,9 @@ static int user_event_enabler_write(struct user_event_mm *mm,
 
 	/* Update bit atomically, user tracers must be atomic as well */
 	if (enabler->event && enabler->event->status)
-		set_bit(ENABLE_BIT(enabler), ptr);
+		set_bit(bit, ptr);
 	else
-		clear_bit(ENABLE_BIT(enabler), ptr);
+		clear_bit(bit, ptr);
 
 	kunmap_local(kaddr);
 	unpin_user_pages_dirty_lock(&page, 1, true);
@@ -849,6 +881,12 @@ static struct user_event_enabler
 	enabler->event = user;
 	enabler->addr = uaddr;
 	enabler->values = reg->enable_bit;
+
+#if BITS_PER_LONG >= 64
+	if (reg->enable_size == 4)
+		set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler));
+#endif
+
 retry:
 	/* Prevents state changes from racing with new enablers */
 	mutex_lock(&event_mutex);
@@ -2377,7 +2415,8 @@ static long user_unreg_get(struct user_unreg __user *ureg,
 }
 
 static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
-				   unsigned long uaddr, unsigned char bit)
+				   unsigned long uaddr, unsigned char bit,
+				   unsigned long flags)
 {
 	struct user_event_enabler enabler;
 	int result;
@@ -2385,7 +2424,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
 
 	memset(&enabler, 0, sizeof(enabler));
 	enabler.addr = uaddr;
-	enabler.values = bit;
+	enabler.values = bit | flags;
 retry:
 	/* Prevents state changes from racing with new enablers */
 	mutex_lock(&event_mutex);
@@ -2415,6 +2454,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
 	struct user_event_mm *mm = current->user_event_mm;
 	struct user_event_enabler *enabler, *next;
 	struct user_unreg reg;
+	unsigned long flags;
 	long ret;
 
 	ret = user_unreg_get(ureg, &reg);
@@ -2425,6 +2465,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
 	if (!mm)
 		return -ENOENT;
 
+	flags = 0;
 	ret = -ENOENT;
 
 	/*
@@ -2441,6 +2482,9 @@ static long user_events_ioctl_unreg(unsigned long uarg)
 		    ENABLE_BIT(enabler) == reg.disable_bit) {
 			set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
 
+			/* We must keep compat flags for the clear */
+			flags |= enabler->values & ENABLE_VAL_COMPAT_MASK;
+
 			if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
 				user_event_enabler_destroy(enabler, true);
 
@@ -2454,7 +2498,7 @@ static long user_events_ioctl_unreg(unsigned long uarg)
 	/* Ensure bit is now cleared for user, regardless of event status */
 	if (!ret)
 		ret = user_event_mm_clear_bit(mm, reg.disable_addr,
-					      reg.disable_bit);
+					      reg.disable_bit, flags);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 079be8fc630943d9fc70a97807feb73d169ee3fc Mon Sep 17 00:00:00 2001
From: Cyril Hrubis <chrubis@suse.cz>
Date: Mon, 2 Oct 2023 13:55:51 +0200
Subject: sched/rt: Disallow writing invalid values to sched_rt_period_us

The validation of the value written to sched_rt_period_us was broken
because:

  - the sysclt_sched_rt_period is declared as unsigned int
  - parsed by proc_do_intvec()
  - the range is asserted after the value parsed by proc_do_intvec()

Because of this negative values written to the file were written into a
unsigned integer that were later on interpreted as large positive
integers which did passed the check:

  if (sysclt_sched_rt_period <= 0)
	return EINVAL;

This commit fixes the parsing by setting explicit range for both
perid_us and runtime_us into the sched_rt_sysctls table and processes
the values with proc_dointvec_minmax() instead.

Alternatively if we wanted to use full range of unsigned int for the
period value we would have to split the proc_handler and use
proc_douintvec() for it however even the
Documentation/scheduller/sched-rt-group.rst describes the range as 1 to
INT_MAX.

As far as I can tell the only problem this causes is that the sysctl
file allows writing negative values which when read back may confuse
userspace.

There is also a LTP test being submitted for these sysctl files at:

  http://patchwork.ozlabs.org/project/ltp/patch/20230901144433.2526-1-chrubis@suse.cz/

Signed-off-by: Cyril Hrubis <chrubis@suse.cz>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231002115553.3007-2-chrubis@suse.cz
---
 kernel/sched/rt.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3b627ab586fb..88fc98601413 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -37,6 +37,8 @@ static struct ctl_table sched_rt_sysctls[] = {
 		.maxlen         = sizeof(unsigned int),
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
+		.extra1         = SYSCTL_ONE,
+		.extra2         = SYSCTL_INT_MAX,
 	},
 	{
 		.procname       = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
 		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
+		.extra1         = SYSCTL_NEG_ONE,
+		.extra2         = SYSCTL_INT_MAX,
 	},
 	{
 		.procname       = "sched_rr_timeslice_ms",
@@ -2935,9 +2939,6 @@ static int sched_rt_global_constraints(void)
 #ifdef CONFIG_SYSCTL
 static int sched_rt_global_validate(void)
 {
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
 	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
 		((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
 		 ((u64)sysctl_sched_rt_runtime *
@@ -2968,7 +2969,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
 	old_period = sysctl_sched_rt_period;
 	old_runtime = sysctl_sched_rt_runtime;
 
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (!ret && write) {
 		ret = sched_rt_global_validate();
-- 
cgit v1.2.3


From 8788c6c2feb3600ba1a2f84ac5d258af4a284cea Mon Sep 17 00:00:00 2001
From: Atul Kumar Pant <atulpant.linux@gmail.com>
Date: Mon, 7 Aug 2023 17:48:34 +0530
Subject: locking/debug: Fix debugfs API return value checks to use IS_ERR()

Update the checking of return values from debugfs_create_file()
and debugfs_create_dir() to use IS_ERR().

Signed-off-by: Atul Kumar Pant <atulpant.linux@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/20230807121834.7438-1-atulpant.linux@gmail.com
---
 kernel/locking/lock_events.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
index fa2c2f951c6b..e68d82099558 100644
--- a/kernel/locking/lock_events.c
+++ b/kernel/locking/lock_events.c
@@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void)
 	struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
 	int i;
 
-	if (!d_counts)
+	if (IS_ERR(d_counts))
 		goto out;
 
 	/*
@@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void)
 	for (i = 0; i < lockevent_num; i++) {
 		if (skip_lockevent(lockevent_names[i]))
 			continue;
-		if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
-					 (void *)(long)i, &fops_lockevent))
+		if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts,
+					 (void *)(long)i, &fops_lockevent)))
 			goto fail_undo;
 	}
 
-	if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+	if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
 				 d_counts, (void *)(long)LOCKEVENT_reset_cnts,
-				 &fops_lockevent))
+				 &fops_lockevent)))
 		goto fail_undo;
 
 	return 0;
-- 
cgit v1.2.3


From 2f2fc17bab0011430ceb6f2dc1959e7d1f981444 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Sep 2023 00:48:55 +0200
Subject: sched/eevdf: Also update slice on placement

Tasks that never consume their full slice would not update their slice value.
This means that tasks that are spawned before the sysctl scaling keep their
original (UP) slice length.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230915124822.847197830@noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb225921bbca..7d73652acbb2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4919,10 +4919,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	u64 vslice = calc_delta_fair(se->slice, se);
-	u64 vruntime = avg_vruntime(cfs_rq);
+	u64 vslice, vruntime = avg_vruntime(cfs_rq);
 	s64 lag = 0;
 
+	se->slice = sysctl_sched_base_slice;
+	vslice = calc_delta_fair(se->slice, se);
+
 	/*
 	 * Due to how V is constructed as the weighted average of entities,
 	 * adding tasks with positive lag, or removing tasks with negative lag
-- 
cgit v1.2.3


From 650cad561cce04b62a8c8e0446b685ef171bc3bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 26 Sep 2023 14:29:50 +0200
Subject: sched/eevdf: Fix avg_vruntime()

The expectation is that placing a task at avg_vruntime() makes it
eligible. Turns out there is a corner case where this is not the case.

Specifically, avg_vruntime() relies on the fact that integer division
is a flooring function (eg. it discards the remainder). By this
property the value returned is slightly left of the true average.

However! when the average is a negative (relative to min_vruntime) the
effect is flipped and it becomes a ceil, with the result that the
returned value is just right of the average and thus not eligible.

Fixes: af4cf40470c2 ("sched/fair: Add cfs_rq::avg_vruntime")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d73652acbb2..ef7490c4b8b4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -664,6 +664,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
 	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
 }
 
+/*
+ * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
 u64 avg_vruntime(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
@@ -677,8 +681,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		load += weight;
 	}
 
-	if (load)
+	if (load) {
+		/* sign flips effective floor / ceil */
+		if (avg < 0)
+			avg -= (load - 1);
 		avg = div_s64(avg, load);
+	}
 
 	return cfs_rq->min_vruntime + avg;
 }
-- 
cgit v1.2.3


From ccab211af3c2b90ed792eb5f33707d2f0d59fe50 Mon Sep 17 00:00:00 2001
From: Sohil Mehta <sohil.mehta@intel.com>
Date: Mon, 10 Jul 2023 18:51:24 +0000
Subject: syscalls: Cleanup references to sys_lookup_dcookie()

commit 'be65de6b03aa ("fs: Remove dcookies support")' removed the
syscall definition for lookup_dcookie.  However, syscall tables still
point to the old sys_lookup_dcookie() definition. Update syscall tables
of all architectures to directly point to sys_ni_syscall() instead.

Signed-off-by: Sohil Mehta <sohil.mehta@intel.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Namhyung Kim <namhyung@kernel.org> # for perf
Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/alpha/kernel/syscalls/syscall.tbl              | 2 +-
 arch/arm/tools/syscall.tbl                          | 2 +-
 arch/arm64/include/asm/unistd32.h                   | 4 ++--
 arch/m68k/kernel/syscalls/syscall.tbl               | 2 +-
 arch/microblaze/kernel/syscalls/syscall.tbl         | 2 +-
 arch/mips/kernel/syscalls/syscall_n32.tbl           | 2 +-
 arch/mips/kernel/syscalls/syscall_n64.tbl           | 2 +-
 arch/mips/kernel/syscalls/syscall_o32.tbl           | 2 +-
 arch/parisc/kernel/syscalls/syscall.tbl             | 2 +-
 arch/powerpc/kernel/syscalls/syscall.tbl            | 2 +-
 arch/s390/kernel/syscalls/syscall.tbl               | 2 +-
 arch/sh/kernel/syscalls/syscall.tbl                 | 2 +-
 arch/sparc/kernel/syscalls/syscall.tbl              | 2 +-
 arch/x86/entry/syscalls/syscall_32.tbl              | 2 +-
 arch/x86/entry/syscalls/syscall_64.tbl              | 2 +-
 arch/xtensa/kernel/syscalls/syscall.tbl             | 2 +-
 include/linux/compat.h                              | 1 -
 include/linux/syscalls.h                            | 1 -
 include/uapi/asm-generic/unistd.h                   | 2 +-
 kernel/sys_ni.c                                     | 2 --
 tools/include/uapi/asm-generic/unistd.h             | 2 +-
 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 2 +-
 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl  | 2 +-
 tools/perf/arch/s390/entry/syscalls/syscall.tbl     | 2 +-
 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl   | 2 +-
 25 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index ad37569d0507..26a32f8e47d5 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -334,7 +334,7 @@
 401	common	io_submit			sys_io_submit
 402	common	io_cancel			sys_io_cancel
 405	common	exit_group			sys_exit_group
-406	common	lookup_dcookie			sys_lookup_dcookie
+406	common	lookup_dcookie			sys_ni_syscall
 407	common	epoll_create			sys_epoll_create
 408	common	epoll_ctl			sys_epoll_ctl
 409	common	epoll_wait			sys_epoll_wait
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index c572d6c3dee0..04d8e491d8b4 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -263,7 +263,7 @@
 246	common	io_submit		sys_io_submit
 247	common	io_cancel		sys_io_cancel
 248	common	exit_group		sys_exit_group
-249	common	lookup_dcookie		sys_lookup_dcookie
+249	common	lookup_dcookie		sys_ni_syscall
 250	common	epoll_create		sys_epoll_create
 251	common	epoll_ctl		sys_epoll_ctl		sys_oabi_epoll_ctl
 252	common	epoll_wait		sys_epoll_wait
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 78b68311ec81..9110be82dfaa 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -508,8 +508,8 @@ __SYSCALL(__NR_io_submit, compat_sys_io_submit)
 __SYSCALL(__NR_io_cancel, sys_io_cancel)
 #define __NR_exit_group 248
 __SYSCALL(__NR_exit_group, sys_exit_group)
-#define __NR_lookup_dcookie 249
-__SYSCALL(__NR_lookup_dcookie, compat_sys_lookup_dcookie)
+			/* 249 was lookup_dcookie */
+__SYSCALL(249, sys_ni_syscall)
 #define __NR_epoll_create 250
 __SYSCALL(__NR_epoll_create, sys_epoll_create)
 #define __NR_epoll_ctl 251
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 259ceb125367..f9d1f2d3f067 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -255,7 +255,7 @@
 245	common	io_cancel			sys_io_cancel
 246	common	fadvise64			sys_fadvise64
 247	common	exit_group			sys_exit_group
-248	common	lookup_dcookie			sys_lookup_dcookie
+248	common	lookup_dcookie			sys_ni_syscall
 249	common	epoll_create			sys_epoll_create
 250	common	epoll_ctl			sys_epoll_ctl
 251	common	epoll_wait			sys_epoll_wait
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index a3798c2637fd..185fe73d9bbf 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -260,7 +260,7 @@
 250	common	fadvise64			sys_fadvise64
 # 251 is available for reuse (was briefly sys_set_zone_reclaim)
 252	common	exit_group			sys_exit_group
-253	common	lookup_dcookie			sys_lookup_dcookie
+253	common	lookup_dcookie			sys_ni_syscall
 254	common	epoll_create			sys_epoll_create
 255	common	epoll_ctl			sys_epoll_ctl
 256	common	epoll_wait			sys_epoll_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 152034b8e0a0..08f33e7c2896 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -214,7 +214,7 @@
 203	n32	io_submit			compat_sys_io_submit
 204	n32	io_cancel			sys_io_cancel
 205	n32	exit_group			sys_exit_group
-206	n32	lookup_dcookie			sys_lookup_dcookie
+206	n32	lookup_dcookie			sys_ni_syscall
 207	n32	epoll_create			sys_epoll_create
 208	n32	epoll_ctl			sys_epoll_ctl
 209	n32	epoll_wait			sys_epoll_wait
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index cb5e757f6621..80be0e98ea0c 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -214,7 +214,7 @@
 203	n64	io_submit			sys_io_submit
 204	n64	io_cancel			sys_io_cancel
 205	n64	exit_group			sys_exit_group
-206	n64	lookup_dcookie			sys_lookup_dcookie
+206	n64	lookup_dcookie			sys_ni_syscall
 207	n64	epoll_create			sys_epoll_create
 208	n64	epoll_ctl			sys_epoll_ctl
 209	n64	epoll_wait			sys_epoll_wait
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 1a646813afdc..310c7e839b69 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -258,7 +258,7 @@
 244	o32	io_submit			sys_io_submit			compat_sys_io_submit
 245	o32	io_cancel			sys_io_cancel
 246	o32	exit_group			sys_exit_group
-247	o32	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+247	o32	lookup_dcookie			sys_ni_syscall
 248	o32	epoll_create			sys_epoll_create
 249	o32	epoll_ctl			sys_epoll_ctl
 250	o32	epoll_wait			sys_epoll_wait
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index e97c175b56f9..5410ff9456ae 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -245,7 +245,7 @@
 # 220 was alloc_hugepages
 # 221 was free_hugepages
 222	common	exit_group		sys_exit_group
-223	common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+223	common	lookup_dcookie		sys_ni_syscall
 224	common	epoll_create		sys_epoll_create
 225	common	epoll_ctl		sys_epoll_ctl
 226	common	epoll_wait		sys_epoll_wait
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 20e50586e8a2..e1412519b4ad 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -294,7 +294,7 @@
 233	32	fadvise64			sys_ppc32_fadvise64		compat_sys_ppc32_fadvise64
 233	64	fadvise64			sys_fadvise64
 234	nospu	exit_group			sys_exit_group
-235	nospu	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+235	nospu	lookup_dcookie			sys_ni_syscall
 236	common	epoll_create			sys_epoll_create
 237	common	epoll_ctl			sys_epoll_ctl
 238	common	epoll_wait			sys_epoll_wait
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0122cc156952..cc0bc144b661 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -100,7 +100,7 @@
 106  common	stat			sys_newstat			compat_sys_newstat
 107  common	lstat			sys_newlstat			compat_sys_newlstat
 108  common	fstat			sys_newfstat			compat_sys_newfstat
-110  common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+110  common	lookup_dcookie		-				-
 111  common	vhangup			sys_vhangup			sys_vhangup
 112  common	idle			-				-
 114  common	wait4			sys_wait4			compat_sys_wait4
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index e90d585c4d3e..17ca58976849 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -260,7 +260,7 @@
 250	common	fadvise64			sys_fadvise64
 # 251 is unused
 252	common	exit_group			sys_exit_group
-253	common	lookup_dcookie			sys_lookup_dcookie
+253	common	lookup_dcookie			sys_ni_syscall
 254	common	epoll_create			sys_epoll_create
 255	common	epoll_ctl			sys_epoll_ctl
 256	common	epoll_wait			sys_epoll_wait
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 4ed06c71c43f..3f72970cf983 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -249,7 +249,7 @@
 205	common	readahead		sys_readahead			compat_sys_readahead
 206	common	socketcall		sys_socketcall			sys32_socketcall
 207	common	syslog			sys_syslog
-208	common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+208	common	lookup_dcookie		sys_ni_syscall
 209	common	fadvise64		sys_fadvise64			compat_sys_fadvise64
 210	common	fadvise64_64		sys_fadvise64_64		compat_sys_fadvise64_64
 211	common	tgkill			sys_tgkill
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 2d0b1bd866ea..6d0286bbbe27 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -264,7 +264,7 @@
 250	i386	fadvise64		sys_ia32_fadvise64
 # 251 is available for reuse (was briefly sys_set_zone_reclaim)
 252	i386	exit_group		sys_exit_group
-253	i386	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+253	i386	lookup_dcookie
 254	i386	epoll_create		sys_epoll_create
 255	i386	epoll_ctl		sys_epoll_ctl
 256	i386	epoll_wait		sys_epoll_wait
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1d6eee30eceb..2a62eaf30d69 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -220,7 +220,7 @@
 209	64	io_submit		sys_io_submit
 210	common	io_cancel		sys_io_cancel
 211	64	get_thread_area
-212	common	lookup_dcookie		sys_lookup_dcookie
+212	common	lookup_dcookie
 213	common	epoll_create		sys_epoll_create
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index fc1a4f3c81d9..351521b2e841 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -273,7 +273,7 @@
 252	common	timer_getoverrun		sys_timer_getoverrun
 # System
 253	common	reserved253			sys_ni_syscall
-254	common	lookup_dcookie			sys_lookup_dcookie
+254	common	lookup_dcookie			sys_ni_syscall
 255	common	available255			sys_ni_syscall
 256	common	add_key				sys_add_key
 257	common	request_key			sys_request_key
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 1cfa4f0f490a..233f61ec8afc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -581,7 +581,6 @@ asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id,
 					struct io_event __user *events,
 					struct __kernel_timespec __user *timeout,
 					const struct __compat_aio_sigset __user *usig);
-asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, compat_size_t);
 asmlinkage long compat_sys_epoll_pwait(int epfd,
 			struct epoll_event __user *events,
 			int maxevents, int timeout,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 22bc6bc147f8..a031613bf966 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -355,7 +355,6 @@ asmlinkage long sys_lremovexattr(const char __user *path,
 				 const char __user *name);
 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
-asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
 asmlinkage long sys_epoll_create1(int flags);
 asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index abe087c53b4b..76d946445391 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr)
 #define __NR_getcwd 17
 __SYSCALL(__NR_getcwd, sys_getcwd)
 #define __NR_lookup_dcookie 18
-__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie)
+__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall)
 #define __NR_eventfd2 19
 __SYSCALL(__NR_eventfd2, sys_eventfd2)
 #define __NR_epoll_create1 20
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e137c1385c56..d6eaaaf9cf77 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,8 +51,6 @@ COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
 COND_SYSCALL(io_uring_register);
-COND_SYSCALL(lookup_dcookie);
-COND_SYSCALL_COMPAT(lookup_dcookie);
 COND_SYSCALL(eventfd2);
 COND_SYSCALL(epoll_create1);
 COND_SYSCALL(epoll_ctl);
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h
index fd6c1cb585db..7ea3875137e9 100644
--- a/tools/include/uapi/asm-generic/unistd.h
+++ b/tools/include/uapi/asm-generic/unistd.h
@@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr)
 #define __NR_getcwd 17
 __SYSCALL(__NR_getcwd, sys_getcwd)
 #define __NR_lookup_dcookie 18
-__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie)
+__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall)
 #define __NR_eventfd2 19
 __SYSCALL(__NR_eventfd2, sys_eventfd2)
 #define __NR_epoll_create1 20
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
index cfda2511badf..478fe63601fc 100644
--- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
+++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
@@ -214,7 +214,7 @@
 203	n64	io_submit			sys_io_submit
 204	n64	io_cancel			sys_io_cancel
 205	n64	exit_group			sys_exit_group
-206	n64	lookup_dcookie			sys_lookup_dcookie
+206	n64	lookup_dcookie			sys_ni_syscall
 207	n64	epoll_create			sys_epoll_create
 208	n64	epoll_ctl			sys_epoll_ctl
 209	n64	epoll_wait			sys_epoll_wait
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 8c0b08b7a80e..1b7777e5f9ff 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -294,7 +294,7 @@
 233	32	fadvise64			sys_ppc32_fadvise64		compat_sys_ppc32_fadvise64
 233	64	fadvise64			sys_fadvise64
 234	nospu	exit_group			sys_exit_group
-235	nospu	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+235	nospu	lookup_dcookie			sys_ni_syscall
 236	common	epoll_create			sys_epoll_create
 237	common	epoll_ctl			sys_epoll_ctl
 238	common	epoll_wait			sys_epoll_wait
diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
index a6935af2235c..11782be77f57 100644
--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -100,7 +100,7 @@
 106  common	stat			sys_newstat			compat_sys_newstat
 107  common	lstat			sys_newlstat			compat_sys_newlstat
 108  common	fstat			sys_newfstat			compat_sys_newfstat
-110  common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+110  common	lookup_dcookie		-				-
 111  common	vhangup			sys_vhangup			sys_vhangup
 112  common	idle			-				-
 114  common	wait4			sys_wait4			compat_sys_wait4
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index 227538b0ce80..27f78821453b 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -220,7 +220,7 @@
 209	64	io_submit		sys_io_submit
 210	common	io_cancel		sys_io_cancel
 211	64	get_thread_area
-212	common	lookup_dcookie		sys_lookup_dcookie
+212	common	lookup_dcookie
 213	common	epoll_create		sys_epoll_create
 214	64	epoll_ctl_old
 215	64	epoll_wait_old
-- 
cgit v1.2.3


From d4d6596b43868a1e05fe5b047e73c3aff96444c6 Mon Sep 17 00:00:00 2001
From: Yu Liao <liaoyu15@huawei.com>
Date: Wed, 2 Aug 2023 10:15:01 +0800
Subject: sched/headers: Remove duplicate header inclusions

<linux/psi.h> and "autogroup.h" are included twice, remove the duplicate header
inclusion.

Signed-off-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230802021501.2511569-1-liaoyu15@huawei.com
---
 kernel/sched/build_utility.c | 1 -
 kernel/sched/core.c          | 1 -
 2 files changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 99bdd96f454f..80a3df49ab47 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -34,7 +34,6 @@
 #include <linux/nospec.h>
 #include <linux/proc_fs.h>
 #include <linux/psi.h>
-#include <linux/psi.h>
 #include <linux/ptrace_api.h>
 #include <linux/sched_clock.h>
 #include <linux/security.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 324980e3d2e5..27aff98645e8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -85,7 +85,6 @@
 
 #include "sched.h"
 #include "stats.h"
-#include "autogroup.h"
 
 #include "autogroup.h"
 #include "pelt.h"
-- 
cgit v1.2.3


From a1f157c7a3bb342b972c2830a0ad53f627000a04 Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Wed, 6 Sep 2023 17:18:37 +0800
Subject: tracing: Expand all ring buffers individually

The ring buffer of global_trace is set to the minimum size in
order to save memory on boot up and then it will be expand when
some trace feature enabled.

However currently operations under an instance can also cause
global_trace ring buffer being expanded, and the expanded memory
would be wasted if global_trace then not being used.

See following case, we enable 'sched_switch' event in instance 'A', then
ring buffer of global_trace is unexpectedly expanded to be 1410KB, also
the '(expanded: 1408)' from 'buffer_size_kb' of instance is confusing.

  # cd /sys/kernel/tracing
  # mkdir instances/A
  # cat buffer_size_kb
  7 (expanded: 1408)
  # cat instances/A/buffer_size_kb
  1410 (expanded: 1408)
  # echo sched:sched_switch > instances/A/set_event
  # cat buffer_size_kb
  1410
  # cat instances/A/buffer_size_kb
  1410

To fix it, we can:
  - Make 'ring_buffer_expanded' as a member of 'struct trace_array';
  - Make 'ring_buffer_expanded' of instance is defaultly true,
    global_trace is defaultly false;
  - In order not to expose 'global_trace' outside of file
    'kernel/trace/trace.c', introduce trace_set_ring_buffer_expanded()
    to set 'ring_buffer_expanded' as 'true';
  - Pass the expected trace_array to tracing_update_buffers().

Link: https://lore.kernel.org/linux-trace-kernel/20230906091837.3998020-1-zhengyejian1@huawei.com

Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 47 +++++++++++++++++++++++++--------------------
 kernel/trace/trace.h        |  9 +++++++--
 kernel/trace/trace_events.c | 22 +++++++++++----------
 3 files changed, 45 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abaaf516fcae..dd6395692ff9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -54,12 +54,6 @@
 #include "trace.h"
 #include "trace_output.h"
 
-/*
- * On boot up, the ring buffer is set to the minimum size, so that
- * we do not waste memory on systems that are not using tracing.
- */
-bool ring_buffer_expanded;
-
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 /*
  * We need to change this state when a selftest is running.
@@ -202,7 +196,7 @@ static int __init set_cmdline_ftrace(char *str)
 	strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
-	ring_buffer_expanded = true;
+	trace_set_ring_buffer_expanded(NULL);
 	return 1;
 }
 __setup("ftrace=", set_cmdline_ftrace);
@@ -247,7 +241,7 @@ static int __init boot_alloc_snapshot(char *str)
 	} else {
 		allocate_snapshot = true;
 		/* We also need the main ring buffer expanded */
-		ring_buffer_expanded = true;
+		trace_set_ring_buffer_expanded(NULL);
 	}
 	return 1;
 }
@@ -490,6 +484,13 @@ static struct trace_array global_trace = {
 	.trace_flags = TRACE_DEFAULT_FLAGS,
 };
 
+void trace_set_ring_buffer_expanded(struct trace_array *tr)
+{
+	if (!tr)
+		tr = &global_trace;
+	tr->ring_buffer_expanded = true;
+}
+
 LIST_HEAD(ftrace_trace_arrays);
 
 int trace_array_get(struct trace_array *this_tr)
@@ -2012,7 +2013,7 @@ static int run_tracer_selftest(struct tracer *type)
 #ifdef CONFIG_TRACER_MAX_TRACE
 	if (type->use_max_tr) {
 		/* If we expanded the buffers, make sure the max is expanded too */
-		if (ring_buffer_expanded)
+		if (tr->ring_buffer_expanded)
 			ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
 					   RING_BUFFER_ALL_CPUS);
 		tr->allocated_snapshot = true;
@@ -2038,7 +2039,7 @@ static int run_tracer_selftest(struct tracer *type)
 		tr->allocated_snapshot = false;
 
 		/* Shrink the max buffer again */
-		if (ring_buffer_expanded)
+		if (tr->ring_buffer_expanded)
 			ring_buffer_resize(tr->max_buffer.buffer, 1,
 					   RING_BUFFER_ALL_CPUS);
 	}
@@ -3403,7 +3404,7 @@ void trace_printk_init_buffers(void)
 	pr_warn("**********************************************************\n");
 
 	/* Expand the buffers to set size */
-	tracing_update_buffers();
+	tracing_update_buffers(&global_trace);
 
 	buffers_allocated = 1;
 
@@ -6374,7 +6375,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	 * we use the size that was given, and we can forget about
 	 * expanding it later.
 	 */
-	ring_buffer_expanded = true;
+	trace_set_ring_buffer_expanded(tr);
 
 	/* May be called before buffers are initialized */
 	if (!tr->array_buffer.buffer)
@@ -6452,6 +6453,7 @@ out:
 
 /**
  * tracing_update_buffers - used by tracing facility to expand ring buffers
+ * @tr: The tracing instance
  *
  * To save on memory when the tracing is never used on a system with it
  * configured in. The ring buffers are set to a minimum size. But once
@@ -6460,13 +6462,13 @@ out:
  *
  * This function is to be called when a tracer is about to be used.
  */
-int tracing_update_buffers(void)
+int tracing_update_buffers(struct trace_array *tr)
 {
 	int ret = 0;
 
 	mutex_lock(&trace_types_lock);
-	if (!ring_buffer_expanded)
-		ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
+	if (!tr->ring_buffer_expanded)
+		ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
 						RING_BUFFER_ALL_CPUS);
 	mutex_unlock(&trace_types_lock);
 
@@ -6520,7 +6522,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 
 	mutex_lock(&trace_types_lock);
 
-	if (!ring_buffer_expanded) {
+	if (!tr->ring_buffer_expanded) {
 		ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
 						RING_BUFFER_ALL_CPUS);
 		if (ret < 0)
@@ -7192,7 +7194,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		}
 
 		if (buf_size_same) {
-			if (!ring_buffer_expanded)
+			if (!tr->ring_buffer_expanded)
 				r = sprintf(buf, "%lu (expanded: %lu)\n",
 					    size >> 10,
 					    trace_buf_size >> 10);
@@ -7249,10 +7251,10 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
 		size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
-		if (!ring_buffer_expanded)
+		if (!tr->ring_buffer_expanded)
 			expanded_size += trace_buf_size >> 10;
 	}
-	if (ring_buffer_expanded)
+	if (tr->ring_buffer_expanded)
 		r = sprintf(buf, "%lu\n", size);
 	else
 		r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
@@ -7646,7 +7648,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	unsigned long val;
 	int ret;
 
-	ret = tracing_update_buffers();
+	ret = tracing_update_buffers(tr);
 	if (ret < 0)
 		return ret;
 
@@ -9550,6 +9552,9 @@ static struct trace_array *trace_array_create(const char *name)
 	if (allocate_trace_buffers(tr, trace_buf_size) < 0)
 		goto out_free_tr;
 
+	/* The ring buffer is defaultly expanded */
+	trace_set_ring_buffer_expanded(tr);
+
 	if (ftrace_allocate_ftrace_ops(tr) < 0)
 		goto out_free_tr;
 
@@ -10444,7 +10449,7 @@ __init static int tracer_alloc_buffers(void)
 		trace_printk_init_buffers();
 
 	/* To save memory, keep the ring buffer size to its minimum */
-	if (ring_buffer_expanded)
+	if (global_trace.ring_buffer_expanded)
 		ring_buf_size = trace_buf_size;
 	else
 		ring_buf_size = 1;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 77debe53f07c..e92cb9c1292f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -410,6 +410,11 @@ struct trace_array {
 	struct cond_snapshot	*cond_snapshot;
 #endif
 	struct trace_func_repeats	__percpu *last_func_repeats;
+	/*
+	 * On boot up, the ring buffer is set to the minimum size, so that
+	 * we do not waste memory on systems that are not using tracing.
+	 */
+	bool ring_buffer_expanded;
 };
 
 enum {
@@ -761,7 +766,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
 #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
 extern int DYN_FTRACE_TEST_NAME2(void);
 
-extern bool ring_buffer_expanded;
+extern void trace_set_ring_buffer_expanded(struct trace_array *tr);
 extern bool tracing_selftest_disabled;
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1305,7 +1310,7 @@ static inline void trace_branch_disable(void)
 #endif /* CONFIG_BRANCH_TRACER */
 
 /* set ring buffers to default size if not already done so */
-int tracing_update_buffers(void);
+int tracing_update_buffers(struct trace_array *tr);
 
 union trace_synth_field {
 	u8				as_u8;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f49d6ddb6342..099b9b6bbdc4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1166,7 +1166,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 	if (!cnt)
 		return 0;
 
-	ret = tracing_update_buffers();
+	ret = tracing_update_buffers(tr);
 	if (ret < 0)
 		return ret;
 
@@ -1397,18 +1397,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret)
 		return ret;
 
-	ret = tracing_update_buffers();
-	if (ret < 0)
-		return ret;
-
 	switch (val) {
 	case 0:
 	case 1:
 		ret = -ENODEV;
 		mutex_lock(&event_mutex);
 		file = event_file_data(filp);
-		if (likely(file))
+		if (likely(file)) {
+			ret = tracing_update_buffers(file->tr);
+			if (ret < 0) {
+				mutex_unlock(&event_mutex);
+				return ret;
+			}
 			ret = ftrace_event_enable_disable(file, val);
+		}
 		mutex_unlock(&event_mutex);
 		break;
 
@@ -1482,7 +1484,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret)
 		return ret;
 
-	ret = tracing_update_buffers();
+	ret = tracing_update_buffers(dir->tr);
 	if (ret < 0)
 		return ret;
 
@@ -1956,7 +1958,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
 	if (!cnt)
 		return 0;
 
-	ret = tracing_update_buffers();
+	ret = tracing_update_buffers(tr);
 	if (ret < 0)
 		return ret;
 
@@ -2824,7 +2826,7 @@ static __init int setup_trace_triggers(char *str)
 	int i;
 
 	strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE);
-	ring_buffer_expanded = true;
+	trace_set_ring_buffer_expanded(NULL);
 	disable_tracing_selftest("running event triggers");
 
 	buf = bootup_trigger_buf;
@@ -3614,7 +3616,7 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 static __init int setup_trace_event(char *str)
 {
 	strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
-	ring_buffer_expanded = true;
+	trace_set_ring_buffer_expanded(NULL);
 	disable_tracing_selftest("running event tracing");
 
 	return 1;
-- 
cgit v1.2.3


From bdf4fb628093b4ab2f4eb312256233a2ae0594b7 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Thu, 14 Sep 2023 18:34:02 +0200
Subject: ring_buffer: Use try_cmpxchg instead of cmpxchg in rb_insert_pages

Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in
rb_insert_pages. x86 CMPXCHG instruction returns success in ZF flag,
so this change saves a compare after cmpxchg (and related move
instruction in front of cmpxchg).

No functional change intended.

Link: https://lore.kernel.org/linux-trace-kernel/20230914163420.12923-1-ubizjak@gmail.com

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 515cafdb18d9..43cc47d7faaf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2056,7 +2056,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
 	retries = 10;
 	success = false;
 	while (retries--) {
-		struct list_head *head_page, *prev_page, *r;
+		struct list_head *head_page, *prev_page;
 		struct list_head *last_page, *first_page;
 		struct list_head *head_page_with_bit;
 		struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
@@ -2075,9 +2075,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
 		last_page->next = head_page_with_bit;
 		first_page->prev = prev_page;
 
-		r = cmpxchg(&prev_page->next, head_page_with_bit, first_page);
-
-		if (r == head_page_with_bit) {
+		/* caution: head_page_with_bit gets updated on cmpxchg failure */
+		if (try_cmpxchg(&prev_page->next,
+				&head_page_with_bit, first_page)) {
 			/*
 			 * yay, we replaced the page pointer to our new list,
 			 * now, we just have to update to head page's prev
-- 
cgit v1.2.3


From 5dbd04eddb2c0841d1b3930e0a9944a2343c9cac Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 12 Sep 2023 18:07:02 +0000
Subject: tracing/user_events: Allow events to persist for perfmon_capable
 users

There are several scenarios that have come up where having a user_event
persist even if the process that registered it exits. The main one is
having a daemon create events on bootup that shouldn't get deleted if
the daemon has to exit or reload. Another is within OpenTelemetry
exporters, they wish to potentially check if a user_event exists on the
system to determine if exporting the data out should occur. The
user_event in this case must exist even in the absence of the owning
process running (such as the above daemon case).

Expose the previously internal flag USER_EVENT_REG_PERSIST to user
processes. Upon register or delete of events with this flag, ensure the
user is perfmon_capable to prevent random user processes with access to
tracefs from creating events that persist after exit.

Link: https://lkml.kernel.org/r/20230912180704.1284-2-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/uapi/linux/user_events.h | 11 ++++++++++-
 kernel/trace/trace_events_user.c | 36 ++++++++++++++++++++++--------------
 2 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
index 2984aae4a2b4..f74f3aedd49c 100644
--- a/include/uapi/linux/user_events.h
+++ b/include/uapi/linux/user_events.h
@@ -17,6 +17,15 @@
 /* Create dynamic location entry within a 32-bit value */
 #define DYN_LOC(offset, size) ((size) << 16 | (offset))
 
+/* List of supported registration flags */
+enum user_reg_flag {
+	/* Event will not delete upon last reference closing */
+	USER_EVENT_REG_PERSIST		= 1U << 0,
+
+	/* This value or above is currently non-ABI */
+	USER_EVENT_REG_MAX		= 1U << 1,
+};
+
 /*
  * Describes an event registration and stores the results of the registration.
  * This structure is passed to the DIAG_IOCSREG ioctl, callers at a minimum
@@ -33,7 +42,7 @@ struct user_reg {
 	/* Input: Enable size in bytes at address */
 	__u8	enable_size;
 
-	/* Input: Flags for future use, set to 0 */
+	/* Input: Flags to use, if any */
 	__u16	flags;
 
 	/* Input: Address to update when enabled */
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index b87f41187c6a..9365ce407426 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -49,18 +49,6 @@
 #define EVENT_STATUS_PERF BIT(1)
 #define EVENT_STATUS_OTHER BIT(7)
 
-/*
- * User register flags are not allowed yet, keep them here until we are
- * ready to expose them out to the user ABI.
- */
-enum user_reg_flag {
-	/* Event will not delete upon last reference closing */
-	USER_EVENT_REG_PERSIST		= 1U << 0,
-
-	/* This value or above is currently non-ABI */
-	USER_EVENT_REG_MAX		= 1U << 1,
-};
-
 /*
  * Stores the system name, tables, and locks for a group of events. This
  * allows isolation for events by various means.
@@ -220,6 +208,17 @@ static u32 user_event_key(char *name)
 	return jhash(name, strlen(name), 0);
 }
 
+static bool user_event_capable(u16 reg_flags)
+{
+	/* Persistent events require CAP_PERFMON / CAP_SYS_ADMIN */
+	if (reg_flags & USER_EVENT_REG_PERSIST) {
+		if (!perfmon_capable())
+			return false;
+	}
+
+	return true;
+}
+
 static struct user_event *user_event_get(struct user_event *user)
 {
 	refcount_inc(&user->refcnt);
@@ -1811,6 +1810,9 @@ static int user_event_free(struct dyn_event *ev)
 	if (!user_event_last_ref(user))
 		return -EBUSY;
 
+	if (!user_event_capable(user->reg_flags))
+		return -EPERM;
+
 	return destroy_user_event(user);
 }
 
@@ -1926,10 +1928,13 @@ static int user_event_parse(struct user_event_group *group, char *name,
 	int argc = 0;
 	char **argv;
 
-	/* User register flags are not ready yet */
-	if (reg_flags != 0 || flags != NULL)
+	/* Currently don't support any text based flags */
+	if (flags != NULL)
 		return -EINVAL;
 
+	if (!user_event_capable(reg_flags))
+		return -EPERM;
+
 	/* Prevent dyn_event from racing */
 	mutex_lock(&event_mutex);
 	user = find_user_event(group, name, &key);
@@ -2062,6 +2067,9 @@ static int delete_user_event(struct user_event_group *group, char *name)
 	if (!user_event_last_ref(user))
 		return -EBUSY;
 
+	if (!user_event_capable(user->reg_flags))
+		return -EPERM;
+
 	return destroy_user_event(user);
 }
 
-- 
cgit v1.2.3


From 5f98fd034ca6fd1ab8c91a3488968a0e9caaabf6 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Sat, 30 Sep 2023 17:46:56 +0000
Subject: rcu: kmemleak: Ignore kmemleak false positives when RCU-freeing
 objects

Since the actual slab freeing is deferred when calling kvfree_rcu(), so
is the kmemleak_free() callback informing kmemleak of the object
deletion. From the perspective of the kvfree_rcu() caller, the object is
freed and it may remove any references to it. Since kmemleak does not
scan RCU internal data storing the pointer, it will report such objects
as leaks during the grace period.

Tell kmemleak to ignore such objects on the kvfree_call_rcu() path. Note
that the tiny RCU implementation does not have such issue since the
objects can be tracked from the rcu_ctrlblk structure.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Christoph Paasch <cpaasch@apple.com>
Closes: https://lore.kernel.org/all/F903A825-F05F-4B77-A2B5-7356282FBA2C@apple.com/
Cc: <stable@vger.kernel.org>
Tested-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a83ecab77917..4dd7df30df31 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/export.h>
 #include <linux/completion.h>
+#include <linux/kmemleak.h>
 #include <linux/moduleparam.h>
 #include <linux/panic.h>
 #include <linux/panic_notifier.h>
@@ -3389,6 +3390,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 		success = true;
 	}
 
+	/*
+	 * The kvfree_rcu() caller considers the pointer freed at this point
+	 * and likely removes any references to it. Since the actual slab
+	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+	 * this object (no scanning or false positives reporting).
+	 */
+	kmemleak_ignore(ptr);
+
 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
 		schedule_delayed_monitor_work(krcp);
-- 
cgit v1.2.3


From 7df2a2a024145d0dcc1e51dc378c527000b35b07 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:35:54 +0200
Subject: rcu: Use rcu_segcblist_segempty() instead of open coding it

This makes the code more readable.

Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index f71fac422c8f..1693ea22ef1b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
 	smp_mb(); /* Ensure counts are updated before callback is entrained. */
 	rhp->next = NULL;
 	for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
-		if (rsclp->tails[i] != rsclp->tails[i - 1])
+		if (!rcu_segcblist_segempty(rsclp, i))
 			break;
 	rcu_segcblist_inc_seglen(rsclp, i);
 	WRITE_ONCE(*rsclp->tails[i], rhp);
@@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	 * as their ->gp_seq[] grace-period completion sequence number.
 	 */
 	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
-		if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+		if (!rcu_segcblist_segempty(rsclp, i) &&
 		    ULONG_CMP_LT(rsclp->gp_seq[i], seq))
 			break;
 
-- 
cgit v1.2.3


From 358662a9616c5078dc4d389d6bceeb5974f4aa97 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:35:58 +0200
Subject: rcu: Assume IRQS disabled from rcu_report_dead()

rcu_report_dead() is the last RCU word from the CPU down through the
hotplug path. It is called in the idle loop right before the CPU shuts
down for good. Because it removes the CPU from the grace period state
machine and reports an ultimate quiescent state if necessary, no further
use of RCU is allowed. Therefore it is expected that IRQs are disabled
upon calling this function and are not to be re-enabled again until the
CPU shuts down.

Remove the IRQs disablement from that function and verify instead that
it is actually called with IRQs disabled as it is expected at that
special point in the idle path.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4dd7df30df31..8c2954502e55 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4562,11 +4562,16 @@ void rcu_cpu_starting(unsigned int cpu)
  */
 void rcu_report_dead(unsigned int cpu)
 {
-	unsigned long flags, seq_flags;
+	unsigned long flags;
 	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
+	/*
+	 * IRQS must be disabled from now on and until the CPU dies, or an interrupt
+	 * may introduce a new READ-side while it is actually off the QS masks.
+	 */
+	lockdep_assert_irqs_disabled();
 	// Do any dangling deferred wakeups.
 	do_nocb_deferred_wakeup(rdp);
 
@@ -4574,7 +4579,6 @@ void rcu_report_dead(unsigned int cpu)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
-	local_irq_save(seq_flags);
 	arch_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4588,8 +4592,6 @@ void rcu_report_dead(unsigned int cpu)
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	arch_spin_unlock(&rcu_state.ofl_lock);
-	local_irq_restore(seq_flags);
-
 	rdp->cpu_started = false;
 }
 
-- 
cgit v1.2.3


From c964c1f5ee96e1460606d44f80a47bdacd8fe568 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:35:59 +0200
Subject: rcu: Assume rcu_report_dead() is always called locally

rcu_report_dead() has to be called locally by the CPU that is going to
exit the RCU state machine. Passing a cpu argument here is error-prone
and leaves the possibility for a racy remote call.

Use local access instead.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 arch/arm64/kernel/smp.c  | 2 +-
 include/linux/rcupdate.h | 2 +-
 kernel/cpu.c             | 2 +-
 kernel/rcu/tree.c        | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..8fa646c90c67 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void)
 
 	/* Mark this CPU absent */
 	set_cpu_present(cpu, 0);
-	rcu_report_dead(cpu);
+	rcu_report_dead();
 
 	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
 		update_cpu_boot_status(CPU_KILL_ME);
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e5f920ade90..aa351ddcbe8d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,7 +122,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(unsigned int cpu);
+void rcu_report_dead(void);
 void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..076e75fed8bb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1388,7 +1388,7 @@ void cpuhp_report_idle_dead(void)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
-	rcu_report_dead(smp_processor_id());
+	rcu_report_dead();
 	st->state = CPUHP_AP_IDLE_DEAD;
 	/*
 	 * We cannot call complete after rcu_report_dead() so we delegate it
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8c2954502e55..2e1e7eadf2cc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4560,11 +4560,11 @@ void rcu_cpu_starting(unsigned int cpu)
  * from the outgoing CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
  */
-void rcu_report_dead(unsigned int cpu)
+void rcu_report_dead(void)
 {
 	unsigned long flags;
 	unsigned long mask;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
 	/*
-- 
cgit v1.2.3


From 01a99a750a4f414c221781de2e5570e0ceae04b5 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Wed, 26 Jul 2023 03:50:47 +0800
Subject: futex/requeue: Remove unnecessary ‘NULL’ initialization from
 futex_proxy_trylock_atomic()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'top_waiter' is assigned unconditionally before first use,
so it does not need an initialization.

[ mingo: Created legible changelog. ]

Signed-off-by: Li zeming <zeming@nfschina.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230725195047.3106-1-zeming@nfschina.com
---
 kernel/futex/requeue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index a0a79954f506..16a3645bd786 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -269,7 +269,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
 			   union futex_key *key2, struct futex_pi_state **ps,
 			   struct task_struct **exiting, int set_waiters)
 {
-	struct futex_q *top_waiter = NULL;
+	struct futex_q *top_waiter;
 	u32 curval;
 	int ret;
 
-- 
cgit v1.2.3


From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 23 Aug 2023 19:05:56 +0200
Subject: mm: remove remnants of SPLIT_RSS_COUNTING

The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter"), but the patch failed to fully clean it up.

Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c          | 2 --
 include/linux/mm.h | 8 --------
 kernel/exit.c      | 4 ----
 kernel/fork.c      | 4 ----
 kernel/kthread.c   | 1 -
 mm/madvise.c       | 5 +----
 mm/memory.c        | 2 --
 7 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 6518e33ea813..eb039041ad6a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk = current;
 	old_mm = current->mm;
 	exec_mm_release(tsk, old_mm);
-	if (old_mm)
-		sync_mm_rss(old_mm);
 
 	ret = down_write_killable(&tsk->signal->exec_update_lock);
 	if (ret)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..7613150acab9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2628,14 +2628,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 		*maxrss = hiwater_rss;
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
 #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
 static inline int pte_special(pte_t pte)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..3cdbe797008f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -539,7 +539,6 @@ static void exit_mm(void)
 	exit_mm_release(current, mm);
 	if (!mm)
 		return;
-	sync_mm_rss(mm);
 	mmap_read_lock(mm);
 	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
@@ -829,9 +828,6 @@ void __noreturn do_exit(long code)
 	io_uring_files_cancel();
 	exit_signals(tsk);  /* sets PF_EXITING */
 
-	/* sync mm's RSS info before statistics gathering */
-	if (tsk->mm)
-		sync_mm_rss(tsk->mm);
 	acct_update_integrals(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..1779183a7cb3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process(
 	p->io_uring = NULL;
 #endif
 
-#if defined(SPLIT_RSS_COUNTING)
-	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
 #ifdef CONFIG_PSI
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1eea53050bab..c46128ec0c0a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1469,7 +1469,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	 * clearing tsk->mm.
 	 */
 	smp_mb__after_spinlock();
-	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
 	membarrier_update_current_mm(NULL);
diff --git a/mm/madvise.c b/mm/madvise.c
index 4dded5d27e7e..59e8860c86af 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -746,11 +746,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		folio_mark_lazyfree(folio);
 	}
 
-	if (nr_swap) {
-		if (current->mm == mm)
-			sync_mm_rss(mm);
+	if (nr_swap)
 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
-	}
 	if (start_pte) {
 		arch_leave_lazy_mmu_mode();
 		pte_unmap_unlock(start_pte, ptl);
diff --git a/mm/memory.c b/mm/memory.c
index 6c264d2f969c..0739ccb00e61 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -471,8 +471,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
 {
 	int i;
 
-	if (current->mm == mm)
-		sync_mm_rss(mm);
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (rss[i])
 			add_mm_counter(mm, i, rss[i]);
-- 
cgit v1.2.3


From 2fbacff0cbf58530fa459c3f28ba9125df33ad86 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:14 +0800
Subject: rcu: dynamically allocate the rcu-lazy shrinker

Use new APIs to dynamically allocate the rcu-lazy shrinker.

Link: https://lkml.kernel.org/r/20230911094444.68966-16-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/rcu/tree_nocb.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 5598212d1f27..4efbf7333d4e 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1396,13 +1396,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 
 	return count ? count : SHRINK_STOP;
 }
-
-static struct shrinker lazy_rcu_shrinker = {
-	.count_objects = lazy_rcu_shrink_count,
-	.scan_objects = lazy_rcu_shrink_scan,
-	.batch = 0,
-	.seeks = DEFAULT_SEEKS,
-};
 #endif // #ifdef CONFIG_RCU_LAZY
 
 void __init rcu_init_nohz(void)
@@ -1410,6 +1403,7 @@ void __init rcu_init_nohz(void)
 	int cpu;
 	struct rcu_data *rdp;
 	const struct cpumask *cpumask = NULL;
+	struct shrinker * __maybe_unused lazy_rcu_shrinker;
 
 #if defined(CONFIG_NO_HZ_FULL)
 	if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
@@ -1436,8 +1430,15 @@ void __init rcu_init_nohz(void)
 		return;
 
 #ifdef CONFIG_RCU_LAZY
-	if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
-		pr_err("Failed to register lazy_rcu shrinker!\n");
+	lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy");
+	if (!lazy_rcu_shrinker) {
+		pr_err("Failed to allocate lazy_rcu shrinker!\n");
+	} else {
+		lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count;
+		lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan;
+
+		shrinker_register(lazy_rcu_shrinker);
+	}
 #endif // #ifdef CONFIG_RCU_LAZY
 
 	if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
-- 
cgit v1.2.3


From 21e0b932fb5d72bb6cefdf56e22bfddea968cf32 Mon Sep 17 00:00:00 2001
From: Qi Zheng <zhengqi.arch@bytedance.com>
Date: Mon, 11 Sep 2023 17:44:15 +0800
Subject: rcu: dynamically allocate the rcu-kfree shrinker

Use new APIs to dynamically allocate the rcu-kfree shrinker.

Link: https://lkml.kernel.org/r/20230911094444.68966-17-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Abhinav Kumar <quic_abhinavk@quicinc.com>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Carlos Llamas <cmllamas@google.com>
Cc: Chandan Babu R <chandan.babu@oracle.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Chuck Lever <cel@kernel.org>
Cc: Coly Li <colyli@suse.de>
Cc: Dai Ngo <Dai.Ngo@oracle.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Cc: Gao Xiang <hsiangkao@linux.alibaba.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Kirill Tkhai <tkhai@ya.ru>
Cc: Marijn Suijten <marijn.suijten@somainline.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Mike Snitzer <snitzer@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nadav Amit <namit@vmware.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Olga Kornievskaia <kolga@netapp.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rob Clark <robdclark@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Sean Paul <sean@poorly.run>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Song Liu <song@kernel.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Cc: Tom Talpey <tom@talpey.com>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/rcu/tree.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cb1caefa8bd0..06e2ed495c02 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3449,13 +3449,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 	return freed == 0 ? SHRINK_STOP : freed;
 }
 
-static struct shrinker kfree_rcu_shrinker = {
-	.count_objects = kfree_rcu_shrink_count,
-	.scan_objects = kfree_rcu_shrink_scan,
-	.batch = 0,
-	.seeks = DEFAULT_SEEKS,
-};
-
 void __init kfree_rcu_scheduler_running(void)
 {
 	int cpu;
@@ -4931,6 +4924,7 @@ static void __init kfree_rcu_batch_init(void)
 {
 	int cpu;
 	int i, j;
+	struct shrinker *kfree_rcu_shrinker;
 
 	/* Clamp it to [0:100] seconds interval. */
 	if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4962,8 +4956,17 @@ static void __init kfree_rcu_batch_init(void)
 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
 	}
-	if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree"))
-		pr_err("Failed to register kfree_rcu() shrinker!\n");
+
+	kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
+	if (!kfree_rcu_shrinker) {
+		pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+		return;
+	}
+
+	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+	shrinker_register(kfree_rcu_shrinker);
 }
 
 void __init rcu_init(void)
-- 
cgit v1.2.3


From c0d2f4ce5c9fbc3e3b219d828b77ecd4445c1ad2 Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Fri, 25 Aug 2023 04:30:57 +0300
Subject: docs: fix link s390/zfcpdump.rst

After move of Documentation/s390 to Documentation/arch/s390

Link: https://lkml.kernel.org/r/20230825013102.1487979-1-costa.shul@redhat.com
Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Eric DeVolder <eric.devolder@oracle.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/Kconfig.kexec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 9bfe68fe9676..7aff28ded2f4 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -110,7 +110,7 @@ config CRASH_DUMP
 	  For more details see Documentation/admin-guide/kdump/kdump.rst
 
 	  For s390, this option also enables zfcpdump.
-	  See also <file:Documentation/s390/zfcpdump.rst>
+	  See also <file:Documentation/arch/s390/zfcpdump.rst>
 
 config CRASH_HOTPLUG
 	bool "Update the crash elfcorehdr on system configuration changes"
-- 
cgit v1.2.3


From 8e1f385104ac044f1552686ad6e1cbc71cc05a30 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 26 Aug 2023 13:14:09 +0200
Subject: kill task_struct->thread_group

The last user was removed by the previous patch.

Link: https://lkml.kernel.org/r/20230826111409.GA23243@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h | 1 -
 init/init_task.c      | 1 -
 kernel/exit.c         | 1 -
 kernel/fork.c         | 3 ---
 4 files changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..6d1341b1673f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1002,7 +1002,6 @@ struct task_struct {
 	/* PID/PID hash table linkage. */
 	struct pid			*thread_pid;
 	struct hlist_node		pid_links[PIDTYPE_MAX];
-	struct list_head		thread_group;
 	struct list_head		thread_node;
 
 	struct completion		*vfork_done;
diff --git a/init/init_task.c b/init/init_task.c
index ff6c4b9bfe6b..c0de0200fd56 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -132,7 +132,6 @@ struct task_struct init_task
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
 	.timer_slack_ns = 50000, /* 50 usec default slack */
 	.thread_pid	= &init_struct_pid,
-	.thread_group	= LIST_HEAD_INIT(init_task.thread_group),
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
 #ifdef CONFIG_AUDIT
 	.loginuid	= INVALID_UID,
diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..f3ba4b97a7d9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,7 +133,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
 	}
-	list_del_rcu(&p->thread_group);
 	list_del_rcu(&p->thread_node);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..b9d3aa493bbd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2576,7 +2576,6 @@ __latent_entropy struct task_struct *copy_process(
 	p->dirty_paused_when = 0;
 
 	p->pdeath_signal = 0;
-	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 	clear_posix_cputimers_work(p);
 
@@ -2704,8 +2703,6 @@ __latent_entropy struct task_struct *copy_process(
 			atomic_inc(&current->signal->live);
 			refcount_inc(&current->signal->sigcnt);
 			task_join_group_stop(p);
-			list_add_tail_rcu(&p->thread_group,
-					  &p->group_leader->thread_group);
 			list_add_tail_rcu(&p->thread_node,
 					  &p->signal->thread_head);
 		}
-- 
cgit v1.2.3


From 398352049146e34ec6113a00c63457149a81345c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 23 Aug 2023 19:14:55 +0200
Subject: __kill_pgrp_info: simplify the calculation of return value

No need to calculate/check the "success" variable, we can kill it and update
retval in the main loop unless it is zero.

Link: https://lkml.kernel.org/r/20230823171455.GA12188@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: David Laight <David.Laight@ACULAB.COM>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/signal.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 09019017d669..fc276fc07d87 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1471,16 +1471,21 @@ int group_send_sig_info(int sig, struct kernel_siginfo *info,
 int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
 {
 	struct task_struct *p = NULL;
-	int retval, success;
+	int ret = -ESRCH;
 
-	success = 0;
-	retval = -ESRCH;
 	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 		int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
-		success |= !err;
-		retval = err;
+		/*
+		 * If group_send_sig_info() succeeds at least once ret
+		 * becomes 0 and after that the code below has no effect.
+		 * Otherwise we return the last err or -ESRCH if this
+		 * process group is empty.
+		 */
+		if (ret)
+			ret = err;
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
-	return success ? 0 : retval;
+
+	return ret;
 }
 
 int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
-- 
cgit v1.2.3


From 9734fe4dc22052d8010cc0b68b092fe5335ccb31 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Mon, 4 Sep 2023 17:21:01 +0200
Subject: panic: use atomic_try_cmpxchg in panic() and nmi_panic()

Use atomic_try_cmpxchg instead of atomic_cmpxchg (*ptr, old, new) == old
in panic() and nmi_panic().  x86 CMPXCHG instruction returns success in ZF
flag, so this change saves a compare after cmpxchg (and related move
instruction in front of cmpxchg).

Also, rename cpu variable to this_cpu in nmi_panic() and try to unify
logic flow between panic() and nmi_panic().

No functional change intended.

[ubizjak@gmail.com: clean up if/else block]
  Link: https://lkml.kernel.org/r/20230906191200.68707-1-ubizjak@gmail.com
Link: https://lkml.kernel.org/r/20230904152230.9227-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/panic.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index ffa037fa777d..2807639aab51 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -192,14 +192,15 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
  */
 void nmi_panic(struct pt_regs *regs, const char *msg)
 {
-	int old_cpu, cpu;
+	int old_cpu, this_cpu;
 
-	cpu = raw_smp_processor_id();
-	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+	old_cpu = PANIC_CPU_INVALID;
+	this_cpu = raw_smp_processor_id();
 
-	if (old_cpu == PANIC_CPU_INVALID)
+	/* atomic_try_cmpxchg updates old_cpu on failure */
+	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
 		panic("%s", msg);
-	else if (old_cpu != cpu)
+	else if (old_cpu != this_cpu)
 		nmi_panic_self_stop(regs);
 }
 EXPORT_SYMBOL(nmi_panic);
@@ -311,15 +312,18 @@ void panic(const char *fmt, ...)
 	 * stop themself or will wait until they are stopped by the 1st CPU
 	 * with smp_send_stop().
 	 *
-	 * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
-	 * comes here, so go ahead.
+	 * cmpxchg success means this is the 1st CPU which comes here,
+	 * so go ahead.
 	 * `old_cpu == this_cpu' means we came from nmi_panic() which sets
 	 * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
 	 */
+	old_cpu = PANIC_CPU_INVALID;
 	this_cpu = raw_smp_processor_id();
-	old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
 
-	if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
+	/* atomic_try_cmpxchg updates old_cpu on failure */
+	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+		/* go ahead */
+	} else if (old_cpu != this_cpu)
 		panic_smp_self_stop();
 
 	console_verbose();
-- 
cgit v1.2.3


From e5ecf29c507830f192d3eb255662ad9ba219c442 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 9 Sep 2023 18:45:37 +0200
Subject: signal: complete_signal: use __for_each_thread()

do/while_each_thread should be avoided when possible.

Link: https://lkml.kernel.org/r/20230909164537.GA11633@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/signal.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index fc276fc07d87..ccfc3ded5672 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1058,12 +1058,11 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
 			signal->flags = SIGNAL_GROUP_EXIT;
 			signal->group_exit_code = sig;
 			signal->group_stop_count = 0;
-			t = p;
-			do {
+			__for_each_thread(signal, t) {
 				task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 				sigaddset(&t->pending.signal, SIGKILL);
 				signal_wake_up(t, 1);
-			} while_each_thread(p, t);
+			}
 			return;
 		}
 	}
-- 
cgit v1.2.3


From c7ac8231ace9b07306d0299969e42073b189c70a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 9 Sep 2023 19:25:54 +0200
Subject: getrusage: add the "signal_struct *sig" local variable

No functional changes, cleanup/preparation.

Link: https://lkml.kernel.org/r/20230909172554.GA20441@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sys.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..097cbea62a72 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,6 +1786,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	unsigned long flags;
 	u64 tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
+	struct signal_struct *sig = p->signal;
 
 	memset((char *)r, 0, sizeof (*r));
 	utime = stime = 0;
@@ -1793,7 +1794,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	if (who == RUSAGE_THREAD) {
 		task_cputime_adjusted(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
-		maxrss = p->signal->maxrss;
+		maxrss = sig->maxrss;
 		goto out;
 	}
 
@@ -1803,15 +1804,15 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	switch (who) {
 	case RUSAGE_BOTH:
 	case RUSAGE_CHILDREN:
-		utime = p->signal->cutime;
-		stime = p->signal->cstime;
-		r->ru_nvcsw = p->signal->cnvcsw;
-		r->ru_nivcsw = p->signal->cnivcsw;
-		r->ru_minflt = p->signal->cmin_flt;
-		r->ru_majflt = p->signal->cmaj_flt;
-		r->ru_inblock = p->signal->cinblock;
-		r->ru_oublock = p->signal->coublock;
-		maxrss = p->signal->cmaxrss;
+		utime = sig->cutime;
+		stime = sig->cstime;
+		r->ru_nvcsw = sig->cnvcsw;
+		r->ru_nivcsw = sig->cnivcsw;
+		r->ru_minflt = sig->cmin_flt;
+		r->ru_majflt = sig->cmaj_flt;
+		r->ru_inblock = sig->cinblock;
+		r->ru_oublock = sig->coublock;
+		maxrss = sig->cmaxrss;
 
 		if (who == RUSAGE_CHILDREN)
 			break;
@@ -1821,14 +1822,14 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
 		utime += tgutime;
 		stime += tgstime;
-		r->ru_nvcsw += p->signal->nvcsw;
-		r->ru_nivcsw += p->signal->nivcsw;
-		r->ru_minflt += p->signal->min_flt;
-		r->ru_majflt += p->signal->maj_flt;
-		r->ru_inblock += p->signal->inblock;
-		r->ru_oublock += p->signal->oublock;
-		if (maxrss < p->signal->maxrss)
-			maxrss = p->signal->maxrss;
+		r->ru_nvcsw += sig->nvcsw;
+		r->ru_nivcsw += sig->nivcsw;
+		r->ru_minflt += sig->min_flt;
+		r->ru_majflt += sig->maj_flt;
+		r->ru_inblock += sig->inblock;
+		r->ru_oublock += sig->oublock;
+		if (maxrss < sig->maxrss)
+			maxrss = sig->maxrss;
 		t = p;
 		do {
 			accumulate_thread_rusage(t, r);
-- 
cgit v1.2.3


From 13b7bc60b5353371460a203df6c38ccd38ad7a3a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 9 Sep 2023 19:26:29 +0200
Subject: getrusage: use __for_each_thread()

do/while_each_thread should be avoided when possible.

Plus this change allows to avoid lock_task_sighand(), we can use rcu
and/or sig->stats_lock instead.

Link: https://lkml.kernel.org/r/20230909172629.GA20454@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sys.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 097cbea62a72..67436d465be4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1830,10 +1830,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		r->ru_oublock += sig->oublock;
 		if (maxrss < sig->maxrss)
 			maxrss = sig->maxrss;
-		t = p;
-		do {
+		__for_each_thread(sig, t)
 			accumulate_thread_rusage(t, r);
-		} while_each_thread(p, t);
 		break;
 
 	default:
-- 
cgit v1.2.3


From ed5378a387fd7c382497f2abcf4605e030b64044 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 9 Sep 2023 23:49:51 +0200
Subject: taskstats: fill_stats_for_tgid: use for_each_thread()

do/while_each_thread should be avoided when possible.

Plus I _think_ this change allows to avoid lock_task_sighand() but I am
not sure, I forgot everything about taskstats.  In any case, this code
does not look right in that the same thread can be accounted twice:
taskstats_exit() can account the exiting thread in signal->stats and drop
->siglock but this thread is still on the thread-group list, so
lock_task_sighand() can't help.

Link: https://lkml.kernel.org/r/20230909214951.GA24274@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/taskstats.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 8ce3fa0c19e2..4354ea231fab 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -233,9 +233,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
 	else
 		memset(stats, 0, sizeof(*stats));
 
-	tsk = first;
 	start_time = ktime_get_ns();
-	do {
+	for_each_thread(first, tsk) {
 		if (tsk->exit_state)
 			continue;
 		/*
@@ -258,7 +257,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
 
 		stats->nvcsw += tsk->nvcsw;
 		stats->nivcsw += tsk->nivcsw;
-	} while_each_thread(first, tsk);
+	}
 
 	unlock_task_sighand(first, &flags);
 	rc = 0;
-- 
cgit v1.2.3


From 6309727ef27162deabd5c095c11af24970fba5a2 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 8 Sep 2023 01:40:48 +0200
Subject: kthread: add kthread_stop_put

Add a kthread_stop_put() helper that stops a thread and puts its task
struct.  Use it to replace the various instances of kthread_stop()
followed by put_task_struct().

Remove the kthread_stop_put() macro in usbip that is similar but doesn't
return the result of kthread_stop().

[agruenba@redhat.com: fix kerneldoc comment]
  Link: https://lkml.kernel.org/r/20230911111730.2565537-1-agruenba@redhat.com
[akpm@linux-foundation.org: document kthread_stop_put()'s argument]
Link: https://lkml.kernel.org/r/20230907234048.2499820-1-agruenba@redhat.com
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/accel/ivpu/ivpu_job.c              |  3 +--
 drivers/dma-buf/st-dma-fence-chain.c       | 12 ++++--------
 drivers/dma-buf/st-dma-fence.c             |  4 +---
 drivers/gpu/drm/i915/gt/selftest_migrate.c |  4 +---
 drivers/net/xen-netback/interface.c        |  3 +--
 drivers/usb/usbip/usbip_common.h           |  6 ------
 fs/gfs2/ops_fstype.c                       |  9 +++------
 include/linux/kthread.h                    |  1 +
 kernel/irq/manage.c                        | 15 +++++----------
 kernel/kthread.c                           | 18 ++++++++++++++++++
 kernel/smpboot.c                           |  3 +--
 mm/damon/core.c                            |  3 +--
 net/core/pktgen.c                          |  3 +--
 13 files changed, 38 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c
index de9e69f70af7..76f468c9f761 100644
--- a/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@ -618,6 +618,5 @@ int ivpu_job_done_thread_init(struct ivpu_device *vdev)
 
 void ivpu_job_done_thread_fini(struct ivpu_device *vdev)
 {
-	kthread_stop(vdev->job_done_thread);
-	put_task_struct(vdev->job_done_thread);
+	kthread_stop_put(vdev->job_done_thread);
 }
diff --git a/drivers/dma-buf/st-dma-fence-chain.c b/drivers/dma-buf/st-dma-fence-chain.c
index c0979c8049b5..9c2a0c082a76 100644
--- a/drivers/dma-buf/st-dma-fence-chain.c
+++ b/drivers/dma-buf/st-dma-fence-chain.c
@@ -476,10 +476,9 @@ static int find_race(void *arg)
 	for (i = 0; i < ncpus; i++) {
 		int ret;
 
-		ret = kthread_stop(threads[i]);
+		ret = kthread_stop_put(threads[i]);
 		if (ret && !err)
 			err = ret;
-		put_task_struct(threads[i]);
 	}
 	kfree(threads);
 
@@ -591,8 +590,7 @@ static int wait_forward(void *arg)
 	for (i = 0; i < fc.chain_length; i++)
 		dma_fence_signal(fc.fences[i]);
 
-	err = kthread_stop(tsk);
-	put_task_struct(tsk);
+	err = kthread_stop_put(tsk);
 
 err:
 	fence_chains_fini(&fc);
@@ -621,8 +619,7 @@ static int wait_backward(void *arg)
 	for (i = fc.chain_length; i--; )
 		dma_fence_signal(fc.fences[i]);
 
-	err = kthread_stop(tsk);
-	put_task_struct(tsk);
+	err = kthread_stop_put(tsk);
 
 err:
 	fence_chains_fini(&fc);
@@ -669,8 +666,7 @@ static int wait_random(void *arg)
 	for (i = 0; i < fc.chain_length; i++)
 		dma_fence_signal(fc.fences[i]);
 
-	err = kthread_stop(tsk);
-	put_task_struct(tsk);
+	err = kthread_stop_put(tsk);
 
 err:
 	fence_chains_fini(&fc);
diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c
index fb6e0a6ae2c9..b7c6f7ea9e0c 100644
--- a/drivers/dma-buf/st-dma-fence.c
+++ b/drivers/dma-buf/st-dma-fence.c
@@ -548,11 +548,9 @@ static int race_signal_callback(void *arg)
 		for (i = 0; i < ARRAY_SIZE(t); i++) {
 			int err;
 
-			err = kthread_stop(t[i].task);
+			err = kthread_stop_put(t[i].task);
 			if (err && !ret)
 				ret = err;
-
-			put_task_struct(t[i].task);
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index 3def5ca72dec..0fb07f073baa 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -719,11 +719,9 @@ static int threaded_migrate(struct intel_migrate *migrate,
 		if (IS_ERR_OR_NULL(tsk))
 			continue;
 
-		status = kthread_stop(tsk);
+		status = kthread_stop_put(tsk);
 		if (status && !err)
 			err = status;
-
-		put_task_struct(tsk);
 	}
 
 	kfree(thread);
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index f3f2c07423a6..33c8143619f0 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -672,8 +672,7 @@ err:
 static void xenvif_disconnect_queue(struct xenvif_queue *queue)
 {
 	if (queue->task) {
-		kthread_stop(queue->task);
-		put_task_struct(queue->task);
+		kthread_stop_put(queue->task);
 		queue->task = NULL;
 	}
 
diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h
index d8cbd2dfc2c2..282efca64a01 100644
--- a/drivers/usb/usbip/usbip_common.h
+++ b/drivers/usb/usbip/usbip_common.h
@@ -298,12 +298,6 @@ struct usbip_device {
 	__k;								   \
 })
 
-#define kthread_stop_put(k)		\
-	do {				\
-		kthread_stop(k);	\
-		put_task_struct(k);	\
-	} while (0)
-
 /* usbip_common.c */
 void usbip_dump_urb(struct urb *purb);
 void usbip_dump_header(struct usbip_header *pdu);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 33ca04733e93..ecf789b7168c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1126,8 +1126,7 @@ static int init_threads(struct gfs2_sbd *sdp)
 	return 0;
 
 fail:
-	kthread_stop(sdp->sd_logd_process);
-	put_task_struct(sdp->sd_logd_process);
+	kthread_stop_put(sdp->sd_logd_process);
 	sdp->sd_logd_process = NULL;
 	return error;
 }
@@ -1135,13 +1134,11 @@ fail:
 void gfs2_destroy_threads(struct gfs2_sbd *sdp)
 {
 	if (sdp->sd_logd_process) {
-		kthread_stop(sdp->sd_logd_process);
-		put_task_struct(sdp->sd_logd_process);
+		kthread_stop_put(sdp->sd_logd_process);
 		sdp->sd_logd_process = NULL;
 	}
 	if (sdp->sd_quotad_process) {
-		kthread_stop(sdp->sd_quotad_process);
-		put_task_struct(sdp->sd_quotad_process);
+		kthread_stop_put(sdp->sd_quotad_process);
 		sdp->sd_quotad_process = NULL;
 	}
 }
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2c30ade43bc8..b11f53c1ba2e 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -86,6 +86,7 @@ void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
+int kthread_stop_put(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
 bool kthread_should_stop_or_park(void);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d309ba84e08a..1782f90cd8c6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1852,15 +1852,13 @@ out_thread:
 		struct task_struct *t = new->thread;
 
 		new->thread = NULL;
-		kthread_stop(t);
-		put_task_struct(t);
+		kthread_stop_put(t);
 	}
 	if (new->secondary && new->secondary->thread) {
 		struct task_struct *t = new->secondary->thread;
 
 		new->secondary->thread = NULL;
-		kthread_stop(t);
-		put_task_struct(t);
+		kthread_stop_put(t);
 	}
 out_mput:
 	module_put(desc->owner);
@@ -1971,12 +1969,9 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
 	 * the same bit to a newly requested action.
 	 */
 	if (action->thread) {
-		kthread_stop(action->thread);
-		put_task_struct(action->thread);
-		if (action->secondary && action->secondary->thread) {
-			kthread_stop(action->secondary->thread);
-			put_task_struct(action->secondary->thread);
-		}
+		kthread_stop_put(action->thread);
+		if (action->secondary && action->secondary->thread)
+			kthread_stop_put(action->secondary->thread);
 	}
 
 	/* Last action releases resources */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1eea53050bab..290cbc845225 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -715,6 +715,24 @@ int kthread_stop(struct task_struct *k)
 }
 EXPORT_SYMBOL(kthread_stop);
 
+/**
+ * kthread_stop_put - stop a thread and put its task struct
+ * @k: thread created by kthread_create().
+ *
+ * Stops a thread created by kthread_create() and put its task_struct.
+ * Only use when holding an extra task struct reference obtained by
+ * calling get_task_struct().
+ */
+int kthread_stop_put(struct task_struct *k)
+{
+	int ret;
+
+	ret = kthread_stop(k);
+	put_task_struct(k);
+	return ret;
+}
+EXPORT_SYMBOL(kthread_stop_put);
+
 int kthreadd(void *unused)
 {
 	struct task_struct *tsk = current;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index f47d8f375946..1992b62e980b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -272,8 +272,7 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
 
 		if (tsk) {
-			kthread_stop(tsk);
-			put_task_struct(tsk);
+			kthread_stop_put(tsk);
 			*per_cpu_ptr(ht->store, cpu) = NULL;
 		}
 	}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index bcd2bd9d6c10..2f54f153d7f5 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -699,8 +699,7 @@ static int __damon_stop(struct damon_ctx *ctx)
 	if (tsk) {
 		get_task_struct(tsk);
 		mutex_unlock(&ctx->kdamond_lock);
-		kthread_stop(tsk);
-		put_task_struct(tsk);
+		kthread_stop_put(tsk);
 		return 0;
 	}
 	mutex_unlock(&ctx->kdamond_lock);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f56b8d697014..826250a0f5b1 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3982,8 +3982,7 @@ static void __net_exit pg_net_exit(struct net *net)
 	list_for_each_safe(q, n, &list) {
 		t = list_entry(q, struct pktgen_thread, th_list);
 		list_del(&t->th_list);
-		kthread_stop(t->tsk);
-		put_task_struct(t->tsk);
+		kthread_stop_put(t->tsk);
 		kfree(t);
 	}
 
-- 
cgit v1.2.3


From 2d57792a39e5473c5d98fc0138c4e9ab9c98a57b Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Mon, 11 Sep 2023 22:55:09 +0800
Subject: pid: pid_ns_ctl_handler: remove useless comment

commit 95846ecf9dac("pid: replace pid bitmap implementation with IDR API")
removes 'last_pid' element, and use the idr_get_cursor-idr_set_cursor pair
to set the value of idr, so useless comments should be removed.

Link: https://lkml.kernel.org/r/tencent_157A2A1CAF19A3F5885F0687426159A19708@qq.com
Signed-off-by: Rong Tao <rongtao@cestc.cn>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/pid_namespace.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 619972c78774..3028b2218aa4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -286,12 +286,6 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
 		return -EPERM;
 
-	/*
-	 * Writing directly to ns' last_pid field is OK, since this field
-	 * is volatile in a living namespace anyway and a code writing to
-	 * it should synchronize its usage with external means.
-	 */
-
 	next = idr_get_cursor(&pid_ns->idr) - 1;
 
 	tmp.data = &next;
-- 
cgit v1.2.3


From a6304272b03ece97346f16923453f7e36ec19a5a Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:34 +0800
Subject: crash_core.c: remove unnecessary parameter of function

Patch series "kdump: use generic functions to simplify crashkernel
reservation in arch", v3.

In the current arm64, crashkernel=,high support has been finished after
several rounds of posting and careful reviewing.  The code in arm64 which
parses crashkernel kernel parameters firstly, then reserve memory can be a
good example for other ARCH to refer to.

Whereas in x86_64, the code mixing crashkernel parameter parsing and
memory reserving is twisted, and looks messy.  Refactoring the code to
make it more readable maintainable is necessary.

Here, firstly abstract the crashkernel parameter parsing code into
parse_crashkernel() to make it be able to parse crashkernel=,high|low.
Then abstract the crashkernel memory reserving code into a generic
function reserve_crashkernel_generic().  Finally, in ARCH which
crashkernel=,high support is needed, a simple arch_reserve_crashkernel()
can be added to call above two functions.  This can remove the duplicated
implmentation code in each ARCH, like arm64, x86_64 and riscv.

crashkernel=512M,high
crashkernel=512M,high crashkernel=256M,low
crashkernel=512M,high crashkernel=0M,low
crashkernel=0M,high crashkernel=256M,low
crashkernel=512M
crashkernel=512M@0x4f000000
crashkernel=1G-4G:256M,4G-64G:320M,64G-:576M
crashkernel=0M


This patch (of 9):

In all call sites of __parse_crashkernel(), the parameter 'name' is
hardcoded as "crashkernel=".  So remove the unnecessary parameter 'name',
add local varibale 'name' inside __parse_crashkernel() instead.

Link: https://lkml.kernel.org/r/20230914033142.676708-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20230914033142.676708-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 2f675ef045d4..507113932aa9 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -248,11 +248,11 @@ static int __init __parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base,
-			     const char *name,
 			     const char *suffix)
 {
 	char	*first_colon, *first_space;
 	char	*ck_cmdline;
+	char	*name = "crashkernel=";
 
 	BUG_ON(!crash_size || !crash_base);
 	*crash_size = 0;
@@ -290,7 +290,7 @@ int __init parse_crashkernel(char *cmdline,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel=", NULL);
+				NULL);
 }
 
 int __init parse_crashkernel_high(char *cmdline,
@@ -299,7 +299,7 @@ int __init parse_crashkernel_high(char *cmdline,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+				suffix_tbl[SUFFIX_HIGH]);
 }
 
 int __init parse_crashkernel_low(char *cmdline,
@@ -308,7 +308,7 @@ int __init parse_crashkernel_low(char *cmdline,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
+				suffix_tbl[SUFFIX_LOW]);
 }
 
 /*
-- 
cgit v1.2.3


From a9e1a3d84e4a0ea560ed4d84c28d06dbfdffed22 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:35 +0800
Subject: crash_core: change the prototype of function parse_crashkernel()

Add two parameters 'low_size' and 'high' to function parse_crashkernel(),
later crashkernel=,high|low parsing will be added.  Make adjustments in
all call sites of parse_crashkernel() in arch.

Link: https://lkml.kernel.org/r/20230914033142.676708-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/kernel/setup.c              |  3 ++-
 arch/arm64/mm/init.c                 |  2 +-
 arch/ia64/kernel/setup.c             |  2 +-
 arch/loongarch/kernel/setup.c        |  4 +++-
 arch/mips/kernel/setup.c             |  3 ++-
 arch/powerpc/kernel/fadump.c         |  2 +-
 arch/powerpc/kexec/core.c            |  2 +-
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/riscv/mm/init.c                 |  2 +-
 arch/s390/kernel/setup.c             |  4 ++--
 arch/sh/kernel/machine_kexec.c       |  2 +-
 arch/x86/kernel/setup.c              |  3 ++-
 include/linux/crash_core.h           |  3 ++-
 kernel/crash_core.c                  | 15 ++++++++++++---
 14 files changed, 32 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index c66b560562b3..e2bb7afd0683 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1010,7 +1010,8 @@ static void __init reserve_crashkernel(void)
 
 	total_mem = get_total_mem();
 	ret = parse_crashkernel(boot_command_line, total_mem,
-				&crash_size, &crash_base);
+				&crash_size, &crash_base,
+				NULL, NULL);
 	/* invalid value specified or crashkernel=0 */
 	if (ret || !crash_size)
 		return;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 8a0f8604348b..801c59c39a8f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -142,7 +142,7 @@ static void __init reserve_crashkernel(void)
 
 	/* crashkernel=X[@offset] */
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
-				&crash_size, &crash_base);
+				&crash_size, &crash_base, NULL, NULL);
 	if (ret == -ENOENT) {
 		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
 		if (ret || !crash_size)
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 5a55ac82c13a..4faea2d2cf07 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -277,7 +277,7 @@ static void __init setup_crashkernel(unsigned long total, int *n)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, total,
-			&size, &base);
+			&size, &base, NULL, NULL);
 	if (ret == 0 && size > 0) {
 		if (!base) {
 			sort_regions(rsvd_region, *n);
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 7783f0a3d742..4de32b07c0dc 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -267,7 +267,9 @@ static void __init arch_parse_crashkernel(void)
 	unsigned long long crash_base, crash_size;
 
 	total_mem = memblock_phys_mem_size();
-	ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+	ret = parse_crashkernel(boot_command_line, total_mem,
+				&crash_size, &crash_base,
+				NULL, NULL);
 	if (ret < 0 || crash_size <= 0)
 		return;
 
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index cb871eb784a7..08321c945ac4 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -460,7 +460,8 @@ static void __init mips_parse_crashkernel(void)
 
 	total_mem = memblock_phys_mem_size();
 	ret = parse_crashkernel(boot_command_line, total_mem,
-				&crash_size, &crash_base);
+				&crash_size, &crash_base,
+				NULL, NULL);
 	if (ret != 0 || crash_size <= 0)
 		return;
 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 3ff2da7b120b..d14eda1e8589 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -313,7 +313,7 @@ static __init u64 fadump_calculate_reserve_size(void)
 	 * memory at a predefined offset.
 	 */
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-				&size, &base);
+				&size, &base, NULL, NULL);
 	if (ret == 0 && size > 0) {
 		unsigned long max_size;
 
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index de64c7962991..9346c960b296 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -109,7 +109,7 @@ void __init reserve_crashkernel(void)
 	total_mem_sz = memory_limit ? memory_limit : memblock_phys_mem_size();
 	/* use common parsing */
 	ret = parse_crashkernel(boot_command_line, total_mem_sz,
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 2fb3edafe9ab..b4f2786a7d2b 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, size, &crash_size,
-				&crash_base);
+				&crash_base, NULL, NULL);
 	if (ret != 0 || crash_size == 0)
 		return;
 	if (crash_base == 0)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 0798bd861dcb..9fe448900059 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1388,7 +1388,7 @@ static void __init reserve_crashkernel(void)
 	}
 
 	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
-				&crash_size, &crash_base);
+				&crash_size, &crash_base, NULL, NULL);
 	if (ret == -ENOENT) {
 		/* Fallback to crashkernel=X,[high,low] */
 		ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index de6ad0fb2328..e555b576d3c8 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -625,8 +625,8 @@ static void __init reserve_crashkernel(void)
 	phys_addr_t low, high;
 	int rc;
 
-	rc = parse_crashkernel(boot_command_line, ident_map_size, &crash_size,
-			       &crash_base);
+	rc = parse_crashkernel(boot_command_line, ident_map_size,
+			       &crash_size, &crash_base, NULL, NULL);
 
 	crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
 	crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 223c14f44af7..fa3a7b36190a 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -154,7 +154,7 @@ void __init reserve_crashkernel(void)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-			&crash_size, &crash_base);
+			&crash_size, &crash_base, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b098b1fa2470..655c04812905 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -553,7 +553,8 @@ static void __init reserve_crashkernel(void)
 	total_mem = memblock_phys_mem_size();
 
 	/* crashkernel=XM */
-	ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+	ret = parse_crashkernel(boot_command_line, total_mem,
+				&crash_size, &crash_base, NULL, NULL);
 	if (ret != 0 || crash_size <= 0) {
 		/* crashkernel=X,high */
 		ret = parse_crashkernel_high(boot_command_line, total_mem,
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 0c06561bf5ff..6156355ef831 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -80,7 +80,8 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 void final_note(Elf_Word *buf);
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
+		unsigned long long *crash_size, unsigned long long *crash_base,
+		unsigned long long *low_size, bool *high);
 int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 507113932aa9..33ced5b5ed4e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -287,10 +287,19 @@ static int __init __parse_crashkernel(char *cmdline,
 int __init parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
+			     unsigned long long *crash_base,
+			     unsigned long long *low_size,
+			     bool *high)
 {
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				NULL);
+	int ret;
+
+	/* crashkernel=X[@offset] */
+	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
+				crash_base, NULL);
+	if (!high)
+		return ret;
+
+	return 0;
 }
 
 int __init parse_crashkernel_high(char *cmdline,
-- 
cgit v1.2.3


From 70916e9c8d9f1a286c99727072b22e395097909f Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:36 +0800
Subject: crash_core: change parse_crashkernel() to support
 crashkernel=,high|low parsing

Now parse_crashkernel() is a real entry point for all kinds of crahskernel
parsing on any architecture.

And wrap the crahskernel=,high|low handling inside
CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope.

Link: https://lkml.kernel.org/r/20230914033142.676708-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  6 ++++++
 kernel/crash_core.c        | 36 +++++++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 6156355ef831..d8050a7eab01 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -79,6 +79,12 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
 void final_note(Elf_Word *buf);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
+#endif
+#endif
+
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 33ced5b5ed4e..99a243540a35 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -283,6 +283,9 @@ static int __init __parse_crashkernel(char *cmdline,
 /*
  * That function is the entry point for command line parsing and should be
  * called from the arch-specific code.
+ *
+ * If crashkernel=,high|low is supported on architecture, non-NULL values
+ * should be passed to parameters 'low_size' and 'high'.
  */
 int __init parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
@@ -296,10 +299,37 @@ int __init parse_crashkernel(char *cmdline,
 	/* crashkernel=X[@offset] */
 	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
 				crash_base, NULL);
-	if (!high)
-		return ret;
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+	/*
+	 * If non-NULL 'high' passed in and no normal crashkernel
+	 * setting detected, try parsing crashkernel=,high|low.
+	 */
+	if (high && ret == -ENOENT) {
+		ret = __parse_crashkernel(cmdline, 0, crash_size,
+				crash_base, suffix_tbl[SUFFIX_HIGH]);
+		if (ret || !*crash_size)
+			return -EINVAL;
 
-	return 0;
+		/*
+		 * crashkernel=Y,low can be specified or not, but invalid value
+		 * is not allowed.
+		 */
+		ret = __parse_crashkernel(cmdline, 0, low_size,
+				crash_base, suffix_tbl[SUFFIX_LOW]);
+		if (ret == -ENOENT) {
+			*low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+			ret = 0;
+		} else if (ret) {
+			return ret;
+		}
+
+		*high = true;
+	}
+#endif
+	if (!*crash_size)
+		ret = -EINVAL;
+
+	return ret;
 }
 
 int __init parse_crashkernel_high(char *cmdline,
-- 
cgit v1.2.3


From 0ab97169aa0517079b22c2e64192906caa5dc6d5 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:37 +0800
Subject: crash_core: add generic function to do reservation

In architecture like x86_64, arm64 and riscv, they have vast virtual
address space and usually have huge physical memory RAM.  Their
crashkernel reservation doesn't have to be limited under 4G RAM, but can
be extended to the whole physical memory via crashkernel=,high support.

Now add function reserve_crashkernel_generic() to reserve crashkernel
memory if users specify any case of kernel pamameters, like
crashkernel=xM[@offset] or crashkernel=,high|low.

This is preparation to simplify code of crashkernel=,high support in
architecutures.

Link: https://lkml.kernel.org/r/20230914033142.676708-5-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  28 ++++++++++++
 kernel/crash_core.c        | 107 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index d8050a7eab01..4dbd6565e0ff 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -93,6 +93,34 @@ int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
+#endif
+#ifndef CRASH_ALIGN
+#define CRASH_ALIGN			SZ_2M
+#endif
+#ifndef CRASH_ADDR_LOW_MAX
+#define CRASH_ADDR_LOW_MAX		SZ_4G
+#endif
+#ifndef CRASH_ADDR_HIGH_MAX
+#define CRASH_ADDR_HIGH_MAX		memblock_end_of_DRAM()
+#endif
+
+void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high);
+#else
+static inline void __init reserve_crashkernel_generic(char *cmdline,
+		unsigned long long crash_size,
+		unsigned long long crash_base,
+		unsigned long long crash_low_size,
+		bool high)
+{}
+#endif
+
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 99a243540a35..72e358197b52 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -5,7 +5,6 @@
  */
 
 #include <linux/buildid.h>
-#include <linux/crash_core.h>
 #include <linux/init.h>
 #include <linux/utsname.h>
 #include <linux/vmalloc.h>
@@ -13,6 +12,9 @@
 #include <linux/kexec.h>
 #include <linux/memory.h>
 #include <linux/cpuhotplug.h>
+#include <linux/memblock.h>
+#include <linux/kexec.h>
+#include <linux/kmemleak.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -360,6 +362,109 @@ static int __init parse_crashkernel_dummy(char *arg)
 }
 early_param("crashkernel", parse_crashkernel_dummy);
 
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static int __init reserve_crashkernel_low(unsigned long long low_size)
+{
+#ifdef CONFIG_64BIT
+	unsigned long long low_base;
+
+	low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
+	if (!low_base) {
+		pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
+		return -ENOMEM;
+	}
+
+	pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
+		low_base, low_base + low_size, low_size >> 20);
+
+	crashk_low_res.start = low_base;
+	crashk_low_res.end   = low_base + low_size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+	return 0;
+}
+
+void __init reserve_crashkernel_generic(char *cmdline,
+			     unsigned long long crash_size,
+			     unsigned long long crash_base,
+			     unsigned long long crash_low_size,
+			     bool high)
+{
+	unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0;
+	bool fixed_base = false;
+
+	/* User specifies base address explicitly. */
+	if (crash_base) {
+		fixed_base = true;
+		search_base = crash_base;
+		search_end = crash_base + crash_size;
+	} else if (high) {
+		search_base = CRASH_ADDR_LOW_MAX;
+		search_end = CRASH_ADDR_HIGH_MAX;
+	}
+
+retry:
+	crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+					       search_base, search_end);
+	if (!crash_base) {
+		/*
+		 * For crashkernel=size[KMG]@offset[KMG], print out failure
+		 * message if can't reserve the specified region.
+		 */
+		if (fixed_base) {
+			pr_warn("crashkernel reservation failed - memory is in use.\n");
+			return;
+		}
+
+		/*
+		 * For crashkernel=size[KMG], if the first attempt was for
+		 * low memory, fall back to high memory, the minimum required
+		 * low memory will be reserved later.
+		 */
+		if (!high && search_end == CRASH_ADDR_LOW_MAX) {
+			search_end = CRASH_ADDR_HIGH_MAX;
+			search_base = CRASH_ADDR_LOW_MAX;
+			crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
+			goto retry;
+		}
+
+		/*
+		 * For crashkernel=size[KMG],high, if the first attempt was
+		 * for high memory, fall back to low memory.
+		 */
+		if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+			search_end = CRASH_ADDR_LOW_MAX;
+			search_base = 0;
+			goto retry;
+		}
+		pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+			crash_size);
+		return;
+	}
+
+	if ((crash_base > CRASH_ADDR_LOW_MAX) &&
+	     crash_low_size && reserve_crashkernel_low(crash_low_size)) {
+		memblock_phys_free(crash_base, crash_size);
+		return;
+	}
+
+	pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
+		crash_base, crash_base + crash_size, crash_size >> 20);
+
+	/*
+	 * The crashkernel memory will be removed from the kernel linear
+	 * map. Inform kmemleak so that it won't try to access it.
+	 */
+	kmemleak_ignore_phys(crash_base);
+	if (crashk_low_res.end)
+		kmemleak_ignore_phys(crashk_low_res.start);
+
+	crashk_res.start = crash_base;
+	crashk_res.end = crash_base + crash_size - 1;
+	insert_resource(&iomem_resource, &crashk_res);
+}
+#endif
+
 int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 			  void **addr, unsigned long *sz)
 {
-- 
cgit v1.2.3


From b631b95dded5e7f007a3a79cbaf82ef50c1e2cf7 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:38 +0800
Subject: crash_core: move crashk_*res definition into crash_core.c

Both crashk_res and crashk_low_res are used to mark the reserved
crashkernel regions in iomem_resource tree.  And later the generic
crashkernel resrvation will be added into crash_core.c.  So move
crashk_res and crashk_low_res definition into crash_core.c to avoid
compiling error if CONFIG_CRASH_CORE=on while CONFIG_KEXEC_CORE is unset.

Meanwhile include <asm/crash_core.h> in <linux/crash_core.h> if generic
reservation is needed.  In that case, <asm/crash_core.h> need be added by
ARCH.  In asm/crash_core.h, ARCH can provide its own macro definitions to
override macros in <linux/crash_core.h> if needed.  Wrap the including
into CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION ifdeffery scope to
avoid compiling error in other ARCH-es which don't take the generic
reservation way yet.

Link: https://lkml.kernel.org/r/20230914033142.676708-6-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  8 ++++++++
 include/linux/kexec.h      |  4 ----
 kernel/crash_core.c        | 16 ++++++++++++++++
 kernel/kexec_core.c        | 17 -----------------
 4 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 4dbd6565e0ff..3c735a7e33fb 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -5,6 +5,14 @@
 #include <linux/linkage.h>
 #include <linux/elfcore.h>
 #include <linux/elf.h>
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#include <asm/crash_core.h>
+#endif
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+extern struct resource crashk_low_res;
 
 #define CRASH_CORE_NOTE_NAME	   "CORE"
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 32c78078552c..8227455192b7 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -22,10 +22,6 @@
 #include <uapi/linux/kexec.h>
 #include <linux/verification.h>
 
-/* Location of a reserved region to hold the crash kernel.
- */
-extern struct resource crashk_res;
-extern struct resource crashk_low_res;
 extern note_buf_t __percpu *crash_notes;
 
 #ifdef CONFIG_KEXEC_CORE
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 72e358197b52..fa8808c4f00e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -35,6 +35,22 @@ u32 *vmcoreinfo_note;
 /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
 static unsigned char *vmcoreinfo_data_safecopy;
 
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_CRASH_KERNEL
+};
+struct resource crashk_low_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
+	.desc  = IORES_DESC_CRASH_KERNEL
+};
+
 /*
  * parsing the "crashkernel" commandline
  *
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9dc728982d79..be5642a4ec49 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,23 +52,6 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
-	.desc  = IORES_DESC_CRASH_KERNEL
-};
-struct resource crashk_low_res = {
-	.name  = "Crash kernel",
-	.start = 0,
-	.end   = 0,
-	.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
-	.desc  = IORES_DESC_CRASH_KERNEL
-};
-
 int kexec_should_crash(struct task_struct *p)
 {
 	/*
-- 
cgit v1.2.3


From c37e56cac3d62c69f093904afbc58fc428484d14 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 14 Sep 2023 11:31:42 +0800
Subject: crash_core.c: remove unneeded functions

So far, nobody calls functions parse_crashkernel_high() and
parse_crashkernel_low(), remove both of them.

Link: https://lkml.kernel.org/r/20230914033142.676708-10-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Jiahao <chenjiahao16@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_core.h |  4 ----
 kernel/crash_core.c        | 18 ------------------
 2 files changed, 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 3c735a7e33fb..3426f6eef60b 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -96,10 +96,6 @@ void final_note(Elf_Word *buf);
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, bool *high);
-int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
-int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
-		unsigned long long *crash_size, unsigned long long *crash_base);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fa8808c4f00e..efe87d501c8c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -350,24 +350,6 @@ int __init parse_crashkernel(char *cmdline,
 	return ret;
 }
 
-int __init parse_crashkernel_high(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-			     unsigned long long system_ram,
-			     unsigned long long *crash_size,
-			     unsigned long long *crash_base)
-{
-	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-				suffix_tbl[SUFFIX_LOW]);
-}
-
 /*
  * Add a dummy early_param handler to mark crashkernel= as a known command line
  * parameter and suppress incorrect warnings in init/main.c.
-- 
cgit v1.2.3


From b21f18ef964b2c71aa0b451df6d17b7bcad8280d Mon Sep 17 00:00:00 2001
From: Pavankumar Kondeti <quic_pkondeti@quicinc.com>
Date: Wed, 4 Oct 2023 10:31:15 +0530
Subject: PM: hibernate: Fix copying the zero bitmap to safe pages

The following crash is observed 100% of the time during resume from
the hibernation on a x86 QEMU system.

[   12.931887]  ? __die_body+0x1a/0x60
[   12.932324]  ? page_fault_oops+0x156/0x420
[   12.932824]  ? search_exception_tables+0x37/0x50
[   12.933389]  ? fixup_exception+0x21/0x300
[   12.933889]  ? exc_page_fault+0x69/0x150
[   12.934371]  ? asm_exc_page_fault+0x26/0x30
[   12.934869]  ? get_buffer.constprop.0+0xac/0x100
[   12.935428]  snapshot_write_next+0x7c/0x9f0
[   12.935929]  ? submit_bio_noacct_nocheck+0x2c2/0x370
[   12.936530]  ? submit_bio_noacct+0x44/0x2c0
[   12.937035]  ? hib_submit_io+0xa5/0x110
[   12.937501]  load_image+0x83/0x1a0
[   12.937919]  swsusp_read+0x17f/0x1d0
[   12.938355]  ? create_basic_memory_bitmaps+0x1b7/0x240
[   12.938967]  load_image_and_restore+0x45/0xc0
[   12.939494]  software_resume+0x13c/0x180
[   12.939994]  resume_store+0xa3/0x1d0

The commit being fixed introduced a bug in copying the zero bitmap
to safe pages. A temporary bitmap is allocated with PG_ANY flag in
prepare_image() to make a copy of zero bitmap after the unsafe pages
are marked. Freeing this temporary bitmap with PG_UNSAFE_KEEP later
results in an inconsistent state of unsafe pages. Since free bit is
left as is for this temporary bitmap after free, these pages are
treated as unsafe pages when they are allocated again. This results
in incorrect calculation of the number of pages pre-allocated for the
image.

nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages;

The allocate_unsafe_pages is estimated to be higher than the actual
which results in running short of pages in safe_pages_list. Hence the
crash is observed in get_buffer() due to NULL pointer access of
safe_pages_list.

Fix this issue by creating the temporary zero bitmap from safe pages
(free bit not set) so that the corresponding free bits can be cleared
while freeing this bitmap.

Fixes: 005e8dddd497 ("PM: hibernate: don't store zero pages in the image file")
Suggested-by:: Brian Geffon <bgeffon@google.com>
Signed-off-by: Pavankumar Kondeti <quic_pkondeti@quicinc.com>
Reviewed-by: Brian Geffon <bgeffon@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 87e9f7e2bdc0..0f12e0a97e43 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2647,7 +2647,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
 	memory_bm_free(bm, PG_UNSAFE_KEEP);
 
 	/* Make a copy of zero_bm so it can be created in safe pages */
-	error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY);
+	error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE);
 	if (error)
 		goto Free;
 
@@ -2660,7 +2660,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm,
 		goto Free;
 
 	duplicate_memory_bitmap(zero_bm, &tmp);
-	memory_bm_free(&tmp, PG_UNSAFE_KEEP);
+	memory_bm_free(&tmp, PG_UNSAFE_CLEAR);
 	/* At this point zero_bm is in safe pages and it can be used for restoring. */
 
 	if (nr_highmem > 0) {
-- 
cgit v1.2.3


From 9b81d3a5be05d350ac93d99762c7ee91fe29b4cb Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <luizcap@amazon.com>
Date: Wed, 27 Sep 2023 14:25:40 +0000
Subject: cgroup: add cgroup_favordynmods= command-line option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have a need of using favordynmods with cgroup v1, which doesn't support
changing mount flags during remount. Enabling CONFIG_CGROUP_FAVOR_DYNMODS at
build-time is not an option because we want to be able to selectively
enable it for certain systems.

This commit addresses this by introducing the cgroup_favordynmods=
command-line option. This option works for both cgroup v1 and v2 and also
allows for disabling favorynmods when the kernel built with
CONFIG_CGROUP_FAVOR_DYNMODS=y.

Also, note that when cgroup_favordynmods=true favordynmods is never
disabled in cgroup_destroy_root().

Signed-off-by: Luiz Capitulino <luizcap@amazon.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  4 ++++
 kernel/cgroup/cgroup.c                          | 18 ++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0a1731a0f0ef..8b744d39d393 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -580,6 +580,10 @@
 			named mounts. Specifying both "all" and "named" disables
 			all v1 hierarchies.
 
+	cgroup_favordynmods= [KNL] Enable or Disable favordynmods.
+			Format: { "true" | "false" }
+			Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS.
+
 	cgroup.memory=	[KNL] Pass options to the cgroup memory controller.
 			Format: <string>
 			nosocket -- Disable socket memory accounting.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 833ac6dd15d9..059cd5651d41 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -207,6 +207,8 @@ static u16 have_exit_callback __read_mostly;
 static u16 have_release_callback __read_mostly;
 static u16 have_canfork_callback __read_mostly;
 
+static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
 	.ns.count	= REFCOUNT_INIT(2),
@@ -1350,7 +1352,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 		cgroup_root_count--;
 	}
 
-	cgroup_favor_dynmods(root, false);
+	if (!have_favordynmods)
+		cgroup_favor_dynmods(root, false);
+
 	cgroup_exit_root_id(root);
 
 	cgroup_unlock();
@@ -2245,9 +2249,9 @@ static int cgroup_init_fs_context(struct fs_context *fc)
 	fc->user_ns = get_user_ns(ctx->ns->user_ns);
 	fc->global = true;
 
-#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
-	ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
-#endif
+	if (have_favordynmods)
+		ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+
 	return 0;
 }
 
@@ -6766,6 +6770,12 @@ static int __init enable_cgroup_debug(char *str)
 }
 __setup("cgroup_debug", enable_cgroup_debug);
 
+static int __init cgroup_favordynmods_setup(char *str)
+{
+	return (kstrtobool(str, &have_favordynmods) == 0);
+}
+__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);
+
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest
-- 
cgit v1.2.3


From 46c521bac592251229acdd2cd67976a7b1f88bed Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 3 Oct 2023 10:44:20 -0400
Subject: cgroup/cpuset: Enable invalid to valid local partition transition

When a local partition becomes invalid, it won't transition back to
valid partition automatically if a proper "cpuset.cpus.exclusive" or
"cpuset.cpus" change is made. Instead, system administrators have to
explicitly echo "root" or "isolated" into the "cpuset.cpus.partition"
file at the partition root.

This patch now enables the automatic transition of an invalid local
partition back to valid when there is a proper "cpuset.cpus.exclusive"
or "cpuset.cpus" change.

Automatic transition of an invalid remote partition to a valid one,
however, is not covered by this patch. They still need an explicit
write to "cpuset.cpus.partition" to become valid again.

The test_cpuset_prs.sh test script is updated to add new test cases to
test this automatic state transition.

Reported-by: Pierre Gondois <pierre.gondois@arm.com>
Link: https://lore.kernel.org/lkml/9777f0d2-2fdf-41cb-bd01-19c52939ef42@arm.com
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c                            | 79 ++++++++++++++---------
 tools/testing/selftests/cgroup/test_cpuset_prs.sh | 21 ++++--
 2 files changed, 62 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 15f399153a2e..93facdab513c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1806,17 +1806,28 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		 *
 		 * Compute add/delete mask to/from effective_cpus
 		 *
-		 * addmask = effective_xcpus & ~newmask & parent->effective_xcpus
-		 * delmask = newmask & ~cs->effective_xcpus
-		 *		     & parent->effective_xcpus
+		 * For valid partition:
+		 *   addmask = exclusive_cpus & ~newmask
+		 *			      & parent->effective_xcpus
+		 *   delmask = newmask & ~exclusive_cpus
+		 *		       & parent->effective_xcpus
+		 *
+		 * For invalid partition:
+		 *   delmask = newmask & parent->effective_xcpus
 		 */
-		cpumask_andnot(tmp->addmask, xcpus, newmask);
-		adding = cpumask_and(tmp->addmask, tmp->addmask,
-				     parent->effective_xcpus);
+		if (is_prs_invalid(old_prs)) {
+			adding = false;
+			deleting = cpumask_and(tmp->delmask,
+					newmask, parent->effective_xcpus);
+		} else {
+			cpumask_andnot(tmp->addmask, xcpus, newmask);
+			adding = cpumask_and(tmp->addmask, tmp->addmask,
+					     parent->effective_xcpus);
 
-		cpumask_andnot(tmp->delmask, newmask, xcpus);
-		deleting = cpumask_and(tmp->delmask, tmp->delmask,
-				       parent->effective_xcpus);
+			cpumask_andnot(tmp->delmask, newmask, xcpus);
+			deleting = cpumask_and(tmp->delmask, tmp->delmask,
+					       parent->effective_xcpus);
+		}
 		/*
 		 * Make partition invalid if parent's effective_cpus could
 		 * become empty and there are tasks in the parent.
@@ -1910,9 +1921,11 @@ write_error:
 
 	/*
 	 * Transitioning between invalid to valid or vice versa may require
-	 * changing CS_CPU_EXCLUSIVE.
+	 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
+	 * validate_change() has already been successfully called and
+	 * CPU lists in cs haven't been updated yet. So defer it to later.
 	 */
-	if (old_prs != new_prs) {
+	if ((old_prs != new_prs) && (cmd != partcmd_update))  {
 		int err = update_partition_exclusive(cs, new_prs);
 
 		if (err)
@@ -1960,6 +1973,9 @@ write_error:
 
 	spin_unlock_irq(&callback_lock);
 
+	if ((old_prs != new_prs) && (cmd == partcmd_update))
+		update_partition_exclusive(cs, new_prs);
+
 	if (adding || deleting) {
 		update_tasks_cpumask(parent, tmp->addmask);
 		update_sibling_cpumasks(parent, cs, tmp);
@@ -2356,8 +2372,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (alloc_cpumasks(NULL, &tmp))
 		return -ENOMEM;
 
-	if (is_partition_valid(cs)) {
-		if (cpumask_empty(trialcs->effective_xcpus)) {
+	if (old_prs) {
+		if (is_partition_valid(cs) &&
+		    cpumask_empty(trialcs->effective_xcpus)) {
 			invalidate = true;
 			cs->prs_err = PERR_INVCPUS;
 		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
@@ -2391,13 +2408,16 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		 */
 		invalidate = true;
 		rcu_read_lock();
-		cpuset_for_each_child(cp, css, parent)
+		cpuset_for_each_child(cp, css, parent) {
+			struct cpumask *xcpus = fetch_xcpus(trialcs);
+
 			if (is_partition_valid(cp) &&
-			    cpumask_intersects(trialcs->effective_xcpus, cp->effective_xcpus)) {
+			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
 				rcu_read_unlock();
 				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
 				rcu_read_lock();
 			}
+		}
 		rcu_read_unlock();
 		retval = 0;
 	}
@@ -2405,18 +2425,24 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto out_free;
 
-	if (is_partition_valid(cs)) {
+	if (is_partition_valid(cs) ||
+	   (is_partition_invalid(cs) && !invalidate)) {
+		struct cpumask *xcpus = trialcs->effective_xcpus;
+
+		if (cpumask_empty(xcpus) && is_partition_invalid(cs))
+			xcpus = trialcs->cpus_allowed;
+
 		/*
 		 * Call remote_cpus_update() to handle valid remote partition
 		 */
 		if (is_remote_partition(cs))
-			remote_cpus_update(cs, trialcs->effective_xcpus, &tmp);
+			remote_cpus_update(cs, xcpus, &tmp);
 		else if (invalidate)
 			update_parent_effective_cpumask(cs, partcmd_invalidate,
 							NULL, &tmp);
 		else
 			update_parent_effective_cpumask(cs, partcmd_update,
-						trialcs->effective_xcpus, &tmp);
+							xcpus, &tmp);
 	} else if (!cpumask_empty(cs->exclusive_cpus)) {
 		/*
 		 * Use trialcs->effective_cpus as a temp cpumask
@@ -2493,7 +2519,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval)
 		return retval;
 
-	if (is_partition_valid(cs)) {
+	if (old_prs) {
 		if (cpumask_empty(trialcs->effective_xcpus)) {
 			invalidate = true;
 			cs->prs_err = PERR_INVCPUS;
@@ -2927,19 +2953,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		return 0;
 
 	/*
-	 * For a previously invalid partition root with valid partition root
-	 * parent, treat it as if it is a "member". Otherwise, reject it as
-	 * remote partition cannot currently self-recover from an invalid
-	 * state.
+	 * Treat a previously invalid partition root as if it is a "member".
 	 */
-	if (new_prs && is_prs_invalid(old_prs)) {
-		if (is_partition_valid(parent)) {
-			old_prs = PRS_MEMBER;
-		} else {
-			cs->partition_root_state = -new_prs;
-			return 0;
-		}
-	}
+	if (new_prs && is_prs_invalid(old_prs))
+		old_prs = PRS_MEMBER;
 
 	if (alloc_cpumasks(NULL, &tmpmask))
 		return -ENOMEM;
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index 0f4f4a57ae12..a6e9848189d6 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -220,7 +220,8 @@ test_isolated()
 #      +- B1
 #
 #  P<v> = set cpus.partition (0:member, 1:root, 2:isolated)
-#  C<l> = add cpu-list
+#  C<l> = add cpu-list to cpuset.cpus
+#  X<l> = add cpu-list to cpuset.cpus.exclusive
 #  S<p> = use prefix in subtree_control
 #  T    = put a task into cgroup
 #  O<c>=<v> = Write <v> to CPU online file of <c>
@@ -318,16 +319,19 @@ TEST_MATRIX=(
 	" C0-3:S+ C1-3:S+  C2      .    X2-3   X2-3   T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2"
 
 	# cpus.exclusive.effective clearing test
-	" C0-3:S+ C1-3:S+  C2      .   X2-3:X    .      .      .      0 A1:0-3,A2:1-3,A3:2,XA1:"
+	" C0-3:S+ C1-3:S+  C2      .   X2-3:X    .      .      .     0 A1:0-3,A2:1-3,A3:2,XA1:"
 
-	# Invalid to valid remote partition indirect transition test via member
-	" C0-3:S+   C1-3    .      .      .    X3:P2    .      .      0 A1:0-3,A2:1-3,XA2: A2:P-2"
+	# Invalid to valid remote partition transition test
+	" C0-3:S+   C1-3    .      .      .    X3:P2    .      .     0 A1:0-3,A2:1-3,XA2: A2:P-2"
 	" C0-3:S+ C1-3:X3:P2
-			    .      .    X2-3   P0:P2    .      .      0 A1:0-2,A2:3,XA2:3 A2:P2 3"
+			    .      .    X2-3    P2      .      .     0 A1:0-2,A2:3,XA2:3 A2:P2 3"
 
 	# Invalid to valid local partition direct transition tests
-	" C1-3:S+:P2 C2-3:X1:P2 .  .      .      .      .      .      0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3"
-	" C1-3:S+:P2 C2-3:X1:P2 .  .      .    X3:P2    .      .      0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
+	" C1-3:S+:P2 C2-3:X1:P2 .  .      .      .      .      .     0 A1:1-3,XA1:1-3,A2:2-3:XA2: A1:P2,A2:P-2 1-3"
+	" C1-3:S+:P2 C2-3:X1:P2 .  .      .    X3:P2    .      .     0 A1:1-2,XA1:1-3,A2:3:XA2:3 A1:P2,A2:P2 1-3"
+	"  C0-3:P2   .      .    C4-6   C0-4     .      .      .     0 A1:0-4,B1:4-6 A1:P-2,B1:P0"
+	"  C0-3:P2   .      .    C4-6 C0-4:C0-3  .      .      .     0 A1:0-3,B1:4-6 A1:P2,B1:P0 0-3"
+	"  C0-3:P2   .      .  C3-5:C4-5  .      .      .      .     0 A1:0-3,B1:4-5 A1:P2,B1:P0 0-3"
 
 	# Local partition invalidation tests
 	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
@@ -336,6 +340,9 @@ TEST_MATRIX=(
 				   .      .     X4      .      .     0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
 	" C0-3:X1-3:S+:P2 C1-3:X2-3:S+:P2 C2-3:X3:P2 \
 				   .      .     C4      .      .     0 A1:1-3,A2:1-3,A3:2-3,XA2:,XA3: A1:P2,A2:P-2,A3:P-2 1-3"
+	# Local partition CPU change tests
+	" C0-5:S+:P2 C4-5:S+:P1 .  .      .    C3-5     .      .     0 A1:0-2,A2:3-5 A1:P2,A2:P1 0-2"
+	" C0-5:S+:P2 C4-5:S+:P1 .  .    C1-5     .      .      .     0 A1:1-3,A2:4-5 A1:P2,A2:P1 1-3"
 
 	# cpus_allowed/exclusive_cpus update tests
 	" C0-3:X2-3:S+ C1-3:X2-3:S+ C2-3:X2-3 \
-- 
cgit v1.2.3


From 783a8334ec1cadefbb992ca2adbb459b0ee0f9f7 Mon Sep 17 00:00:00 2001
From: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Date: Tue, 26 Sep 2023 23:58:01 -0700
Subject: cgroup/cpuset: Cleanup signedness issue in cpu_exclusive_check()

Smatch complains about returning negative error codes from a type
bool function.

kernel/cgroup/cpuset.c:705 cpu_exclusive_check() warn:
	signedness bug returning '(-22)'

The code works correctly, but it is confusing.  The current behavior is
that cpu_exclusive_check() returns true if it's *NOT* exclusive.  Rename
it to cpusets_are_exclusive() and reverse the returns so it returns true
if it is exclusive and false if it's not.  Update both callers as well.

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <error27@gmail.com>
Closes: https://lore.kernel.org/r/202309201706.2LhKdM6o-lkp@intel.com/
Signed-off-by: Harshit Mogalapalli <harshit.m.mogalapalli@oracle.com>
Reviewed-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 93facdab513c..615daaf87f1f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -719,18 +719,18 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
 }
 
 /*
- * cpu_exclusive_check() - check if two cpusets are exclusive
+ * cpusets_are_exclusive() - check if two cpusets are exclusive
  *
- * Return 0 if exclusive, -EINVAL if not
+ * Return true if exclusive, false if not
  */
-static inline bool cpu_exclusive_check(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
 {
 	struct cpumask *xcpus1 = fetch_xcpus(cs1);
 	struct cpumask *xcpus2 = fetch_xcpus(cs2);
 
 	if (cpumask_intersects(xcpus1, xcpus2))
-		return -EINVAL;
-	return 0;
+		return false;
+	return true;
 }
 
 /*
@@ -833,7 +833,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	cpuset_for_each_child(c, css, par) {
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 		    c != cur) {
-			if (cpu_exclusive_check(trial, c))
+			if (!cpusets_are_exclusive(trial, c))
 				goto out;
 		}
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
@@ -1875,7 +1875,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 			cpuset_for_each_child(child, css, parent) {
 				if (child == cs)
 					continue;
-				if (cpu_exclusive_check(cs, child)) {
+				if (!cpusets_are_exclusive(cs, child)) {
 					exclusive = false;
 					break;
 				}
-- 
cgit v1.2.3


From 643445531829d89dc5ddbe0c5ee4ff8f84ce8687 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Wed, 20 Sep 2023 14:07:04 +0800
Subject: workqueue: Fix UAF report by KASAN in pwq_release_workfn()

Currently, for UNBOUND wq, if the apply_wqattrs_prepare() return error,
the apply_wqattr_cleanup() will be called and use the pwq_release_worker
kthread to release resources asynchronously. however, the kfree(wq) is
invoked directly in failure path of alloc_workqueue(), if the kfree(wq)
has been executed and when the pwq_release_workfn() accesses wq, this
leads to the following scenario:

BUG: KASAN: slab-use-after-free in pwq_release_workfn+0x339/0x380 kernel/workqueue.c:4124
Read of size 4 at addr ffff888027b831c0 by task pool_workqueue_/3

CPU: 0 PID: 3 Comm: pool_workqueue_ Not tainted 6.5.0-rc7-next-20230825-syzkaller #0
Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106
 print_address_description mm/kasan/report.c:364 [inline]
 print_report+0xc4/0x620 mm/kasan/report.c:475
 kasan_report+0xda/0x110 mm/kasan/report.c:588
 pwq_release_workfn+0x339/0x380 kernel/workqueue.c:4124
 kthread_worker_fn+0x2fc/0xa80 kernel/kthread.c:823
 kthread+0x33a/0x430 kernel/kthread.c:388
 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304
 </TASK>

Allocated by task 5054:
 kasan_save_stack+0x33/0x50 mm/kasan/common.c:45
 kasan_set_track+0x25/0x30 mm/kasan/common.c:52
 ____kasan_kmalloc mm/kasan/common.c:374 [inline]
 __kasan_kmalloc+0xa2/0xb0 mm/kasan/common.c:383
 kmalloc include/linux/slab.h:599 [inline]
 kzalloc include/linux/slab.h:720 [inline]
 alloc_workqueue+0x16f/0x1490 kernel/workqueue.c:4684
 kvm_mmu_init_tdp_mmu+0x23/0x100 arch/x86/kvm/mmu/tdp_mmu.c:19
 kvm_mmu_init_vm+0x248/0x2e0 arch/x86/kvm/mmu/mmu.c:6180
 kvm_arch_init_vm+0x39/0x720 arch/x86/kvm/x86.c:12311
 kvm_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1222 [inline]
 kvm_dev_ioctl_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:5089 [inline]
 kvm_dev_ioctl+0xa31/0x1c20 arch/x86/kvm/../../../virt/kvm/kvm_main.c:5131
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:871 [inline]
 __se_sys_ioctl fs/ioctl.c:857 [inline]
 __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

Freed by task 5054:
 kasan_save_stack+0x33/0x50 mm/kasan/common.c:45
 kasan_set_track+0x25/0x30 mm/kasan/common.c:52
 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522
 ____kasan_slab_free mm/kasan/common.c:236 [inline]
 ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200
 kasan_slab_free include/linux/kasan.h:164 [inline]
 slab_free_hook mm/slub.c:1800 [inline]
 slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826
 slab_free mm/slub.c:3809 [inline]
 __kmem_cache_free+0xb8/0x2f0 mm/slub.c:3822
 alloc_workqueue+0xe76/0x1490 kernel/workqueue.c:4746
 kvm_mmu_init_tdp_mmu+0x23/0x100 arch/x86/kvm/mmu/tdp_mmu.c:19
 kvm_mmu_init_vm+0x248/0x2e0 arch/x86/kvm/mmu/mmu.c:6180
 kvm_arch_init_vm+0x39/0x720 arch/x86/kvm/x86.c:12311
 kvm_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1222 [inline]
 kvm_dev_ioctl_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:5089 [inline]
 kvm_dev_ioctl+0xa31/0x1c20 arch/x86/kvm/../../../virt/kvm/kvm_main.c:5131
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:871 [inline]
 __se_sys_ioctl fs/ioctl.c:857 [inline]
 __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

This commit therefore flush pwq_release_worker in the alloc_and_link_pwqs()
before invoke kfree(wq).

Reported-by: syzbot+60db9f652c92d5bacba4@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=60db9f652c92d5bacba4
Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b9f053a5a5f0..ebe24a5e1435 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4600,6 +4600,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 	}
 	cpus_read_unlock();
 
+	/* for unbound pwq, flush the pwq_release_worker ensures that the
+	 * pwq_release_workfn() completes before calling kfree(wq).
+	 */
+	if (ret)
+		kthread_flush_worker(pwq_release_worker);
+
 	return ret;
 
 enomem:
-- 
cgit v1.2.3


From 2cb1f6e9a743af58a23cf14563b5eada1e0d3fde Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:00 +0200
Subject: rcu: Conditionally build CPU-hotplug teardown callbacks

Among the three CPU-hotplug teardown RCU callbacks, two of them early
exit if CONFIG_HOTPLUG_CPU=n, and one is left unchanged. In any case
all of them have an implementation when CONFIG_HOTPLUG_CPU=n.

Align instead with the common way to deal with CPU-hotplug teardown
callbacks and provide a proper stub when they are not supported.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 include/linux/rcutree.h |  11 ++++-
 kernel/rcu/tree.c       | 114 +++++++++++++++++++++++-------------------------
 2 files changed, 63 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 153cfc7bbffd..46875c4e9f56 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -110,9 +110,16 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-int rcutree_offline_cpu(unsigned int cpu);
+void rcu_cpu_starting(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
 int rcutree_dying_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+int rcutree_offline_cpu(unsigned int cpu);
+#else
+#define rcutree_dead_cpu NULL
+#define rcutree_dying_cpu NULL
+#define rcutree_offline_cpu NULL
+#endif
 
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2e1e7eadf2cc..f9c6b2680cbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4237,25 +4237,6 @@ static bool rcu_init_invoked(void)
 	return !!rcu_state.n_online_cpus;
 }
 
-/*
- * Near the end of the offline process.  Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
-	bool blkd;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
-	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
-			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
-	return 0;
-}
-
 /*
  * All CPUs for the specified rcu_node structure have gone offline,
  * and all tasks that were preempted within an RCU read-side critical
@@ -4301,23 +4282,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	}
 }
 
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
-	// Stop-machine done, so allow nohz_full to disable tick.
-	tick_dep_clear(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
 /*
  * Propagate ->qsinitmask bits up the rcu_node tree to account for the
  * first CPU in a given leaf rcu_node structure coming online.  The caller
@@ -4470,29 +4434,6 @@ int rcutree_online_cpu(unsigned int cpu)
 	return 0;
 }
 
-/*
- * Near the beginning of the process.  The CPU is still very much alive
- * with pretty much all services enabled.
- */
-int rcutree_offline_cpu(unsigned int cpu)
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-	struct rcu_node *rnp;
-
-	rdp = per_cpu_ptr(&rcu_data, cpu);
-	rnp = rdp->mynode;
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	rnp->ffmask &= ~rdp->grpmask;
-	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
-	rcutree_affinity_setting(cpu, cpu);
-
-	// nohz_full CPUs need the tick for stop-machine to work quickly
-	tick_dep_set(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
 /*
  * Mark the specified CPU as being online so that subsequent grace periods
  * (both expedited and normal) will wait on it.  Note that this means that
@@ -4646,7 +4587,60 @@ void rcutree_migrate_callbacks(int cpu)
 		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
 		  rcu_segcblist_first_cb(&rdp->cblist));
 }
-#endif
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context.  Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+	// Stop-machine done, so allow nohz_full to disable tick.
+	tick_dep_clear(TICK_DEP_BIT_RCU);
+	return 0;
+}
+
+/*
+ * Near the end of the offline process.  Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+	bool blkd;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_node *rnp = rdp->mynode;
+
+	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+	return 0;
+}
+
+/*
+ * Near the beginning of the process.  The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	rdp = per_cpu_ptr(&rcu_data, cpu);
+	rnp = rdp->mynode;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	rnp->ffmask &= ~rdp->grpmask;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	rcutree_affinity_setting(cpu, cpu);
+
+	// nohz_full CPUs need the tick for stop-machine to work quickly
+	tick_dep_set(TICK_DEP_BIT_RCU);
+	return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 /*
  * On non-huge systems, use expedited RCU grace periods to make suspend
-- 
cgit v1.2.3


From 448e9f34d91d1a4799fdb06a93c2c24b34b6fd9d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:01 +0200
Subject: rcu: Standardize explicit CPU-hotplug calls

rcu_report_dead() and rcutree_migrate_callbacks() have their headers in
rcupdate.h while those are pure rcutree calls, like the other CPU-hotplug
functions.

Also rcu_cpu_starting() and rcu_report_dead() have different naming
conventions while they mirror each other's effects.

Fix the headers and propose a naming that relates both functions and
aligns with the prefix of other rcutree CPU-hotplug functions.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 .../Expedited-Grace-Periods/Expedited-Grace-Periods.rst      |  2 +-
 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg  |  4 ++--
 Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg      |  4 ++--
 Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg |  4 ++--
 Documentation/RCU/Design/Requirements/Requirements.rst       |  4 ++--
 arch/arm64/kernel/smp.c                                      |  4 ++--
 arch/powerpc/kernel/smp.c                                    |  2 +-
 arch/s390/kernel/smp.c                                       |  2 +-
 arch/x86/kernel/smpboot.c                                    |  2 +-
 include/linux/interrupt.h                                    |  2 +-
 include/linux/rcupdate.h                                     |  2 --
 include/linux/rcutiny.h                                      |  2 +-
 include/linux/rcutree.h                                      |  7 ++++++-
 kernel/cpu.c                                                 |  6 +++---
 kernel/rcu/tree.c                                            | 12 ++++++++----
 15 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
index 93d899d53258..414f8a2012d6 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -181,7 +181,7 @@ operations is carried out at several levels:
    of this wait (or series of waits, as the case may be) is to permit a
    concurrent CPU-hotplug operation to complete.
 #. In the case of RCU-sched, one of the last acts of an outgoing CPU is
-   to invoke ``rcu_report_dead()``, which reports a quiescent state for
+   to invoke ``rcutree_report_cpu_dead()``, which reports a quiescent state for
    that CPU. However, this is likely paranoia-induced redundancy.
 
 +-----------------------------------------------------------------------+
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
index 7ddc094d7f28..d82a77d03d8c 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg
@@ -1135,7 +1135,7 @@
        font-weight="bold"
        font-size="192"
        id="text202-7-5-3-27-6-5"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
     <text
        xml:space="preserve"
        x="3745.7725"
@@ -1256,7 +1256,7 @@
        font-style="normal"
        y="3679.27"
        x="-3804.9949"
-       xml:space="preserve">rcu_cpu_starting()</text>
+       xml:space="preserve">rcutree_report_cpu_starting()</text>
     <g
        style="fill:none;stroke-width:0.025in"
        id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
index 069f6f8371c2..6e690a3be161 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg
@@ -3274,7 +3274,7 @@
          font-weight="bold"
          font-size="192"
          id="text202-7-5-3-27-6-5"
-         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+         style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
       <text
          xml:space="preserve"
          x="3745.7725"
@@ -3395,7 +3395,7 @@
          font-style="normal"
          y="3679.27"
          x="-3804.9949"
-         xml:space="preserve">rcu_cpu_starting()</text>
+         xml:space="preserve">rcutree_report_cpu_starting()</text>
       <g
          style="fill:none;stroke-width:0.025in"
          id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
index 2c9310ba29ba..4fa7506082bf 100644
--- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
+++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg
@@ -607,7 +607,7 @@
        font-weight="bold"
        font-size="192"
        id="text202-7-5-3-27-6"
-       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead()</text>
+       style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead()</text>
     <text
        xml:space="preserve"
        x="3745.7725"
@@ -728,7 +728,7 @@
        font-style="normal"
        y="3679.27"
        x="-3804.9949"
-       xml:space="preserve">rcu_cpu_starting()</text>
+       xml:space="preserve">rcutree_report_cpu_starting()</text>
     <g
        style="fill:none;stroke-width:0.025in"
        id="g3107-7-5-0"
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index f3b605285a87..cccafdaa1f84 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1955,12 +1955,12 @@ if offline CPUs block an RCU grace period for too long.
 
 An offline CPU's quiescent state will be reported either:
 
-1.  As the CPU goes offline using RCU's hotplug notifier (rcu_report_dead()).
+1.  As the CPU goes offline using RCU's hotplug notifier (rcutree_report_cpu_dead()).
 2.  When grace period initialization (rcu_gp_init()) detects a
     race either with CPU offlining or with a task unblocking on a leaf
     ``rcu_node`` structure whose CPUs are all offline.
 
-The CPU-online path (rcu_cpu_starting()) should never need to report
+The CPU-online path (rcutree_report_cpu_starting()) should never need to report
 a quiescent state for an offline CPU.  However, as a debugging measure,
 it does emit a warning if a quiescent state was not already reported
 for that CPU.
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 8fa646c90c67..196533c362e1 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -215,7 +215,7 @@ asmlinkage notrace void secondary_start_kernel(void)
 	if (system_uses_irq_prio_masking())
 		init_gic_priority_masking();
 
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	trace_hardirqs_off();
 
 	/*
@@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void)
 
 	/* Mark this CPU absent */
 	set_cpu_present(cpu, 0);
-	rcu_report_dead();
+	rcutree_report_cpu_dead();
 
 	if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
 		update_cpu_boot_status(CPU_KILL_ME);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5826f5108a12..a30d4d93ff0b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1629,7 +1629,7 @@ void start_secondary(void *unused)
 
 	smp_store_cpu_info(cpu);
 	set_dec(tb_ticks_per_jiffy);
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	cpu_callin_map[cpu] = 1;
 
 	if (smp_ops->setup_cpu)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index a4edb7ea66ea..214a1b67f80a 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -898,7 +898,7 @@ static void smp_start_secondary(void *cpuvoid)
 	S390_lowcore.restart_flags = 0;
 	restore_access_regs(S390_lowcore.access_regs_save_area);
 	cpu_init();
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	init_cpu_timer();
 	vtime_init();
 	vdso_getcpu_init();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4e45ff44aa07..4ccb76f89af8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -288,7 +288,7 @@ static void notrace start_secondary(void *unused)
 
 	cpu_init();
 	fpu__init_cpu();
-	rcu_cpu_starting(raw_smp_processor_id());
+	rcutree_report_cpu_starting(raw_smp_processor_id());
 	x86_cpuinit.early_percpu_clock_init();
 
 	ap_starting();
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b..d05e1e9a553c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,7 +566,7 @@ enum
  *
  * _ RCU:
  * 	1) rcutree_migrate_callbacks() migrates the queue.
- * 	2) rcu_report_dead() reports the final quiescent states.
+ * 	2) rcutree_report_cpu_dead() reports the final quiescent states.
  *
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
  */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index aa351ddcbe8d..f7206b2623c9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -122,8 +122,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
 void rcu_init(void);
 extern int rcu_scheduler_active;
 void rcu_sched_clock_irq(int user);
-void rcu_report_dead(void);
-void rcutree_migrate_callbacks(int cpu);
 
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void rcu_init_tasks_generic(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 7b949292908a..d9ac7b136aea 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -171,6 +171,6 @@ static inline void rcu_all_qs(void) { barrier(); }
 #define rcutree_offline_cpu      NULL
 #define rcutree_dead_cpu         NULL
 #define rcutree_dying_cpu        NULL
-static inline void rcu_cpu_starting(unsigned int cpu) { }
+static inline void rcutree_report_cpu_starting(unsigned int cpu) { }
 
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 46875c4e9f56..254244202ea9 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -110,7 +110,7 @@ void rcu_all_qs(void);
 /* RCUtree hotplug events */
 int rcutree_prepare_cpu(unsigned int cpu);
 int rcutree_online_cpu(unsigned int cpu);
-void rcu_cpu_starting(unsigned int cpu);
+void rcutree_report_cpu_starting(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
 int rcutree_dead_cpu(unsigned int cpu);
@@ -122,4 +122,9 @@ int rcutree_offline_cpu(unsigned int cpu);
 #define rcutree_offline_cpu NULL
 #endif
 
+void rcutree_migrate_callbacks(int cpu);
+
+/* Called from hotplug and also arm64 early secondary boot failure */
+void rcutree_report_cpu_dead(void);
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 076e75fed8bb..2491766e1fd5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1388,10 +1388,10 @@ void cpuhp_report_idle_dead(void)
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 
 	BUG_ON(st->state != CPUHP_AP_OFFLINE);
-	rcu_report_dead();
+	rcutree_report_cpu_dead();
 	st->state = CPUHP_AP_IDLE_DEAD;
 	/*
-	 * We cannot call complete after rcu_report_dead() so we delegate it
+	 * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
 	 * to an online cpu.
 	 */
 	smp_call_function_single(cpumask_first(cpu_online_mask),
@@ -1617,7 +1617,7 @@ void notify_cpu_starting(unsigned int cpu)
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
 
-	rcu_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
+	rcutree_report_cpu_starting(cpu);	/* Enables RCU usage on this CPU. */
 	cpumask_set_cpu(cpu, &cpus_booted_once_mask);
 
 	/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c6b2680cbb..36d8818eaec1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4216,7 +4216,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	rdp = this_cpu_ptr(&rcu_data);
 	/*
 	 * Strictly, we care here about the case where the current CPU is
-	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
 	 * not being up to date. So arch_spin_is_locked() might have a
 	 * false positive if it's held by some *other* CPU, but that's
 	 * OK because that just means a false *negative* on the warning.
@@ -4445,8 +4445,10 @@ int rcutree_online_cpu(unsigned int cpu)
  * from the incoming CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
  * This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
  */
-void rcu_cpu_starting(unsigned int cpu)
+void rcutree_report_cpu_starting(unsigned int cpu)
 {
 	unsigned long mask;
 	struct rcu_data *rdp;
@@ -4500,8 +4502,10 @@ void rcu_cpu_starting(unsigned int cpu)
  * Note that this function is special in that it is invoked directly
  * from the outgoing CPU rather than from the cpuhp_step mechanism.
  * This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
  */
-void rcu_report_dead(void)
+void rcutree_report_cpu_dead(void)
 {
 	unsigned long flags;
 	unsigned long mask;
@@ -5072,7 +5076,7 @@ void __init rcu_init(void)
 	pm_notifier(rcu_pm_notify, 0);
 	WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
 	rcutree_prepare_cpu(cpu);
-	rcu_cpu_starting(cpu);
+	rcutree_report_cpu_starting(cpu);
 	rcutree_online_cpu(cpu);
 
 	/* Create workqueue for Tree SRCU and for expedited GPs. */
-- 
cgit v1.2.3


From a28ab03b499601c14b8f502d875c2bee23209659 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 8 Sep 2023 22:36:03 +0200
Subject: rcu: Comment why callbacks migration can't wait for
 CPUHP_RCUTREE_PREP

The callbacks migration is performed through an explicit call from
the hotplug control CPU right after the death of the target CPU and
before proceeding with the CPUHP_ teardown functions.

This is unusual but necessary and yet uncommented. Summarize the reason
as explained in the changelog of:

	a58163d8ca2c (rcu: Migrate callbacks earlier in the CPU-offline timeline)

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/cpu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2491766e1fd5..3b9d5c7eb4a2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1372,7 +1372,14 @@ static int takedown_cpu(unsigned int cpu)
 	cpuhp_bp_sync_dead(cpu);
 
 	tick_cleanup_dead_cpu(cpu);
+
+	/*
+	 * Callbacks must be re-integrated right away to the RCU state machine.
+	 * Otherwise an RCU callback could block a further teardown function
+	 * waiting for its completion.
+	 */
 	rcutree_migrate_callbacks(cpu);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5790b1fb3d672d9a1fe3881a7181dfdbe741568f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 4 Oct 2023 16:50:07 -0400
Subject: eventfs: Remove eventfs_file and just use eventfs_inode

Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.

struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
						const struct eventfs_entry *entries,
						int size, void *data);

is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:

struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
					 const struct eventfs_entry *entries,
					 int size, void *data);

where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.

The entries are defined by:

typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
				const struct file_operations **fops);

struct eventfs_entry {
	const char			*name;
	eventfs_callback		callback;
};

Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.

If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.

This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.

The "show_events_dentry" file has been updated to show the directories,
and any files they have.

With just the eventfs_file allocations:

 Before after deltas for meminfo (in kB):

   MemFree:		-14360
   MemAvailable:	-14260
   Buffers:		40
   Cached:		24
   Active:		44
   Inactive:		48
   Inactive(anon):	28
   Active(file):	44
   Inactive(file):	20
   Dirty:		-4
   AnonPages:		28
   Mapped:		4
   KReclaimable:	132
   Slab:		1604
   SReclaimable:	132
   SUnreclaim:		1472
   Committed_AS:	12

 Before after deltas for slabinfo:

   <slab>:		<objects>	[ * <size> = <total>]

   ext4_inode_cache	27		[* 1184 = 31968 ]
   extent_status	102		[*   40 = 4080 ]
   tracefs_inode_cache	144		[*  656 = 94464 ]
   buffer_head		39		[*  104 = 4056 ]
   shmem_inode_cache	49		[*  800 = 39200 ]
   filp			-53		[*  256 = -13568 ]
   dentry		251		[*  192 = 48192 ]
   lsm_file_cache	277		[*   32 = 8864 ]
   vm_area_struct	-14		[*  184 = -2576 ]
   trace_event_file	1748		[*   88 = 153824 ]
   kmalloc-1k		35		[* 1024 = 35840 ]
   kmalloc-256		49		[*  256 = 12544 ]
   kmalloc-192		-28		[*  192 = -5376 ]
   kmalloc-128		-30		[*  128 = -3840 ]
   kmalloc-96		10581		[*   96 = 1015776 ]
   kmalloc-64		3056		[*   64 = 195584 ]
   kmalloc-32		1291		[*   32 = 41312 ]
   kmalloc-16		2310		[*   16 = 36960 ]
   kmalloc-8		9216		[*    8 = 73728 ]

 Free memory dropped by 14,360 kB
 Available memory dropped by 14,260 kB
 Total slab additions in size: 1,771,032 bytes

With this change:

 Before after deltas for meminfo (in kB):

   MemFree:		-12084
   MemAvailable:	-11976
   Buffers:		32
   Cached:		32
   Active:		72
   Inactive:		168
   Inactive(anon):	176
   Active(file):	72
   Inactive(file):	-8
   Dirty:		24
   AnonPages:		196
   Mapped:		8
   KReclaimable:	148
   Slab:		836
   SReclaimable:	148
   SUnreclaim:		688
   Committed_AS:	324

 Before after deltas for slabinfo:

   <slab>:		<objects>	[ * <size> = <total>]

   tracefs_inode_cache	144		[* 656 = 94464 ]
   shmem_inode_cache	-23		[* 800 = -18400 ]
   filp			-92		[* 256 = -23552 ]
   dentry		179		[* 192 = 34368 ]
   lsm_file_cache	-3		[* 32 = -96 ]
   vm_area_struct	-13		[* 184 = -2392 ]
   trace_event_file	1748		[* 88 = 153824 ]
   kmalloc-1k		-49		[* 1024 = -50176 ]
   kmalloc-256		-27		[* 256 = -6912 ]
   kmalloc-128		1864		[* 128 = 238592 ]
   kmalloc-64		4685		[* 64 = 299840 ]
   kmalloc-32		-72		[* 32 = -2304 ]
   kmalloc-16		256		[* 16 = 4096 ]
   total = 721352

 Free memory dropped by 12,084 kB
 Available memory dropped by 11,976 kB
 Total slab additions in size:  721,352 bytes

That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.

Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c     | 847 ++++++++++++++++++++++---------------------
 fs/tracefs/inode.c           |   2 +-
 fs/tracefs/internal.h        |  37 +-
 include/linux/trace_events.h |   2 +-
 include/linux/tracefs.h      |  29 +-
 kernel/trace/trace.c         |   7 +-
 kernel/trace/trace.h         |   4 +-
 kernel/trace/trace_events.c  | 313 +++++++++++-----
 8 files changed, 705 insertions(+), 536 deletions(-)

(limited to 'kernel')

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 8c8d64e76103..eab18b157ef5 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -2,8 +2,9 @@
 /*
  *  event_inode.c - part of tracefs, a pseudo file system for activating tracing
  *
- *  Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ *  Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org>
  *  Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ *  Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org>
  *
  *  eventfs is used to dynamically create inodes and dentries based on the
  *  meta data provided by the tracing system.
@@ -23,46 +24,6 @@
 #include <linux/delay.h>
 #include "internal.h"
 
-struct eventfs_inode {
-	struct list_head	e_top_files;
-};
-
-/*
- * struct eventfs_file - hold the properties of the eventfs files and
- *                       directories.
- * @name:	the name of the file or directory to create
- * @d_parent:   holds parent's dentry
- * @dentry:     once accessed holds dentry
- * @list:	file or directory to be added to parent directory
- * @ei:		list of files and directories within directory
- * @fop:	file_operations for file or directory
- * @iop:	inode_operations for file or directory
- * @data:	something that the caller will want to get to later on
- * @mode:	the permission that the file or directory should have
- */
-struct eventfs_file {
-	const char			*name;
-	struct dentry			*d_parent;
-	struct dentry			*dentry;
-	struct list_head		list;
-	struct eventfs_inode		*ei;
-	const struct file_operations	*fop;
-	const struct inode_operations	*iop;
-	/*
-	 * Union - used for deletion
-	 * @del_list:	list of eventfs_file to delete
-	 * @rcu:	eventfs_file to delete in RCU
-	 * @is_freed:	node is freed if one of the above is set
-	 */
-	union {
-		struct list_head	del_list;
-		struct rcu_head		rcu;
-		unsigned long		is_freed;
-	};
-	void				*data;
-	umode_t				mode;
-};
-
 static DEFINE_MUTEX(eventfs_mutex);
 DEFINE_STATIC_SRCU(eventfs_srcu);
 
@@ -93,16 +54,9 @@ static const struct file_operations eventfs_file_operations = {
  * @data: something that the caller will want to get to later on.
  * @fop: struct file_operations that should be used for this file.
  *
- * This is the basic "create a file" function for tracefs.  It allows for a
- * wide range of flexibility in creating a file.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
- *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function creates a dentry that represents a file in the eventsfs_inode
+ * directory. The inode.i_private pointer will point to @data in the open()
+ * call.
  */
 static struct dentry *create_file(const char *name, umode_t mode,
 				  struct dentry *parent, void *data,
@@ -118,6 +72,7 @@ static struct dentry *create_file(const char *name, umode_t mode,
 	if (WARN_ON_ONCE(!S_ISREG(mode)))
 		return NULL;
 
+	WARN_ON_ONCE(!parent);
 	dentry = eventfs_start_creating(name, parent);
 
 	if (IS_ERR(dentry))
@@ -142,20 +97,11 @@ static struct dentry *create_file(const char *name, umode_t mode,
  * create_dir - create a dir in the tracefs filesystem
  * @name: the name of the file to create.
  * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- *
- * This is the basic "create a dir" function for eventfs.  It allows for a
- * wide range of flexibility in creating a dir.
- *
- * This function will return a pointer to a dentry if it succeeds.  This
- * pointer must be passed to the tracefs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.)  If an error occurs, %NULL will be returned.
  *
- * If tracefs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
+ * This function will create a dentry for a directory represented by
+ * a eventfs_inode.
  */
-static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
+static struct dentry *create_dir(const char *name, struct dentry *parent)
 {
 	struct tracefs_inode *ti;
 	struct dentry *dentry;
@@ -172,7 +118,6 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
 	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
 	inode->i_op = &eventfs_root_dir_inode_operations;
 	inode->i_fop = &eventfs_file_operations;
-	inode->i_private = data;
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE;
@@ -185,18 +130,18 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void *
 }
 
 /**
- * eventfs_set_ef_status_free - set the ef->status to free
+ * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
  * @ti: the tracefs_inode of the dentry
- * @dentry: dentry who's status to be freed
+ * @dentry: dentry which has the reference to remove.
  *
- * eventfs_set_ef_status_free will be called if no more
- * references remain
+ * Remove the association between a dentry from an eventfs_inode.
  */
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 {
 	struct tracefs_inode *ti_parent;
+	struct eventfs_inode *ei_child, *tmp;
 	struct eventfs_inode *ei;
-	struct eventfs_file *ef, *tmp;
+	int i;
 
 	/* The top level events directory may be freed by this */
 	if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
@@ -207,9 +152,9 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 		ei = ti->private;
 
 		/* Record all the top level files */
-		list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+		list_for_each_entry_srcu(ei_child, &ei->children, list,
 					 lockdep_is_held(&eventfs_mutex)) {
-			list_add_tail(&ef->del_list, &ef_del_list);
+			list_add_tail(&ei_child->del_list, &ef_del_list);
 		}
 
 		/* Nothing should access this, but just in case! */
@@ -218,11 +163,13 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 		mutex_unlock(&eventfs_mutex);
 
 		/* Now safely free the top level files and their children */
-		list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
-			list_del(&ef->del_list);
-			eventfs_remove(ef);
+		list_for_each_entry_safe(ei_child, tmp, &ef_del_list, del_list) {
+			list_del(&ei_child->del_list);
+			eventfs_remove_dir(ei_child);
 		}
 
+		kfree_const(ei->name);
+		kfree(ei->d_children);
 		kfree(ei);
 		return;
 	}
@@ -233,68 +180,162 @@ void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
 	if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
 		goto out;
 
-	ef = dentry->d_fsdata;
-	if (!ef)
+	ei = dentry->d_fsdata;
+	if (!ei)
 		goto out;
 
 	/*
-	 * If ef was freed, then the LSB bit is set for d_fsdata.
+	 * If ei was freed, then the LSB bit is set for d_fsdata.
 	 * But this should not happen, as it should still have a
 	 * ref count that prevents it. Warn in case it does.
 	 */
-	if (WARN_ON_ONCE((unsigned long)ef & 1))
+	if (WARN_ON_ONCE((unsigned long)ei & 1))
 		goto out;
 
+	/* This could belong to one of the files of the ei */
+	if (ei->dentry != dentry) {
+		for (i = 0; i < ei->nr_entries; i++) {
+			if (ei->d_children[i] == dentry)
+				break;
+		}
+		if (WARN_ON_ONCE(i == ei->nr_entries))
+			goto out;
+		ei->d_children[i] = NULL;
+	} else {
+		ei->dentry = NULL;
+	}
+
 	dentry->d_fsdata = NULL;
-	ef->dentry = NULL;
-out:
+ out:
 	mutex_unlock(&eventfs_mutex);
 }
 
+/**
+ * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * @ei: the eventfs_inode that the file will be created under
+ * @e_dentry: a pointer to the d_children[] of the @ei
+ * @parent: The parent dentry of the created file.
+ * @name: The name of the file to create
+ * @mode: The mode of the file.
+ * @data: The data to use to set the inode of the file with on open()
+ * @fops: The fops of the file to be created.
+ * @lookup: If called by the lookup routine, in which case, dput() the created dentry.
+ *
+ * Create a dentry for a file of an eventfs_inode @ei and place it into the
+ * address located at @e_dentry. If the @e_dentry already has a dentry, then
+ * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ */
+static struct dentry *
+create_file_dentry(struct eventfs_inode *ei, struct dentry **e_dentry,
+		   struct dentry *parent, const char *name, umode_t mode, void *data,
+		   const struct file_operations *fops, bool lookup)
+{
+	struct dentry *dentry;
+	bool invalidate = false;
+
+	mutex_lock(&eventfs_mutex);
+	/* If the e_dentry already has a dentry, use it */
+	if (*e_dentry) {
+		/* lookup does not need to up the ref count */
+		if (!lookup)
+			dget(*e_dentry);
+		mutex_unlock(&eventfs_mutex);
+		return *e_dentry;
+	}
+	mutex_unlock(&eventfs_mutex);
+
+	/* The lookup already has the parent->d_inode locked */
+	if (!lookup)
+		inode_lock(parent->d_inode);
+
+	dentry = create_file(name, mode, parent, data, fops);
+
+	if (!lookup)
+		inode_unlock(parent->d_inode);
+
+	mutex_lock(&eventfs_mutex);
+
+	if (IS_ERR_OR_NULL(dentry)) {
+		/*
+		 * When the mutex was released, something else could have
+		 * created the dentry for this e_dentry. In which case
+		 * use that one.
+		 *
+		 * Note, with the mutex held, the e_dentry cannot have content
+		 * and the ei->is_freed be true at the same time.
+		 */
+		WARN_ON_ONCE(ei->is_freed);
+		dentry = *e_dentry;
+		/* The lookup does not need to up the dentry refcount */
+		if (dentry && !lookup)
+			dget(dentry);
+		mutex_unlock(&eventfs_mutex);
+		return dentry;
+	}
+
+	if (!*e_dentry && !ei->is_freed) {
+		*e_dentry = dentry;
+		dentry->d_fsdata = ei;
+	} else {
+		/*
+		 * Should never happen unless we get here due to being freed.
+		 * Otherwise it means two dentries exist with the same name.
+		 */
+		WARN_ON_ONCE(!ei->is_freed);
+		invalidate = true;
+	}
+	mutex_unlock(&eventfs_mutex);
+
+	if (invalidate)
+		d_invalidate(dentry);
+
+	if (lookup || invalidate)
+		dput(dentry);
+
+	return invalidate ? NULL : dentry;
+}
+
 /**
  * eventfs_post_create_dir - post create dir routine
- * @ef: eventfs_file of recently created dir
+ * @ei: eventfs_inode of recently created dir
  *
  * Map the meta-data of files within an eventfs dir to their parent dentry
  */
-static void eventfs_post_create_dir(struct eventfs_file *ef)
+static void eventfs_post_create_dir(struct eventfs_inode *ei)
 {
-	struct eventfs_file *ef_child;
+	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 
 	/* srcu lock already held */
 	/* fill parent-child relation */
-	list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
 				 srcu_read_lock_held(&eventfs_srcu)) {
-		ef_child->d_parent = ef->dentry;
+		ei_child->d_parent = ei->dentry;
 	}
 
-	ti = get_tracefs(ef->dentry->d_inode);
-	ti->private = ef->ei;
+	ti = get_tracefs(ei->dentry->d_inode);
+	ti->private = ei;
 }
 
 /**
- * create_dentry - helper function to create dentry
- * @ef: eventfs_file of file or directory to create
- * @parent: parent dentry
- * @lookup: true if called from lookup routine
+ * create_dir_dentry - Create a directory dentry for the eventfs_inode
+ * @ei: The eventfs_inode to create the directory for
+ * @parent: The dentry of the parent of this directory
+ * @lookup: True if this is called by the lookup code
  *
- * Used to create a dentry for file/dir, executes post dentry creation routine
+ * This creates and attaches a directory dentry to the eventfs_inode @ei.
  */
 static struct dentry *
-create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
+create_dir_dentry(struct eventfs_inode *ei, struct dentry *parent, bool lookup)
 {
 	bool invalidate = false;
-	struct dentry *dentry;
+	struct dentry *dentry = NULL;
 
 	mutex_lock(&eventfs_mutex);
-	if (ef->is_freed) {
-		mutex_unlock(&eventfs_mutex);
-		return NULL;
-	}
-	if (ef->dentry) {
-		dentry = ef->dentry;
-		/* On dir open, up the ref count */
+	if (ei->dentry) {
+		/* If the dentry already has a dentry, use it */
+		dentry = ei->dentry;
+		/* lookup does not need to up the ref count */
 		if (!lookup)
 			dget(dentry);
 		mutex_unlock(&eventfs_mutex);
@@ -302,42 +343,44 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
 	}
 	mutex_unlock(&eventfs_mutex);
 
+	/* The lookup already has the parent->d_inode locked */
 	if (!lookup)
 		inode_lock(parent->d_inode);
 
-	if (ef->ei)
-		dentry = create_dir(ef->name, parent, ef->data);
-	else
-		dentry = create_file(ef->name, ef->mode, parent,
-				     ef->data, ef->fop);
+	dentry = create_dir(ei->name, parent);
 
 	if (!lookup)
 		inode_unlock(parent->d_inode);
 
 	mutex_lock(&eventfs_mutex);
-	if (IS_ERR_OR_NULL(dentry)) {
-		/* If the ef was already updated get it */
-		dentry = ef->dentry;
+
+	if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
+		/*
+		 * When the mutex was released, something else could have
+		 * created the dentry for this e_dentry. In which case
+		 * use that one.
+		 *
+		 * Note, with the mutex held, the e_dentry cannot have content
+		 * and the ei->is_freed be true at the same time.
+		 */
+		dentry = ei->dentry;
 		if (dentry && !lookup)
 			dget(dentry);
 		mutex_unlock(&eventfs_mutex);
 		return dentry;
 	}
 
-	if (!ef->dentry && !ef->is_freed) {
-		ef->dentry = dentry;
-		if (ef->ei)
-			eventfs_post_create_dir(ef);
-		dentry->d_fsdata = ef;
+	if (!ei->dentry && !ei->is_freed) {
+		ei->dentry = dentry;
+		eventfs_post_create_dir(ei);
+		dentry->d_fsdata = ei;
 	} else {
-		/* A race here, should try again (unless freed) */
-		invalidate = true;
-
 		/*
 		 * Should never happen unless we get here due to being freed.
 		 * Otherwise it means two dentries exist with the same name.
 		 */
-		WARN_ON_ONCE(!ef->is_freed);
+		WARN_ON_ONCE(!ei->is_freed);
+		invalidate = true;
 	}
 	mutex_unlock(&eventfs_mutex);
 	if (invalidate)
@@ -349,50 +392,85 @@ create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
 	return invalidate ? NULL : dentry;
 }
 
-static bool match_event_file(struct eventfs_file *ef, const char *name)
-{
-	bool ret;
-
-	mutex_lock(&eventfs_mutex);
-	ret = !ef->is_freed && strcmp(ef->name, name) == 0;
-	mutex_unlock(&eventfs_mutex);
-
-	return ret;
-}
-
 /**
  * eventfs_root_lookup - lookup routine to create file/dir
  * @dir: in which a lookup is being done
  * @dentry: file/dir dentry
- * @flags: to pass as flags parameter to simple lookup
+ * @flags: Just passed to simple_lookup()
  *
- * Used to create a dynamic file/dir within @dir. Use the eventfs_inode
- * list of meta data to find the information needed to create the file/dir.
+ * Used to create dynamic file/dir with-in @dir, search with-in @ei
+ * list, if @dentry found go ahead and create the file/dir
  */
+
 static struct dentry *eventfs_root_lookup(struct inode *dir,
 					  struct dentry *dentry,
 					  unsigned int flags)
 {
+	const struct file_operations *fops;
+	const struct eventfs_entry *entry;
+	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct eventfs_file *ef;
+	struct dentry *ei_dentry = NULL;
 	struct dentry *ret = NULL;
+	const char *name = dentry->d_name.name;
+	bool created = false;
+	umode_t mode;
+	void *data;
 	int idx;
+	int i;
+	int r;
 
 	ti = get_tracefs(dir);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
 		return NULL;
 
-	ei = ti->private;
+	/* Grab srcu to prevent the ei from going away */
 	idx = srcu_read_lock(&eventfs_srcu);
-	list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+
+	/*
+	 * Grab the eventfs_mutex to consistent value from ti->private.
+	 * This s
+	 */
+	mutex_lock(&eventfs_mutex);
+	ei = READ_ONCE(ti->private);
+	if (ei)
+		ei_dentry = READ_ONCE(ei->dentry);
+	mutex_unlock(&eventfs_mutex);
+
+	if (!ei || !ei_dentry)
+		goto out;
+
+	data = ei->data;
+
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
 				 srcu_read_lock_held(&eventfs_srcu)) {
-		if (!match_event_file(ef, dentry->d_name.name))
+		if (strcmp(ei_child->name, name) != 0)
 			continue;
 		ret = simple_lookup(dir, dentry, flags);
-		create_dentry(ef, ef->d_parent, true);
+		create_dir_dentry(ei_child, ei_dentry, true);
+		created = true;
 		break;
 	}
+
+	if (created)
+		goto out;
+
+	for (i = 0; i < ei->nr_entries; i++) {
+		entry = &ei->entries[i];
+		if (strcmp(name, entry->name) == 0) {
+			void *cdata = data;
+			r = entry->callback(name, &mode, &cdata, &fops);
+			if (r <= 0)
+				continue;
+			ret = simple_lookup(dir, dentry, flags);
+			create_file_dentry(ei, &ei->d_children[i],
+					   ei_dentry, name, mode, cdata,
+					   fops, true);
+			break;
+		}
+	}
+ out:
 	srcu_read_unlock(&eventfs_srcu, idx);
 	return ret;
 }
@@ -432,29 +510,48 @@ static int eventfs_release(struct inode *inode, struct file *file)
 	return dcache_dir_close(inode, file);
 }
 
+static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
+{
+	struct dentry **tmp;
+
+	tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+	if (!tmp)
+		return -1;
+	tmp[cnt] = d;
+	tmp[cnt + 1] = NULL;
+	*dentries = tmp;
+	return 0;
+}
+
 /**
  * dcache_dir_open_wrapper - eventfs open wrapper
  * @inode: not used
- * @file: dir to be opened (to create its child)
+ * @file: dir to be opened (to create it's children)
  *
- * Used to dynamically create the file/dir within @file. @file is really a
- * directory and all the files/dirs of the children within @file will be
- * created. If any of the files/dirs have already been created, their
- * reference count will be incremented.
+ * Used to dynamic create file/dir with-in @file, all the
+ * file/dir will be created. If already created then references
+ * will be increased
  */
 static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
 {
+	const struct file_operations *fops;
+	const struct eventfs_entry *entry;
+	struct eventfs_inode *ei_child;
 	struct tracefs_inode *ti;
 	struct eventfs_inode *ei;
-	struct eventfs_file *ef;
 	struct dentry_list *dlist;
 	struct dentry **dentries = NULL;
-	struct dentry *dentry = file_dentry(file);
+	struct dentry *parent = file_dentry(file);
 	struct dentry *d;
 	struct inode *f_inode = file_inode(file);
+	const char *name = parent->d_name.name;
+	umode_t mode;
+	void *data;
 	int cnt = 0;
 	int idx;
 	int ret;
+	int i;
+	int r;
 
 	ti = get_tracefs(f_inode);
 	if (!(ti->flags & TRACEFS_EVENT_INODE))
@@ -463,25 +560,51 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
 	if (WARN_ON_ONCE(file->private_data))
 		return -EINVAL;
 
+	idx = srcu_read_lock(&eventfs_srcu);
+
+	mutex_lock(&eventfs_mutex);
+	ei = READ_ONCE(ti->private);
+	mutex_unlock(&eventfs_mutex);
+
+	if (!ei) {
+		srcu_read_unlock(&eventfs_srcu, idx);
+		return -EINVAL;
+	}
+
+
+	data = ei->data;
+
 	dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
-	if (!dlist)
+	if (!dlist) {
+		srcu_read_unlock(&eventfs_srcu, idx);
 		return -ENOMEM;
+	}
 
-	ei = ti->private;
-	idx = srcu_read_lock(&eventfs_srcu);
-	list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
 				 srcu_read_lock_held(&eventfs_srcu)) {
-		d = create_dentry(ef, dentry, false);
+		d = create_dir_dentry(ei_child, parent, false);
 		if (d) {
-			struct dentry **tmp;
+			ret = add_dentries(&dentries, d, cnt);
+			if (ret < 0)
+				break;
+			cnt++;
+		}
+	}
 
-			tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
-			if (!tmp)
+	for (i = 0; i < ei->nr_entries; i++) {
+		void *cdata = data;
+		entry = &ei->entries[i];
+		name = entry->name;
+		r = entry->callback(name, &mode, &cdata, &fops);
+		if (r <= 0)
+			continue;
+		d = create_file_dentry(ei, &ei->d_children[i],
+				       parent, name, mode, cdata, fops, false);
+		if (d) {
+			ret = add_dentries(&dentries, d, cnt);
+			if (ret < 0)
 				break;
-			tmp[cnt] = d;
-			tmp[cnt + 1] = NULL;
 			cnt++;
-			dentries = tmp;
 		}
 	}
 	srcu_read_unlock(&eventfs_srcu, idx);
@@ -514,63 +637,90 @@ static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
 }
 
 /**
- * eventfs_prepare_ef - helper function to prepare eventfs_file
- * @name: the name of the file/directory to create.
- * @mode: the permission that the file should have.
- * @fop: struct file_operations that should be used for this file/directory.
- * @iop: struct inode_operations that should be used for this file/directory.
- * @data: something that the caller will want to get to later on. The
- *        inode.i_private pointer will point to this value on the open() call.
+ * eventfs_create_dir - Create the eventfs_inode for this directory
+ * @name: The name of the directory to create.
+ * @parent: The eventfs_inode of the parent directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
+ *
+ * This function creates the descriptor to represent a directory in the
+ * eventfs. This descriptor is an eventfs_inode, and it is returned to be
+ * used to create other children underneath.
+ *
+ * The @entries is an array of eventfs_entry structures which has:
+ *	const char		 *name
+ *	eventfs_callback	callback;
+ *
+ * The name is the name of the file, and the callback is a pointer to a function
+ * that will be called when the file is reference (either by lookup or by
+ * reading a directory). The callback is of the prototype:
  *
- * This function allocates and fills the eventfs_file structure.
+ *    int callback(const char *name, umode_t *mode, void **data,
+ *		   const struct file_operations **fops);
+ *
+ * When a file needs to be created, this callback will be called with
+ *   name = the name of the file being created (so that the same callback
+ *          may be used for multiple files).
+ *   mode = a place to set the file's mode
+ *   data = A pointer to @data, and the callback may replace it, which will
+ *         cause the file created to pass the new data to the open() call.
+ *   fops = the fops to use for the created file.
  */
-static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
-					const struct file_operations *fop,
-					const struct inode_operations *iop,
-					void *data)
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+					 const struct eventfs_entry *entries,
+					 int size, void *data)
 {
-	struct eventfs_file *ef;
+	struct eventfs_inode *ei;
 
-	ef = kzalloc(sizeof(*ef), GFP_KERNEL);
-	if (!ef)
+	if (!parent)
+		return ERR_PTR(-EINVAL);
+
+	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	if (!ei)
 		return ERR_PTR(-ENOMEM);
 
-	ef->name = kstrdup(name, GFP_KERNEL);
-	if (!ef->name) {
-		kfree(ef);
+	ei->name = kstrdup_const(name, GFP_KERNEL);
+	if (!ei->name) {
+		kfree(ei);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (S_ISDIR(mode)) {
-		ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
-		if (!ef->ei) {
-			kfree(ef->name);
-			kfree(ef);
+	if (size) {
+		ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+		if (!ei->d_children) {
+			kfree_const(ei->name);
+			kfree(ei);
 			return ERR_PTR(-ENOMEM);
 		}
-		INIT_LIST_HEAD(&ef->ei->e_top_files);
-	} else {
-		ef->ei = NULL;
 	}
 
-	ef->iop = iop;
-	ef->fop = fop;
-	ef->mode = mode;
-	ef->data = data;
-	return ef;
+	ei->entries = entries;
+	ei->nr_entries = size;
+	ei->data = data;
+	INIT_LIST_HEAD(&ei->children);
+
+	mutex_lock(&eventfs_mutex);
+	list_add_tail(&ei->list, &parent->children);
+	ei->d_parent = parent->dentry;
+	mutex_unlock(&eventfs_mutex);
+
+	return ei;
 }
 
 /**
- * eventfs_create_events_dir - create the trace event structure
- * @name: the name of the directory to create.
- * @parent: parent dentry for this file.  This should be a directory dentry
- *          if set.  If this parameter is NULL, then the directory will be
- *          created in the root of the tracefs filesystem.
+ * eventfs_create_events_dir - create the top level events directory
+ * @name: The name of the top level directory to create.
+ * @parent: Parent dentry for this file in the tracefs directory.
+ * @entries: A list of entries that represent the files under this directory
+ * @size: The number of @entries
+ * @data: The default data to pass to the files (an entry may override it).
  *
  * This function creates the top of the trace event directory.
  */
-struct dentry *eventfs_create_events_dir(const char *name,
-					 struct dentry *parent)
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+						const struct eventfs_entry *entries,
+						int size, void *data)
 {
 	struct dentry *dentry = tracefs_start_creating(name, parent);
 	struct eventfs_inode *ei;
@@ -581,19 +731,32 @@ struct dentry *eventfs_create_events_dir(const char *name,
 		return NULL;
 
 	if (IS_ERR(dentry))
-		return dentry;
+		return (struct eventfs_inode *)dentry;
 
 	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
 	if (!ei)
-		return ERR_PTR(-ENOMEM);
+		goto fail;
+
 	inode = tracefs_get_inode(dentry->d_sb);
-	if (unlikely(!inode)) {
-		kfree(ei);
-		tracefs_failed_creating(dentry);
-		return ERR_PTR(-ENOMEM);
+	if (unlikely(!inode))
+		goto fail;
+
+	if (size) {
+		ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
+		if (!ei->d_children)
+			goto fail;
 	}
 
-	INIT_LIST_HEAD(&ei->e_top_files);
+	ei->dentry = dentry;
+	ei->entries = entries;
+	ei->nr_entries = size;
+	ei->data = data;
+	ei->name = kstrdup_const(name, GFP_KERNEL);
+	if (!ei->name)
+		goto fail;
+
+	INIT_LIST_HEAD(&ei->children);
+	INIT_LIST_HEAD(&ei->list);
 
 	ti = get_tracefs(inode);
 	ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
@@ -608,193 +771,41 @@ struct dentry *eventfs_create_events_dir(const char *name,
 	d_instantiate(dentry, inode);
 	inc_nlink(dentry->d_parent->d_inode);
 	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
-	return tracefs_end_creating(dentry);
-}
+	tracefs_end_creating(dentry);
 
-/**
- * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
- * @name: the name of the file to create.
- * @parent: parent dentry for this dir.
- *
- * This function adds eventfs subsystem dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
-					       struct dentry *parent)
-{
-	struct tracefs_inode *ti_parent;
-	struct eventfs_inode *ei_parent;
-	struct eventfs_file *ef;
+	/* Will call dput when the directory is removed */
+	dget(dentry);
 
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return NULL;
-
-	if (!parent)
-		return ERR_PTR(-EINVAL);
-
-	ti_parent = get_tracefs(parent->d_inode);
-	ei_parent = ti_parent->private;
+	return ei;
 
-	ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
-	if (IS_ERR(ef))
-		return ef;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ei_parent->e_top_files);
-	ef->d_parent = parent;
-	mutex_unlock(&eventfs_mutex);
-	return ef;
+ fail:
+	kfree(ei->d_children);
+	kfree(ei);
+	tracefs_failed_creating(dentry);
+	return ERR_PTR(-ENOMEM);
 }
 
-/**
- * eventfs_add_dir - add eventfs dir to list to create later
- * @name: the name of the file to create.
- * @ef_parent: parent eventfs_file for this dir.
- *
- * This function adds eventfs dir to list.
- * And all these dirs are created on the fly when they are looked up,
- * and the dentry and inodes will be removed when they are done.
- */
-struct eventfs_file *eventfs_add_dir(const char *name,
-				     struct eventfs_file *ef_parent)
+static void free_ei(struct rcu_head *head)
 {
-	struct eventfs_file *ef;
-
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return NULL;
+	struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
 
-	if (!ef_parent)
-		return ERR_PTR(-EINVAL);
-
-	ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
-	if (IS_ERR(ef))
-		return ef;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
-	ef->d_parent = ef_parent->dentry;
-	mutex_unlock(&eventfs_mutex);
-	return ef;
-}
-
-/**
- * eventfs_add_events_file - add the data needed to create a file for later reference
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: parent dentry for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * dentry/inode within the top level events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_events_file(const char *name, umode_t mode,
-			 struct dentry *parent, void *data,
-			 const struct file_operations *fop)
-{
-	struct tracefs_inode *ti;
-	struct eventfs_inode *ei;
-	struct eventfs_file *ef;
-
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return -ENODEV;
-
-	if (!parent)
-		return -EINVAL;
-
-	if (!(mode & S_IFMT))
-		mode |= S_IFREG;
-
-	if (!parent->d_inode)
-		return -EINVAL;
-
-	ti = get_tracefs(parent->d_inode);
-	if (!(ti->flags & TRACEFS_EVENT_INODE))
-		return -EINVAL;
-
-	ei = ti->private;
-	ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-
-	if (IS_ERR(ef))
-		return -ENOMEM;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ei->e_top_files);
-	ef->d_parent = parent;
-	mutex_unlock(&eventfs_mutex);
-	return 0;
-}
-
-/**
- * eventfs_add_file - add eventfs file to list to create later
- * @name: the name of the file to create.
- * @mode: the permission that the file should have.
- * @ef_parent: parent eventfs_file for this file.
- * @data: something that the caller will want to get to later on.
- * @fop: struct file_operations that should be used for this file.
- *
- * This function is used to add the information needed to create a
- * file within a subdirectory of the events directory. The file created
- * will have the @mode permissions. The @data will be used to fill the
- * inode.i_private when the open() call is done. The dentry and inodes are
- * all created when they are referenced, and removed when they are no
- * longer referenced.
- */
-int eventfs_add_file(const char *name, umode_t mode,
-		     struct eventfs_file *ef_parent,
-		     void *data,
-		     const struct file_operations *fop)
-{
-	struct eventfs_file *ef;
-
-	if (security_locked_down(LOCKDOWN_TRACEFS))
-		return -ENODEV;
-
-	if (!ef_parent)
-		return -EINVAL;
-
-	if (!(mode & S_IFMT))
-		mode |= S_IFREG;
-
-	ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
-	if (IS_ERR(ef))
-		return -ENOMEM;
-
-	mutex_lock(&eventfs_mutex);
-	list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
-	ef->d_parent = ef_parent->dentry;
-	mutex_unlock(&eventfs_mutex);
-	return 0;
-}
-
-static void free_ef(struct rcu_head *head)
-{
-	struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
-
-	kfree(ef->name);
-	kfree(ef->ei);
-	kfree(ef);
+	kfree_const(ei->name);
+	kfree(ei->d_children);
+	kfree(ei);
 }
 
 /**
  * eventfs_remove_rec - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
- * @head: to create list of eventfs_file to be deleted
- * @level: to check recursion depth
+ * @ei: eventfs_inode to be removed.
  *
- * The helper function eventfs_remove_rec() is used to clean up and free the
- * associated data from eventfs for both of the added functions.
+ * This function recursively remove eventfs_inode which
+ * contains info of file or dir.
  */
-static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
+static void eventfs_remove_rec(struct eventfs_inode *ei, struct list_head *head, int level)
 {
-	struct eventfs_file *ef_child;
+	struct eventfs_inode *ei_child;
 
-	if (!ef)
+	if (!ei)
 		return;
 	/*
 	 * Check recursion depth. It should never be greater than 3:
@@ -806,62 +817,68 @@ static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head,
 	if (WARN_ON_ONCE(level > 3))
 		return;
 
-	if (ef->ei) {
-		/* search for nested folders or files */
-		list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
-					 lockdep_is_held(&eventfs_mutex)) {
-			eventfs_remove_rec(ef_child, head, level + 1);
-		}
+	/* search for nested folders or files */
+	list_for_each_entry_srcu(ei_child, &ei->children, list,
+				 lockdep_is_held(&eventfs_mutex)) {
+		eventfs_remove_rec(ei_child, head, level + 1);
 	}
 
-	list_del_rcu(&ef->list);
-	list_add_tail(&ef->del_list, head);
+	list_del_rcu(&ei->list);
+	list_add_tail(&ei->del_list, head);
 }
 
+static void unhook_dentry(struct dentry **dentry, struct dentry **list)
+{
+	if (*dentry) {
+		unsigned long ptr = (unsigned long)*list;
+
+		/* Keep the dentry from being freed yet */
+		dget(*dentry);
+
+		/*
+		 * Paranoid: The dget() above should prevent the dentry
+		 * from being freed and calling eventfs_set_ei_status_free().
+		 * But just in case, set the link list LSB pointer to 1
+		 * and have eventfs_set_ei_status_free() check that to
+		 * make sure that if it does happen, it will not think
+		 * the d_fsdata is an eventfs_inode.
+		 *
+		 * For this to work, no eventfs_inode should be allocated
+		 * on a odd space, as the ef should always be allocated
+		 * to be at least word aligned. Check for that too.
+		 */
+		WARN_ON_ONCE(ptr & 1);
+
+		(*dentry)->d_fsdata = (void *)(ptr | 1);
+		*list = *dentry;
+		*dentry = NULL;
+	}
+}
 /**
  * eventfs_remove - remove eventfs dir or file from list
- * @ef: eventfs_file to be removed.
+ * @ei: eventfs_inode to be removed.
  *
  * This function acquire the eventfs_mutex lock and call eventfs_remove_rec()
  */
-void eventfs_remove(struct eventfs_file *ef)
+void eventfs_remove_dir(struct eventfs_inode *ei)
 {
-	struct eventfs_file *tmp;
-	LIST_HEAD(ef_del_list);
+	struct eventfs_inode *tmp;
+	LIST_HEAD(ei_del_list);
 	struct dentry *dentry_list = NULL;
 	struct dentry *dentry;
+	int i;
 
-	if (!ef)
+	if (!ei)
 		return;
 
 	mutex_lock(&eventfs_mutex);
-	eventfs_remove_rec(ef, &ef_del_list, 0);
-	list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
-		if (ef->dentry) {
-			unsigned long ptr = (unsigned long)dentry_list;
-
-			/* Keep the dentry from being freed yet */
-			dget(ef->dentry);
-
-			/*
-			 * Paranoid: The dget() above should prevent the dentry
-			 * from being freed and calling eventfs_set_ef_status_free().
-			 * But just in case, set the link list LSB pointer to 1
-			 * and have eventfs_set_ef_status_free() check that to
-			 * make sure that if it does happen, it will not think
-			 * the d_fsdata is an event_file.
-			 *
-			 * For this to work, no event_file should be allocated
-			 * on a odd space, as the ef should always be allocated
-			 * to be at least word aligned. Check for that too.
-			 */
-			WARN_ON_ONCE(ptr & 1);
-
-			ef->dentry->d_fsdata = (void *)(ptr | 1);
-			dentry_list = ef->dentry;
-			ef->dentry = NULL;
-		}
-		call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
+	eventfs_remove_rec(ei, &ei_del_list, 0);
+
+	list_for_each_entry_safe(ei, tmp, &ei_del_list, del_list) {
+		for (i = 0; i < ei->nr_entries; i++)
+			unhook_dentry(&ei->d_children[i], &dentry_list);
+		unhook_dentry(&ei->dentry, &dentry_list);
+		call_srcu(&eventfs_srcu, &ei->rcu, free_ei);
 	}
 	mutex_unlock(&eventfs_mutex);
 
@@ -876,8 +893,8 @@ void eventfs_remove(struct eventfs_file *ef)
 		mutex_lock(&eventfs_mutex);
 		/* dentry should now have at least a single reference */
 		WARN_ONCE((int)d_count(dentry) < 1,
-			  "dentry %p less than one reference (%d) after invalidate\n",
-			  dentry, d_count(dentry));
+			  "dentry %px (%s) less than one reference (%d) after invalidate\n",
+			  dentry, dentry->d_name.name, d_count(dentry));
 		mutex_unlock(&eventfs_mutex);
 		dput(dentry);
 	}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 891653ba9cf3..34ffb2f8114e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
 
 	ti = get_tracefs(inode);
 	if (ti && ti->flags & TRACEFS_EVENT_INODE)
-		eventfs_set_ef_status_free(ti, dentry);
+		eventfs_set_ei_status_free(ti, dentry);
 	iput(inode);
 }
 
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 4f2e49e2197b..298d3ecaf621 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -13,6 +13,41 @@ struct tracefs_inode {
 	struct inode            vfs_inode;
 };
 
+/*
+ * struct eventfs_inode - hold the properties of the eventfs directories.
+ * @list:	link list into the parent directory
+ * @entries:	the array of entries representing the files in the directory
+ * @name:	the name of the directory to create
+ * @children:	link list into the child eventfs_inode
+ * @dentry:     the dentry of the directory
+ * @d_parent:   pointer to the parent's dentry
+ * @d_children: The array of dentries to represent the files when created
+ * @data:	The private data to pass to the callbacks
+ * @nr_entries: The number of items in @entries
+ */
+struct eventfs_inode {
+	struct list_head		list;
+	const struct eventfs_entry	*entries;
+	const char			*name;
+	struct list_head		children;
+	struct dentry			*dentry;
+	struct dentry			*d_parent;
+	struct dentry			**d_children;
+	void				*data;
+	/*
+	 * Union - used for deletion
+	 * @del_list:	list of eventfs_inode to delete
+	 * @rcu:	eventfs_indoe to delete in RCU
+	 * @is_freed:	node is freed if one of the above is set
+	 */
+	union {
+		struct list_head	del_list;
+		struct rcu_head		rcu;
+		unsigned long		is_freed;
+	};
+	int				nr_entries;
+};
+
 static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
 {
 	return container_of(inode, struct tracefs_inode, vfs_inode);
@@ -25,6 +60,6 @@ struct inode *tracefs_get_inode(struct super_block *sb);
 struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
 struct dentry *eventfs_failed_creating(struct dentry *dentry);
 struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
 
 #endif /* _TRACEFS_INTERNAL_H */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 21ae37e49319..12207dc6722d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -649,7 +649,7 @@ struct trace_event_file {
 	struct list_head		list;
 	struct trace_event_call		*event_call;
 	struct event_filter __rcu	*filter;
-	struct eventfs_file             *ef;
+	struct eventfs_inode		*ei;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 009072792fa3..0c39704455d9 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -23,26 +23,25 @@ struct file_operations;
 
 struct eventfs_file;
 
-struct dentry *eventfs_create_events_dir(const char *name,
-					 struct dentry *parent);
+typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
+				const struct file_operations **fops);
 
-struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
-					       struct dentry *parent);
+struct eventfs_entry {
+	const char			*name;
+	eventfs_callback		callback;
+};
 
-struct eventfs_file *eventfs_add_dir(const char *name,
-				     struct eventfs_file *ef_parent);
+struct eventfs_inode;
 
-int eventfs_add_file(const char *name, umode_t mode,
-		     struct eventfs_file *ef_parent, void *data,
-		     const struct file_operations *fops);
+struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
+						const struct eventfs_entry *entries,
+						int size, void *data);
 
-int eventfs_add_events_file(const char *name, umode_t mode,
-			 struct dentry *parent, void *data,
-			 const struct file_operations *fops);
+struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
+					 const struct eventfs_entry *entries,
+					 int size, void *data);
 
-void eventfs_remove(struct eventfs_file *ef);
-
-void eventfs_remove_events_dir(struct dentry *dentry);
+void eventfs_remove_dir(struct eventfs_inode *ei);
 
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd6395692ff9..4383be8fa1b0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9764,7 +9764,6 @@ static __init void create_trace_instances(struct dentry *d_tracer)
 static void
 init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
-	struct trace_event_file *file;
 	int cpu;
 
 	trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
@@ -9797,11 +9796,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_marker", 0220, d_tracer,
 			  tr, &tracing_mark_fops);
 
-	file = __find_event_file(tr, "ftrace", "print");
-	if (file && file->ef)
-		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
-				  file, &event_trigger_fops);
-	tr->trace_marker_file = file;
+	tr->trace_marker_file = __find_event_file(tr, "ftrace", "print");
 
 	trace_create_file("trace_marker_raw", 0220, d_tracer,
 			  tr, &tracing_mark_raw_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e92cb9c1292f..0e1405abf4f7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -381,7 +381,7 @@ struct trace_array {
 	struct dentry		*dir;
 	struct dentry		*options;
 	struct dentry		*percpu_dir;
-	struct dentry		*event_dir;
+	struct eventfs_inode	*event_dir;
 	struct trace_options	*topts;
 	struct list_head	systems;
 	struct list_head	events;
@@ -1349,7 +1349,7 @@ struct trace_subsystem_dir {
 	struct list_head		list;
 	struct event_subsystem		*subsystem;
 	struct trace_array		*tr;
-	struct eventfs_file             *ef;
+	struct eventfs_inode		*ei;
 	int				ref_count;
 	int				nr_events;
 };
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 099b9b6bbdc4..a3b9d9423824 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -984,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 		return;
 
 	if (!--dir->nr_events) {
-		eventfs_remove(dir->ef);
+		eventfs_remove_dir(dir->ei);
 		list_del(&dir->list);
 		__put_system_dir(dir);
 	}
@@ -992,7 +992,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 
 static void remove_event_file_dir(struct trace_event_file *file)
 {
-	eventfs_remove(file->ef);
+	eventfs_remove_dir(file->ei);
 	list_del(&file->list);
 	remove_subsystem(file->system);
 	free_event_filter(file->filter);
@@ -2282,14 +2282,40 @@ create_new_subsystem(const char *name)
 	return NULL;
 }
 
-static struct eventfs_file *
+int system_callback(const char *name, umode_t *mode, void **data,
+		    const struct file_operations **fops)
+{
+	if (strcmp(name, "filter") == 0)
+		*fops = &ftrace_subsystem_filter_fops;
+
+	else if (strcmp(name, "enable") == 0)
+		*fops = &ftrace_system_enable_fops;
+
+	else
+		return 0;
+
+	*mode = TRACE_MODE_WRITE;
+	return 1;
+}
+
+static struct eventfs_inode *
 event_subsystem_dir(struct trace_array *tr, const char *name,
-		    struct trace_event_file *file, struct dentry *parent)
+		    struct trace_event_file *file, struct eventfs_inode *parent)
 {
 	struct event_subsystem *system, *iter;
 	struct trace_subsystem_dir *dir;
-	struct eventfs_file *ef;
-	int res;
+	struct eventfs_inode *ei;
+	int nr_entries;
+	static struct eventfs_entry system_entries[] = {
+		{
+			.name		= "filter",
+			.callback	= system_callback,
+		},
+		{
+			.name		= "enable",
+			.callback	= system_callback,
+		}
+	};
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(dir, &tr->systems, list) {
@@ -2297,7 +2323,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 		if (strcmp(system->name, name) == 0) {
 			dir->nr_events++;
 			file->system = dir;
-			return dir->ef;
+			return dir->ei;
 		}
 	}
 
@@ -2321,39 +2347,29 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	} else
 		__get_system(system);
 
-	ef = eventfs_add_subsystem_dir(name, parent);
-	if (IS_ERR(ef)) {
+	/* ftrace only has directories no files */
+	if (strcmp(name, "ftrace") == 0)
+		nr_entries = 0;
+	else
+		nr_entries = ARRAY_SIZE(system_entries);
+
+	ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
+	if (!ei) {
 		pr_warn("Failed to create system directory %s\n", name);
 		__put_system(system);
 		goto out_free;
 	}
 
-	dir->ef = ef;
+	dir->ei = ei;
 	dir->tr = tr;
 	dir->ref_count = 1;
 	dir->nr_events = 1;
 	dir->subsystem = system;
 	file->system = dir;
 
-	/* the ftrace system is special, do not create enable or filter files */
-	if (strcmp(name, "ftrace") != 0) {
-
-		res = eventfs_add_file("filter", TRACE_MODE_WRITE,
-					    dir->ef, dir,
-					    &ftrace_subsystem_filter_fops);
-		if (res) {
-			kfree(system->filter);
-			system->filter = NULL;
-			pr_warn("Could not create tracefs '%s/filter' entry\n", name);
-		}
-
-		eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir,
-				  &ftrace_system_enable_fops);
-	}
-
 	list_add(&dir->list, &tr->systems);
 
-	return dir->ef;
+	return dir->ei;
 
  out_free:
 	kfree(dir);
@@ -2402,15 +2418,134 @@ event_define_fields(struct trace_event_call *call)
 	return ret;
 }
 
+static int event_callback(const char *name, umode_t *mode, void **data,
+			  const struct file_operations **fops)
+{
+	struct trace_event_file *file = *data;
+	struct trace_event_call *call = file->event_call;
+
+	if (strcmp(name, "format") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &ftrace_event_format_fops;
+		*data = call;
+		return 1;
+	}
+
+	/*
+	 * Only event directories that can be enabled should have
+	 * triggers or filters, with the exception of the "print"
+	 * event that can have a "trigger" file.
+	 */
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+		if (call->class->reg && strcmp(name, "enable") == 0) {
+			*mode = TRACE_MODE_WRITE;
+			*fops = &ftrace_enable_fops;
+			return 1;
+		}
+
+		if (strcmp(name, "filter") == 0) {
+			*mode = TRACE_MODE_WRITE;
+			*fops = &ftrace_event_filter_fops;
+			return 1;
+		}
+	}
+
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+	    strcmp(trace_event_name(call), "print") == 0) {
+		if (strcmp(name, "trigger") == 0) {
+			*mode = TRACE_MODE_WRITE;
+			*fops = &event_trigger_fops;
+			return 1;
+		}
+	}
+
+#ifdef CONFIG_PERF_EVENTS
+	if (call->event.type && call->class->reg &&
+	    strcmp(name, "id") == 0) {
+		*mode = TRACE_MODE_READ;
+		*data = (void *)(long)call->event.type;
+		*fops = &ftrace_event_id_fops;
+		return 1;
+	}
+#endif
+
+#ifdef CONFIG_HIST_TRIGGERS
+	if (strcmp(name, "hist") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &event_hist_fops;
+		return 1;
+	}
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+	if (strcmp(name, "hist_debug") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &event_hist_debug_fops;
+		return 1;
+	}
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+	if (call->event.type && call->class->reg &&
+	    strcmp(name, "inject") == 0) {
+		*mode = 0200;
+		*fops = &event_inject_fops;
+		return 1;
+	}
+#endif
+	return 0;
+}
+
 static int
-event_create_dir(struct dentry *parent, struct trace_event_file *file)
+event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 {
 	struct trace_event_call *call = file->event_call;
-	struct eventfs_file *ef_subsystem = NULL;
 	struct trace_array *tr = file->tr;
-	struct eventfs_file *ef;
+	struct eventfs_inode *e_events;
+	struct eventfs_inode *ei;
 	const char *name;
+	int nr_entries;
 	int ret;
+	static struct eventfs_entry event_entries[] = {
+		{
+			.name		= "enable",
+			.callback	= event_callback,
+		},
+		{
+			.name		= "filter",
+			.callback	= event_callback,
+		},
+		{
+			.name		= "trigger",
+			.callback	= event_callback,
+		},
+		{
+			.name		= "format",
+			.callback	= event_callback,
+		},
+#ifdef CONFIG_PERF_EVENTS
+		{
+			.name		= "id",
+			.callback	= event_callback,
+		},
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+		{
+			.name		= "hist",
+			.callback	= event_callback,
+		},
+#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+		{
+			.name		= "hist_debug",
+			.callback	= event_callback,
+		},
+#endif
+#ifdef CONFIG_TRACE_EVENT_INJECT
+		{
+			.name		= "inject",
+			.callback	= event_callback,
+		},
+#endif
+	};
 
 	/*
 	 * If the trace point header did not define TRACE_SYSTEM
@@ -2420,29 +2555,20 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0))
 		return -ENODEV;
 
-	ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent);
-	if (!ef_subsystem)
+	e_events = event_subsystem_dir(tr, call->class->system, file, parent);
+	if (!e_events)
 		return -ENOMEM;
 
+	nr_entries = ARRAY_SIZE(event_entries);
+
 	name = trace_event_name(call);
-	ef = eventfs_add_dir(name, ef_subsystem);
-	if (IS_ERR(ef)) {
+	ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
+	if (IS_ERR(ei)) {
 		pr_warn("Could not create tracefs '%s' directory\n", name);
 		return -1;
 	}
 
-	file->ef = ef;
-
-	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
-		eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file,
-				  &ftrace_enable_fops);
-
-#ifdef CONFIG_PERF_EVENTS
-	if (call->event.type && call->class->reg)
-		eventfs_add_file("id", TRACE_MODE_READ, file->ef,
-				  (void *)(long)call->event.type,
-				  &ftrace_event_id_fops);
-#endif
+	file->ei = ei;
 
 	ret = event_define_fields(call);
 	if (ret < 0) {
@@ -2450,35 +2576,6 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 		return ret;
 	}
 
-	/*
-	 * Only event directories that can be enabled should have
-	 * triggers or filters.
-	 */
-	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
-		eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef,
-				  file, &ftrace_event_filter_fops);
-
-		eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef,
-				  file, &event_trigger_fops);
-	}
-
-#ifdef CONFIG_HIST_TRIGGERS
-	eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file,
-			  &event_hist_fops);
-#endif
-#ifdef CONFIG_HIST_TRIGGERS_DEBUG
-	eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file,
-			  &event_hist_debug_fops);
-#endif
-	eventfs_add_file("format", TRACE_MODE_READ, file->ef, call,
-			  &ftrace_event_format_fops);
-
-#ifdef CONFIG_TRACE_EVENT_INJECT
-	if (call->event.type && call->class->reg)
-		eventfs_add_file("inject", 0200, file->ef, file,
-				  &event_inject_fops);
-#endif
-
 	return 0;
 }
 
@@ -3623,30 +3720,65 @@ static __init int setup_trace_event(char *str)
 }
 __setup("trace_event=", setup_trace_event);
 
+static int events_callback(const char *name, umode_t *mode, void **data,
+			   const struct file_operations **fops)
+{
+	if (strcmp(name, "enable") == 0) {
+		*mode = TRACE_MODE_WRITE;
+		*fops = &ftrace_tr_enable_fops;
+		return 1;
+	}
+
+	if (strcmp(name, "header_page") == 0)
+		*data = ring_buffer_print_page_header;
+
+	else if (strcmp(name, "header_event") == 0)
+		*data = ring_buffer_print_entry_header;
+
+	else
+		return 0;
+
+	*mode = TRACE_MODE_READ;
+	*fops = &ftrace_show_header_fops;
+	return 1;
+}
+
 /* Expects to have event_mutex held when called */
 static int
 create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 {
-	struct dentry *d_events;
+	struct eventfs_inode *e_events;
 	struct dentry *entry;
-	int error = 0;
+	int nr_entries;
+	static struct eventfs_entry events_entries[] = {
+		{
+			.name		= "enable",
+			.callback	= events_callback,
+		},
+		{
+			.name		= "header_page",
+			.callback	= events_callback,
+		},
+		{
+			.name		= "header_event",
+			.callback	= events_callback,
+		},
+	};
 
 	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
 				  tr, &ftrace_set_event_fops);
 	if (!entry)
 		return -ENOMEM;
 
-	d_events = eventfs_create_events_dir("events", parent);
-	if (IS_ERR(d_events)) {
+	nr_entries = ARRAY_SIZE(events_entries);
+
+	e_events = eventfs_create_events_dir("events", parent, events_entries,
+					     nr_entries, tr);
+	if (IS_ERR(e_events)) {
 		pr_warn("Could not create tracefs 'events' directory\n");
 		return -ENOMEM;
 	}
 
-	error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events,
-				  tr, &ftrace_tr_enable_fops);
-	if (error)
-		return -ENOMEM;
-
 	/* There are not as crucial, just warn if they are not created */
 
 	trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
@@ -3656,16 +3788,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 			  TRACE_MODE_WRITE, parent, tr,
 			  &ftrace_set_event_notrace_pid_fops);
 
-	/* ring buffer internal formats */
-	eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events,
-				  ring_buffer_print_page_header,
-				  &ftrace_show_header_fops);
-
-	eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events,
-				  ring_buffer_print_entry_header,
-				  &ftrace_show_header_fops);
-
-	tr->event_dir = d_events;
+	tr->event_dir = e_events;
 
 	return 0;
 }
@@ -3749,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
-	eventfs_remove_events_dir(tr->event_dir);
+	eventfs_remove_dir(tr->event_dir);
 	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;
-- 
cgit v1.2.3


From e6814ec3ba1994561db9b1c05a80227d30cc18fa Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Fri, 21 Jul 2023 09:06:07 +0000
Subject: perf/core: Rename perf_proc_update_handler() ->
 perf_event_max_sample_rate_handler(), for readability

Follow the naming pattern of the other sysctl handlers in perf.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230721090607.172002-1-xiujianfeng@huawei.com
---
 include/linux/perf_event.h | 2 +-
 kernel/events/core.c       | 4 ++--
 kernel/sysctl.c            | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..f31f962a6445 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1573,7 +1573,7 @@ extern int sysctl_perf_cpu_time_max_percent;
 
 extern void perf_sample_event_took(u64 sample_len_ns);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..af569196d760 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -449,8 +449,8 @@ static void update_perf_cpu_limits(void)
 
 static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
 
-int perf_proc_update_handler(struct ctl_table *table, int write,
-		void *buffer, size_t *lenp, loff_t *ppos)
+int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret;
 	int perf_cpu = sysctl_perf_cpu_time_max_percent;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 354a2d294f52..2b6585751891 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1983,7 +1983,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_sample_rate,
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
-		.proc_handler	= perf_proc_update_handler,
+		.proc_handler	= perf_event_max_sample_rate_handler,
 		.extra1		= SYSCTL_ONE,
 	},
 	{
-- 
cgit v1.2.3


From 441f0dd8fa035a2c7cfe972047bb905d3be05c1b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Sep 2023 19:53:10 +0300
Subject: resource: Reuse for_each_resource() macro

We have a few places where for_each_resource() is open coded.
Replace that by the macro. This makes code easier to read and
understand.

With this, compile r_next() only for CONFIG_PROC_FS=y.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230912165312.402422-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/resource.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index b1763b2fd7ef..86716cd566e9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -77,13 +77,6 @@ static struct resource *next_resource_skip_children(struct resource *p)
 	     (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
 				       next_resource(_p))
 
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct resource *p = v;
-	(*pos)++;
-	return (void *)next_resource(p);
-}
-
 #ifdef CONFIG_PROC_FS
 
 enum { MAX_IORES_LEVEL = 5 };
@@ -91,14 +84,26 @@ enum { MAX_IORES_LEVEL = 5 };
 static void *r_start(struct seq_file *m, loff_t *pos)
 	__acquires(resource_lock)
 {
-	struct resource *p = pde_data(file_inode(m->file));
-	loff_t l = 0;
+	struct resource *root = pde_data(file_inode(m->file));
+	struct resource *p;
+	loff_t l = *pos;
+
 	read_lock(&resource_lock);
-	for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
-		;
+	for_each_resource(root, p, false) {
+		if (l-- == 0)
+			break;
+	}
+
 	return p;
 }
 
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct resource *p = v;
+	(*pos)++;
+	return (void *)next_resource(p);
+}
+
 static void r_stop(struct seq_file *m, void *v)
 	__releases(resource_lock)
 {
@@ -336,7 +341,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
 	read_lock(&resource_lock);
 
-	for (p = iomem_resource.child; p; p = next_resource(p)) {
+	for_each_resource(&iomem_resource, p, false) {
 		/* If we passed the resource we are looking for, stop */
 		if (p->start > end) {
 			p = NULL;
@@ -1641,13 +1646,12 @@ __setup("reserve=", reserve_setup);
  */
 int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
 {
-	struct resource *p = &iomem_resource;
 	resource_size_t end = addr + size - 1;
+	struct resource *p;
 	int err = 0;
-	loff_t l;
 
 	read_lock(&resource_lock);
-	for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+	for_each_resource(&iomem_resource, p, false) {
 		/*
 		 * We can probably skip the resources without
 		 * IORESOURCE_IO attribute?
-- 
cgit v1.2.3


From 10dabdf45ed34caaaad97978306fe6e9ee7581d9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 12 Sep 2023 19:53:11 +0300
Subject: resource: Unify next_resource() and next_resource_skip_children()

We have the next_resource() is used once and no user for the
next_resource_skip_children() outside of the for_each_resource().

Unify them by adding skip_children parameter to the next_resource().

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20230912165312.402422-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/resource.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 86716cd566e9..866ef3663a0b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,26 +56,17 @@ struct resource_constraint {
 
 static DEFINE_RWLOCK(resource_lock);
 
-static struct resource *next_resource(struct resource *p)
+static struct resource *next_resource(struct resource *p, bool skip_children)
 {
-	if (p->child)
+	if (!skip_children && p->child)
 		return p->child;
 	while (!p->sibling && p->parent)
 		p = p->parent;
 	return p->sibling;
 }
 
-static struct resource *next_resource_skip_children(struct resource *p)
-{
-	while (!p->sibling && p->parent)
-		p = p->parent;
-	return p->sibling;
-}
-
 #define for_each_resource(_root, _p, _skip_children) \
-	for ((_p) = (_root)->child; (_p); \
-	     (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
-				       next_resource(_p))
+	for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
 
 #ifdef CONFIG_PROC_FS
 
@@ -100,8 +91,10 @@ static void *r_start(struct seq_file *m, loff_t *pos)
 static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct resource *p = v;
+
 	(*pos)++;
-	return (void *)next_resource(p);
+
+	return (void *)next_resource(p, false);
 }
 
 static void r_stop(struct seq_file *m, void *v)
-- 
cgit v1.2.3


From 2819f23ac12ce93ff79ca7a54597df9a4a1f6331 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 5 Oct 2023 09:13:48 -0400
Subject: eventfs: Use eventfs_remove_events_dir()

The update to removing the eventfs_file changed the way the events top
level directory was handled. Instead of returning a dentry, it now returns
the eventfs_inode. In this changed, the removing of the events top level
directory is not much different than removing any of the other
directories. Because of this, the removal just called eventfs_remove_dir()
instead of eventfs_remove_events_dir().

Although eventfs_remove_dir() does the clean up, it misses out on the
dget() of the ei->dentry done in eventfs_create_events_dir(). It makes
more sense to match eventfs_create_events_dir() with a specific function
eventfs_remove_events_dir() and this specific function can then perform
the dput() to the dentry that had the dget() when it was created.

Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c    | 19 +++++++------------
 include/linux/tracefs.h     |  1 +
 kernel/trace/trace_events.c |  2 +-
 3 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index eab18b157ef5..1ccd100bc565 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -901,22 +901,17 @@ void eventfs_remove_dir(struct eventfs_inode *ei)
 }
 
 /**
- * eventfs_remove_events_dir - remove eventfs dir or file from list
- * @dentry: events's dentry to be removed.
+ * eventfs_remove_events_dir - remove the top level eventfs directory
+ * @ei: the event_inode returned by eventfs_create_events_dir().
  *
- * This function remove events main directory
+ * This function removes the events main directory
  */
-void eventfs_remove_events_dir(struct dentry *dentry)
+void eventfs_remove_events_dir(struct eventfs_inode *ei)
 {
-	struct tracefs_inode *ti;
-
-	if (!dentry || !dentry->d_inode)
-		return;
+	struct dentry *dentry = ei->dentry;
 
-	ti = get_tracefs(dentry->d_inode);
-	if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
-		return;
+	eventfs_remove_dir(ei);
 
-	d_invalidate(dentry);
+	/* Matches the dget() from eventfs_create_events_dir() */
 	dput(dentry);
 }
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 0c39704455d9..13359b1a35d1 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -41,6 +41,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 					 const struct eventfs_entry *entries,
 					 int size, void *data);
 
+void eventfs_remove_events_dir(struct eventfs_inode *ei);
 void eventfs_remove_dir(struct eventfs_inode *ei);
 
 struct dentry *tracefs_create_file(const char *name, umode_t mode,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a3b9d9423824..0e3a1c70e410 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3872,7 +3872,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	down_write(&trace_event_sem);
 	__trace_remove_event_dirs(tr);
-	eventfs_remove_dir(tr->event_dir);
+	eventfs_remove_events_dir(tr->event_dir);
 	up_write(&trace_event_sem);
 
 	tr->event_dir = NULL;
-- 
cgit v1.2.3


From 5ddd8baa4857709b4e5d84b376d735152851955b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 5 Oct 2023 10:47:45 -0400
Subject: tracing: Make system_callback() function static

The system_callback() function in trace_events.c is only used within that
file. The "static" annotation was missed.

Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310051743.y9EobbUr-lkp@intel.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0e3a1c70e410..db46d2116500 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2282,7 +2282,7 @@ create_new_subsystem(const char *name)
 	return NULL;
 }
 
-int system_callback(const char *name, umode_t *mode, void **data,
+static int system_callback(const char *name, umode_t *mode, void **data,
 		    const struct file_operations **fops)
 {
 	if (strcmp(name, "filter") == 0)
-- 
cgit v1.2.3


From 9e0bc36ab07c550d791bf17feeb479f1dfc42d89 Mon Sep 17 00:00:00 2001
From: Xuewen Yan <xuewen.yan@unisoc.com>
Date: Wed, 19 Jul 2023 21:05:27 +0800
Subject: cpufreq: schedutil: Update next_freq when cpufreq_limits change

When cpufreq's policy is 'single', there is a scenario that will
cause sg_policy's next_freq to be unable to update.

When the CPU's util is always max, the cpufreq will be max,
and then if we change the policy's scaling_max_freq to be a
lower freq, indeed, the sg_policy's next_freq need change to
be the lower freq, however, because the cpu_is_busy, the next_freq
would keep the max_freq.

For example:

The cpu7 is a single CPU:

  unisoc:/sys/devices/system/cpu/cpufreq/policy7 # while true;do done& [1] 4737
  unisoc:/sys/devices/system/cpu/cpufreq/policy7 # taskset -p 80 4737
  pid 4737's current affinity mask: ff
  pid 4737's new affinity mask: 80
  unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_max_freq
  2301000
  unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_cur_freq
  2301000
  unisoc:/sys/devices/system/cpu/cpufreq/policy7 # echo 2171000 > scaling_max_freq
  unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_max_freq
  2171000

At this time, the sg_policy's next_freq would stay at 2301000, which
is wrong.

To fix this, add a check for the ->need_freq_update flag.

[ mingo: Clarified the changelog. ]

Co-developed-by: Guohua Yan <guohua.yan@unisoc.com>
Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
Signed-off-by: Guohua Yan <guohua.yan@unisoc.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: "Rafael J. Wysocki" <rafael@kernel.org>
Link: https://lore.kernel.org/r/20230719130527.8074-1-xuewen.yan@unisoc.com
---
 kernel/sched/cpufreq_schedutil.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4492608b7d7f..458d359f5991 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -350,7 +350,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	 * Except when the rq is capped by uclamp_max.
 	 */
 	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+	    !sg_policy->need_freq_update) {
 		next_f = sg_policy->next_freq;
 
 		/* Restore cached freq as next_freq has changed */
-- 
cgit v1.2.3


From 16a03c71bba012b4cc31f34e00007c34ef5a58df Mon Sep 17 00:00:00 2001
From: Liao Chang <liaochang1@huawei.com>
Date: Fri, 8 Sep 2023 03:16:04 +0000
Subject: cpufreq: schedutil: Merge initialization code of sg_cpu in single
 loop

The initialization code of the per-cpu sg_cpu struct is currently split
into two for-loop blocks. This can be simplified by merging the two
blocks into a single loop. This will make the code more maintainable.

Signed-off-by: Liao Chang <liaochang1@huawei.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4492608b7d7f..f3a95def49cc 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -767,14 +767,6 @@ static int sugov_start(struct cpufreq_policy *policy)
 
 	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
 
-	for_each_cpu(cpu, policy->cpus) {
-		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
-
-		memset(sg_cpu, 0, sizeof(*sg_cpu));
-		sg_cpu->cpu			= cpu;
-		sg_cpu->sg_policy		= sg_policy;
-	}
-
 	if (policy_is_shared(policy))
 		uu = sugov_update_shared;
 	else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
@@ -785,6 +777,9 @@ static int sugov_start(struct cpufreq_policy *policy)
 	for_each_cpu(cpu, policy->cpus) {
 		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 
+		memset(sg_cpu, 0, sizeof(*sg_cpu));
+		sg_cpu->cpu = cpu;
+		sg_cpu->sg_policy = sg_policy;
 		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
 	}
 	return 0;
-- 
cgit v1.2.3


From e7a1b32e43b194bbf930281ae7f5149c420cd122 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Thu, 5 Oct 2023 15:41:20 +0200
Subject: cpufreq: Rebuild sched-domains when removing cpufreq driver

The Energy Aware Scheduler (EAS) relies on the schedutil governor.
When moving to/from the schedutil governor, sched domains must be
rebuilt to allow re-evaluating the enablement conditions of EAS.
This is done through sched_cpufreq_governor_change().

Having a cpufreq governor assumes a cpufreq driver is running.
Inserting/removing a cpufreq driver should trigger a re-evaluation
of EAS enablement conditions, avoiding to see EAS enabled when
removing a running cpufreq driver.

Rebuild the sched domains in schedutil's sugov_init()/sugov_exit(),
allowing to check EAS's enablement condition whenever schedutil
governor is initialized/exited from.
Move relevant code up in schedutil.c to avoid a split and conditional
function declaration.
Rename sched_cpufreq_governor_change() to sugov_eas_rebuild_sd().

Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c        |  3 +--
 include/linux/cpufreq.h          |  8 ------
 kernel/sched/cpufreq_schedutil.c | 55 +++++++++++++++++++++-------------------
 3 files changed, 30 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 60ed89000e82..4bc15634d49c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1544,7 +1544,7 @@ static int cpufreq_online(unsigned int cpu)
 
 		/*
 		 * Register with the energy model before
-		 * sched_cpufreq_governor_change() is called, which will result
+		 * sugov_eas_rebuild_sd() is called, which will result
 		 * in rebuilding of the sched domains, which should only be done
 		 * once the energy model is properly initialized for the policy
 		 * first.
@@ -2652,7 +2652,6 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 		ret = cpufreq_start_governor(policy);
 		if (!ret) {
 			pr_debug("governor change\n");
-			sched_cpufreq_governor_change(policy, old_gov);
 			return 0;
 		}
 		cpufreq_exit_governor(policy);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 71d186d6933a..1c5ca92a0555 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1193,14 +1193,6 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_
 }
 #endif
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-			struct cpufreq_governor *old_gov);
-#else
-static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-			struct cpufreq_governor *old_gov) { }
-#endif
-
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
 #ifndef arch_set_freq_scale
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f3a95def49cc..492ec650d48f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -555,6 +555,31 @@ static const struct kobj_type sugov_tunables_ktype = {
 
 /********************** cpufreq governor interface *********************/
 
+#ifdef CONFIG_ENERGY_MODEL
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+	rebuild_sched_domains_energy();
+}
+
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+static void sugov_eas_rebuild_sd(void)
+{
+	/*
+	 * When called from the cpufreq_register_driver() path, the
+	 * cpu_hotplug_lock is already held, so use a work item to
+	 * avoid nested locking in rebuild_sched_domains().
+	 */
+	schedule_work(&rebuild_sd_work);
+}
+#else
+static inline void sugov_eas_rebuild_sd(void) { };
+#endif
+
 struct cpufreq_governor schedutil_gov;
 
 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
@@ -709,6 +734,8 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto fail;
 
+	sugov_eas_rebuild_sd();
+
 out:
 	mutex_unlock(&global_tunables_lock);
 	return 0;
@@ -750,6 +777,8 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
 	cpufreq_disable_fast_switch(policy);
+
+	sugov_eas_rebuild_sd();
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
@@ -833,29 +862,3 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 #endif
 
 cpufreq_governor_init(schedutil_gov);
-
-#ifdef CONFIG_ENERGY_MODEL
-static void rebuild_sd_workfn(struct work_struct *work)
-{
-	rebuild_sched_domains_energy();
-}
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
-				  struct cpufreq_governor *old_gov)
-{
-	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
-		/*
-		 * When called from the cpufreq_register_driver() path, the
-		 * cpu_hotplug_lock is already held, so use a work item to
-		 * avoid nested locking in rebuild_sched_domains().
-		 */
-		schedule_work(&rebuild_sd_work);
-	}
-
-}
-#endif
-- 
cgit v1.2.3


From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001
From: Florent Revest <revest@chromium.org>
Date: Mon, 28 Aug 2023 17:08:57 +0200
Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl

This extends the current PR_SET_MDWE prctl arg with a bit to indicate that
the process doesn't want MDWE protection to propagate to children.

To implement this no-inherit mode, the tag in current->mm->flags must be
absent from MMF_INIT_MASK.  This means that the encoding for "MDWE but
without inherit" is different in the prctl than in the mm flags.  This
leads to a bit of bit-mangling in the prctl implementation.

Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org
Signed-off-by: Florent Revest <revest@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexey Izbyshev <izbyshev@ispras.ru>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ayush Jain <ayush.jain3@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/coredump.h   | 10 ++++++++++
 include/uapi/linux/prctl.h       |  1 +
 kernel/fork.c                    |  2 +-
 kernel/sys.c                     | 32 ++++++++++++++++++++++++++------
 tools/include/uapi/linux/prctl.h |  1 +
 5 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0ee96ea7a0e9..1b37fa8fc723 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm)
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
 #define MMF_VM_MERGE_ANY	29
+#define MMF_HAS_MDWE_NO_INHERIT	30
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+		flags &= ~((1UL << MMF_HAS_MDWE) |
+			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
+	return flags & MMF_INIT_MASK;
+}
+
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 9a85c69782bd..370ed14b1ae0 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -284,6 +284,7 @@ struct prctl_mm_map {
 /* Memory deny write / execute */
 #define PR_SET_MDWE			65
 # define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
+# define PR_MDWE_NO_INHERIT		(1UL << 1)
 
 #define PR_GET_MDWE			66
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1779183a7cb3..e45a4457ba83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1288,7 +1288,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	hugetlb_count_init(mm);
 
 	if (current->mm) {
-		mm->flags = current->mm->flags & MMF_INIT_MASK;
+		mm->flags = mmf_init_flags(current->mm->flags);
 		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
 	} else {
 		mm->flags = default_dump_filter;
diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..4a8073c1b255 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2368,19 +2368,41 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
+static inline unsigned long get_current_mdwe(void)
+{
+	unsigned long ret = 0;
+
+	if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+		ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+	if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+		ret |= PR_MDWE_NO_INHERIT;
+
+	return ret;
+}
+
 static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
 				 unsigned long arg4, unsigned long arg5)
 {
+	unsigned long current_bits;
+
 	if (arg3 || arg4 || arg5)
 		return -EINVAL;
 
-	if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+	if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+		return -EINVAL;
+
+	/* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+	if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
 		return -EINVAL;
 
+	current_bits = get_current_mdwe();
+	if (current_bits && current_bits != bits)
+		return -EPERM; /* Cannot unset the flags */
+
+	if (bits & PR_MDWE_NO_INHERIT)
+		set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
 	if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
 		set_bit(MMF_HAS_MDWE, &current->mm->flags);
-	else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
-		return -EPERM; /* Cannot unset the flag */
 
 	return 0;
 }
@@ -2390,9 +2412,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
 {
 	if (arg2 || arg3 || arg4 || arg5)
 		return -EINVAL;
-
-	return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
-		PR_MDWE_REFUSE_EXEC_GAIN : 0;
+	return get_current_mdwe();
 }
 
 static int prctl_get_auxv(void __user *addr, unsigned long len)
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 9a85c69782bd..370ed14b1ae0 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -284,6 +284,7 @@ struct prctl_mm_map {
 /* Memory deny write / execute */
 #define PR_SET_MDWE			65
 # define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
+# define PR_MDWE_NO_INHERIT		(1UL << 1)
 
 #define PR_GET_MDWE			66
 
-- 
cgit v1.2.3


From 84cb9cbd911a3e06c1ff31572706ba0ee3499b19 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 6 Oct 2023 13:17:00 -0700
Subject: bpf: Annotate struct bpf_stack_map with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle [1], add __counted_by for struct bpf_stack_map.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Link: https://lore.kernel.org/bpf/20231006201657.work.531-kees@kernel.org
---
 kernel/bpf/stackmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 458bb80b14d5..d6b277482085 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -28,7 +28,7 @@ struct bpf_stack_map {
 	void *elems;
 	struct pcpu_freelist freelist;
 	u32 n_buckets;
-	struct stack_map_bucket *buckets[];
+	struct stack_map_bucket *buckets[] __counted_by(n_buckets);
 };
 
 static inline bool stack_map_use_build_id(struct bpf_map *map)
-- 
cgit v1.2.3


From a4fe78386afb94780f8e6fcd10a67c4d4dfe4da8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 7 Oct 2023 00:06:49 +0200
Subject: bpf: Fix BPF_PROG_QUERY last field check

While working on the ebpf-go [0] library integration for bpf_mprog and tcx,
Lorenz noticed that two subsequent BPF_PROG_QUERY requests currently fail. A
typical workflow is to first gather the bpf_mprog count without passing program/
link arrays, followed by the second request which contains the actual array
pointers.

The initial call populates count and revision fields. The second call gets
rejected due to a BPF_PROG_QUERY_LAST_FIELD bug which should point to
query.revision instead of query.link_attach_flags since the former is really
the last member.

It was not noticed in libbpf as bpf_prog_query_opts() always calls bpf(2) with
an on-stack bpf_attr that is memset() each time (and therefore query.revision
was reset to zero).

  [0] https://ebpf-go.dev

Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support")
Reported-by: Lorenz Bauer <lmb@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20231006220655.1653-1-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb01c31ed591..453a43695a23 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3913,7 +3913,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	return ret;
 }
 
-#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags
+#define BPF_PROG_QUERY_LAST_FIELD query.revision
 
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
-- 
cgit v1.2.3


From edfa9af0a73ecc2000d7bb81d0b0fd3158cc9a65 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 7 Oct 2023 00:06:50 +0200
Subject: bpf: Handle bpf_mprog_query with NULL entry

Improve consistency for bpf_mprog_query() API and let the latter also handle
a NULL entry as can be the case for tcx. Instead of returning -ENOENT, we
copy a count of 0 and revision of 1 to user space, so that this can be fed
into a subsequent bpf_mprog_attach() call as expected_revision. A BPF self-
test as part of this series has been added to assert this case.

Suggested-by: Lorenz Bauer <lmb@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20231006220655.1653-2-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/mprog.c | 10 ++++++----
 kernel/bpf/tcx.c   |  8 +-------
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c
index 007d98c799e2..1394168062e8 100644
--- a/kernel/bpf/mprog.c
+++ b/kernel/bpf/mprog.c
@@ -401,14 +401,16 @@ int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr,
 	struct bpf_mprog_cp *cp;
 	struct bpf_prog *prog;
 	const u32 flags = 0;
+	u32 id, count = 0;
+	u64 revision = 1;
 	int i, ret = 0;
-	u32 id, count;
-	u64 revision;
 
 	if (attr->query.query_flags || attr->query.attach_flags)
 		return -EINVAL;
-	revision = bpf_mprog_revision(entry);
-	count = bpf_mprog_total(entry);
+	if (entry) {
+		revision = bpf_mprog_revision(entry);
+		count = bpf_mprog_total(entry);
+	}
 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
 		return -EFAULT;
 	if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 13f0b5dc8262..1338a13a8b64 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -123,7 +123,6 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
 {
 	bool ingress = attr->query.attach_type == BPF_TCX_INGRESS;
 	struct net *net = current->nsproxy->net_ns;
-	struct bpf_mprog_entry *entry;
 	struct net_device *dev;
 	int ret;
 
@@ -133,12 +132,7 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
 		ret = -ENODEV;
 		goto out;
 	}
-	entry = tcx_entry_fetch(dev, ingress);
-	if (!entry) {
-		ret = -ENOENT;
-		goto out;
-	}
-	ret = bpf_mprog_query(attr, uattr, entry);
+	ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress));
 out:
 	rtnl_unlock();
 	return ret;
-- 
cgit v1.2.3


From ba62d61128bda71fd02622f320ac59d861fc4baa Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@isovalent.com>
Date: Sat, 7 Oct 2023 00:06:51 +0200
Subject: bpf: Refuse unused attributes in bpf_prog_{attach,detach}

The recently added tcx attachment extended the BPF UAPI for attaching and
detaching by a couple of fields. Those fields are currently only supported
for tcx, other types like cgroups and flow dissector silently ignore the
new fields except for the new flags.

This is problematic once we extend bpf_mprog to older attachment types, since
it's hard to figure out whether the syscall really was successful if the
kernel silently ignores non-zero values.

Explicitly reject non-zero fields relevant to bpf_mprog for attachment types
which don't use the latter yet.

Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support")
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20231006220655.1653-3-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/syscall.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 453a43695a23..d77b2f8b9364 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3796,7 +3796,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
 	struct bpf_prog *prog;
-	u32 mask;
 	int ret;
 
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
@@ -3805,10 +3804,16 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	ptype = attach_type_to_prog_type(attr->attach_type);
 	if (ptype == BPF_PROG_TYPE_UNSPEC)
 		return -EINVAL;
-	mask = bpf_mprog_supported(ptype) ?
-	       BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE;
-	if (attr->attach_flags & ~mask)
-		return -EINVAL;
+	if (bpf_mprog_supported(ptype)) {
+		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
+			return -EINVAL;
+	} else {
+		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
+			return -EINVAL;
+		if (attr->relative_fd ||
+		    attr->expected_revision)
+			return -EINVAL;
+	}
 
 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
 	if (IS_ERR(prog))
@@ -3878,6 +3883,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 			if (IS_ERR(prog))
 				return PTR_ERR(prog);
 		}
+	} else if (attr->attach_flags ||
+		   attr->relative_fd ||
+		   attr->expected_revision) {
+		return -EINVAL;
 	}
 
 	switch (ptype) {
-- 
cgit v1.2.3


From ea41bb514fe286bf50498b3c6d7f7a5dc2b6c5e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 4 Oct 2023 11:33:36 +0200
Subject: sched/core: Update stale comment in try_to_wake_up()

The following commit:

  9b3c4ab3045e ("sched,rcu: Rework try_invoke_on_locked_down_task()")

... renamed try_invoke_on_locked_down_task() to task_call_func(),
but forgot to update the comment in try_to_wake_up().

But it turns out that the smp_rmb() doesn't live in task_call_func()
either, it was moved to __task_needs_rq_lock() in:

  91dabf33ae5d ("sched: Fix race in task_call_func()")

Fix that now.

Also fix the s/smb/smp typo while at it.

Reported-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230731085759.11443-1-zhangqiao22@huawei.com
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 65e10ac34660..f5783cb16791 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4237,7 +4237,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
 		 * __schedule().  See the comment for smp_mb__after_spinlock().
 		 *
-		 * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
+		 * A similar smp_rmb() lives in __task_needs_rq_lock().
 		 */
 		smp_rmb();
 		if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
-- 
cgit v1.2.3


From bc87127a45928de5fdf0ec39d7a86e1edd0e179e Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 20 Jul 2023 16:05:16 +0800
Subject: sched/debug: Print 'tgid' in sched_show_task()

Multiple blocked tasks are printed when the system hangs. They may have
the same parent pid, but belong to different task groups.

Printing tgid lets users better know whether these tasks are from the same
task group or not.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230720080516.1515297-1-yajun.deng@linux.dev
---
 kernel/sched/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5783cb16791..cf6d3fdd4eb5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9089,9 +9089,9 @@ void sched_show_task(struct task_struct *p)
 	if (pid_alive(p))
 		ppid = task_pid_nr(rcu_dereference(p->real_parent));
 	rcu_read_unlock();
-	pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
-		free, task_pid_nr(p), ppid,
-		read_task_thread_flags(p));
+	pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+		free, task_pid_nr(p), task_tgid_nr(p),
+		ppid, read_task_thread_flags(p));
 
 	print_worker_info(KERN_INFO, p);
 	print_stop_info(KERN_INFO, p);
-- 
cgit v1.2.3


From 569c8d82f95eb5993c84fb61a649a9c4ddd208b3 Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 20 Sep 2023 14:36:10 +0200
Subject: kernel: kexec: copy user-array safely

Currently, there is no overflow-check with memdup_user().

Use the new function memdup_array_user() instead of memdup_user() for
duplicating the user-space array safely.

Suggested-by: David Airlie <airlied@redhat.com>
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Zack Rusin <zackr@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230920123612.16914-4-pstanner@redhat.com
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 107f355eac10..8f35a5a42af8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -247,7 +247,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 		return -EINVAL;
 
-	ksegments = memdup_user(segments, nr_segments * sizeof(ksegments[0]));
+	ksegments = memdup_array_user(segments, nr_segments, sizeof(ksegments[0]));
 	if (IS_ERR(ksegments))
 		return PTR_ERR(ksegments);
 
-- 
cgit v1.2.3


From ca0776571d3163bd03b3e8c9e3da936abfaecbf6 Mon Sep 17 00:00:00 2001
From: Philipp Stanner <pstanner@redhat.com>
Date: Wed, 20 Sep 2023 14:36:11 +0200
Subject: kernel: watch_queue: copy user-array safely

Currently, there is no overflow-check with memdup_user().

Use the new function memdup_array_user() instead of memdup_user() for
duplicating the user-space array safely.

Suggested-by: David Airlie <airlied@redhat.com>
Signed-off-by: Philipp Stanner <pstanner@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Zack Rusin <zackr@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230920123612.16914-5-pstanner@redhat.com
---
 kernel/watch_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index d0b6b390ee42..778b4056700f 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -331,7 +331,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
 	    filter.__reserved != 0)
 		return -EINVAL;
 
-	tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
+	tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf));
 	if (IS_ERR(tf))
 		return PTR_ERR(tf);
 
-- 
cgit v1.2.3


From 8dafa9d0eb1a1550a0f4d462db9354161bc51e0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 6 Oct 2023 21:24:45 +0200
Subject: sched/eevdf: Fix min_deadline heap integrity

Marek and Biju reported instances of:

  "EEVDF scheduling fail, picking leftmost"

which Mike correlated with cgroup scheduling and the min_deadline heap
getting corrupted; some trace output confirms:

> And yeah, min_deadline is hosed somehow:
>
>    validate_cfs_rq: --- /
>    __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr)
>    __print_se:   ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31)
>    __print_se:   ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33)
>    validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256

Turns out that reweight_entity(), which tries really hard to be fast,
does not do the normal dequeue+update+enqueue pattern but *does* scale
the deadline.

However, it then fails to propagate the updated deadline value up the
heap.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reported-by: Biju Das <biju.das.jz@bp.renesas.com>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Biju Das <biju.das.jz@bp.renesas.com>
Tested-by: Mike Galbraith <efault@gmx.de>
Link: https://lkml.kernel.org/r/20231006192445.GE743@noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef7490c4b8b4..a4b904a010c6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3613,6 +3613,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		 */
 		deadline = div_s64(deadline * old_weight, weight);
 		se->deadline = se->vruntime + deadline;
+		min_deadline_cb_propagate(&se->run_node, NULL);
 	}
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From b01db23d5923a35023540edc4f0c5f019e11ac7d Mon Sep 17 00:00:00 2001
From: Benjamin Segall <bsegall@google.com>
Date: Fri, 29 Sep 2023 17:09:30 -0700
Subject: sched/eevdf: Fix pick_eevdf()

The old pick_eevdf() could fail to find the actual earliest eligible
deadline when it descended to the right looking for min_deadline, but
it turned out that that min_deadline wasn't actually eligible. In that
case we need to go back and search through any left branches we
skipped looking for the actual best _eligible_ min_deadline.

This is more expensive, but still O(log n), and at worst should only
involve descending two branches of the rbtree.

I've run this through a userspace stress test (thank you
tools/lib/rbtree.c), so hopefully this implementation doesn't miss any
corner cases.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/xm261qego72d.fsf_-_@google.com
---
 kernel/sched/fair.c | 72 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a4b904a010c6..061a30a8925a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,14 +872,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *
  * Which allows an EDF like search on (sub)trees.
  */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
+	struct sched_entity *best_left = NULL;
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
+	best = curr;
 
 	/*
 	 * Once selected, run a task until it either becomes non-eligible or
@@ -900,33 +902,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 		}
 
 		/*
-		 * If this entity has an earlier deadline than the previous
-		 * best, take this one. If it also has the earliest deadline
-		 * of its subtree, we're done.
+		 * Now we heap search eligible trees for the best (min_)deadline
 		 */
-		if (!best || deadline_gt(deadline, best, se)) {
+		if (!best || deadline_gt(deadline, best, se))
 			best = se;
-			if (best->deadline == best->min_deadline)
-				break;
-		}
 
 		/*
-		 * If the earlest deadline in this subtree is in the fully
-		 * eligible left half of our space, go there.
+		 * Every se in a left branch is eligible, keep track of the
+		 * branch with the best min_deadline
 		 */
+		if (node->rb_left) {
+			struct sched_entity *left = __node_2_se(node->rb_left);
+
+			if (!best_left || deadline_gt(min_deadline, best_left, left))
+				best_left = left;
+
+			/*
+			 * min_deadline is in the left branch. rb_left and all
+			 * descendants are eligible, so immediately switch to the second
+			 * loop.
+			 */
+			if (left->min_deadline == se->min_deadline)
+				break;
+		}
+
+		/* min_deadline is at this node, no need to look right */
+		if (se->deadline == se->min_deadline)
+			break;
+
+		/* else min_deadline is in the right branch. */
+		node = node->rb_right;
+	}
+
+	/*
+	 * We ran into an eligible node which is itself the best.
+	 * (Or nr_running == 0 and both are NULL)
+	 */
+	if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+		return best;
+
+	/*
+	 * Now best_left and all of its children are eligible, and we are just
+	 * looking for deadline == min_deadline
+	 */
+	node = &best_left->run_node;
+	while (node) {
+		struct sched_entity *se = __node_2_se(node);
+
+		/* min_deadline is the current node */
+		if (se->deadline == se->min_deadline)
+			return se;
+
+		/* min_deadline is in the left branch */
 		if (node->rb_left &&
 		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
 			node = node->rb_left;
 			continue;
 		}
 
+		/* else min_deadline is in the right branch */
 		node = node->rb_right;
 	}
+	return NULL;
+}
 
-	if (!best || (curr && deadline_gt(deadline, best, curr)))
-		best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *se = __pick_eevdf(cfs_rq);
 
-	if (unlikely(!best)) {
+	if (!se) {
 		struct sched_entity *left = __pick_first_entity(cfs_rq);
 		if (left) {
 			pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -934,7 +978,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 		}
 	}
 
-	return best;
+	return se;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
-- 
cgit v1.2.3


From 054c22bd784d6953ac85f545bed4a2a27b0e4ddb Mon Sep 17 00:00:00 2001
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 6 Oct 2023 10:21:50 +0200
Subject: printk: flush consoles before checking progress

Commit 9e70a5e109a4 ("printk: Add per-console suspended state")
removed console lock usage during resume and replaced it with
the clearly defined console_list_lock and srcu mechanisms.

However, the console lock usage had an important side-effect
of flushing the consoles. After its removal, consoles were no
longer flushed before checking their progress.

Add the console_lock/console_unlock dance to the beginning
of __pr_flush() to actually flush the consoles before checking
their progress. Also add comments to clarify this additional
usage of the console lock.

Note that console_unlock() does not guarantee flushing all messages
since the commit dbdda842fe96f89 ("printk: Add console owner and waiter
logic to load balance console writes").

Reported-by: Todd Brandt <todd.e.brandt@intel.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217955
Fixes: 9e70a5e109a4 ("printk: Add per-console suspended state")
Co-developed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20231006082151.6969-2-pmladek@suse.com
---
 kernel/printk/printk.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8787d3a72114..1919899b6897 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3738,12 +3738,18 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 
 	seq = prb_next_seq(prb);
 
+	/* Flush the consoles so that records up to @seq are printed. */
+	console_lock();
+	console_unlock();
+
 	for (;;) {
 		diff = 0;
 
 		/*
 		 * Hold the console_lock to guarantee safe access to
-		 * console->seq.
+		 * console->seq. Releasing console_lock flushes more
+		 * records in case @seq is still not printed on all
+		 * usable consoles.
 		 */
 		console_lock();
 
-- 
cgit v1.2.3


From 7ef7145a2b26b172ac6885c4cf3272a38bc0979a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 6 Oct 2023 12:25:16 +0200
Subject: sched/nohz: Update idle load-balancing (ILB) comments

 - Fix incorrect/misleading comments,

 - clarify some others,

 - fix typos & grammar,

 - and use more consistent style throughout.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Link: https://lore.kernel.org/r/20231006102518.2452758-2-mingo@kernel.org
---
 kernel/sched/fair.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52c498fd6c46..2b63a14cd05e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11503,14 +11503,15 @@ static inline int on_null_domain(struct rq *rq)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
  *   anywhere yet.
  */
-
 static inline int find_new_ilb(void)
 {
 	int ilb;
@@ -11531,8 +11532,10 @@ static inline int find_new_ilb(void)
 }
 
 /*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
  */
 static void kick_ilb(unsigned int flags)
 {
@@ -11560,7 +11563,7 @@ static void kick_ilb(unsigned int flags)
 
 	/*
 	 * This way we generate an IPI on the target CPU which
-	 * is idle. And the softirq performing nohz idle load balance
+	 * is idle, and the softirq performing NOHZ idle load balancing
 	 * will be run before returning from the IPI.
 	 */
 	smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -11589,7 +11592,7 @@ static void nohz_balancer_kick(struct rq *rq)
 
 	/*
 	 * None are in tickless mode and hence no need for NOHZ idle load
-	 * balancing.
+	 * balancing:
 	 */
 	if (likely(!atomic_read(&nohz.nr_cpus)))
 		return;
@@ -11611,9 +11614,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	sd = rcu_dereference(rq->sd);
 	if (sd) {
 		/*
-		 * If there's a CFS task and the current CPU has reduced
-		 * capacity; kick the ILB to see if there's a better CPU to run
-		 * on.
+		 * If there's a runnable CFS task and the current CPU has reduced
+		 * capacity, kick the ILB to see if there's a better CPU to run on:
 		 */
 		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
 			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
@@ -11665,11 +11667,11 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (sds) {
 		/*
 		 * If there is an imbalance between LLC domains (IOW we could
-		 * increase the overall cache use), we need some less-loaded LLC
-		 * domain to pull some load. Likewise, we may need to spread
+		 * increase the overall cache utilization), we need a less-loaded LLC
+		 * domain to pull some load from. Likewise, we may need to spread
 		 * load within the current LLC domain (e.g. packed SMT cores but
 		 * other CPUs are idle). We can't really know from here how busy
-		 * the others are - so just get a nohz balance going if it looks
+		 * the others are - so just get a NOHZ balance going if it looks
 		 * like this LLC domain has tasks we could move.
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
-- 
cgit v1.2.3


From b6dd6984832a2868f78879fce30d6965ae899d02 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 6 Oct 2023 12:25:17 +0200
Subject: sched/nohz: Use consistent variable names in find_new_ilb() and
 kick_ilb()

Use 'ilb_cpu' consistently in both functions.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Link: https://lore.kernel.org/r/20231006102518.2452758-3-mingo@kernel.org
---
 kernel/sched/fair.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2b63a14cd05e..f82b301740ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11514,18 +11514,18 @@ static inline int on_null_domain(struct rq *rq)
  */
 static inline int find_new_ilb(void)
 {
-	int ilb;
 	const struct cpumask *hk_mask;
+	int ilb_cpu;
 
 	hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
 
-	for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
+	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
 
-		if (ilb == smp_processor_id())
+		if (ilb_cpu == smp_processor_id())
 			continue;
 
-		if (idle_cpu(ilb))
-			return ilb;
+		if (idle_cpu(ilb_cpu))
+			return ilb_cpu;
 	}
 
 	return nr_cpu_ids;
-- 
cgit v1.2.3


From f4bb5705114530cd775a5a649b666755b3efe7aa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 6 Oct 2023 12:25:18 +0200
Subject: sched/nohz: Remove unnecessarily complex error handling pattern from
 find_new_ilb()

find_new_ilb() returns nr_cpu_ids on failure - which is the usual
cpumask bitops return pattern, but is weird & unnecessary in this
context: not only is it a global variable, it it is a +1 out of
bounds CPU index and also has different signedness ...

Its only user, kick_ilb(), then checks the return against nr_cpu_ids
to decide to return. There's no other use.

So instead of this, use a standard -1 return on failure to find an
idle CPU, as the argument is signed already.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Link: https://lore.kernel.org/r/20231006102518.2452758-4-mingo@kernel.org
---
 kernel/sched/fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f82b301740ec..19bb4ac94146 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11528,7 +11528,7 @@ static inline int find_new_ilb(void)
 			return ilb_cpu;
 	}
 
-	return nr_cpu_ids;
+	return -1;
 }
 
 /*
@@ -11549,8 +11549,7 @@ static void kick_ilb(unsigned int flags)
 		nohz.next_balance = jiffies+1;
 
 	ilb_cpu = find_new_ilb();
-
-	if (ilb_cpu >= nr_cpu_ids)
+	if (ilb_cpu < 0)
 		return;
 
 	/*
-- 
cgit v1.2.3


From 089768dfeb3ab294f9ab6a1f2462001f0f879fbb Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Sun, 8 Oct 2023 10:15:38 +0800
Subject: sched/rt: Change the type of 'sysctl_sched_rt_period' from 'unsigned
 int' to 'int'

Doing this matches the natural type of 'int' based calculus
in sched_rt_handler(), and also enables the adding in of a
correct upper bounds check on the sysctl interface.

[ mingo: Rewrote the changelog. ]

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231008021538.3063250-1-yajun.deng@linux.dev
---
 kernel/sched/rt.c    | 6 +++---
 kernel/sched/sched.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 88fc98601413..76d82a096e03 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth;
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
  */
-unsigned int sysctl_sched_rt_period = 1000000;
+int sysctl_sched_rt_period = 1000000;
 
 /*
  * part of the period that we allow rt tasks to run in us.
@@ -34,7 +34,7 @@ static struct ctl_table sched_rt_sysctls[] = {
 	{
 		.procname       = "sched_rt_period_us",
 		.data           = &sysctl_sched_rt_period,
-		.maxlen         = sizeof(unsigned int),
+		.maxlen         = sizeof(int),
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
 		.extra1         = SYSCTL_ONE,
@@ -47,7 +47,7 @@ static struct ctl_table sched_rt_sysctls[] = {
 		.mode           = 0644,
 		.proc_handler   = sched_rt_handler,
 		.extra1         = SYSCTL_NEG_ONE,
-		.extra2         = SYSCTL_INT_MAX,
+		.extra2         = (void *)&sysctl_sched_rt_period,
 	},
 	{
 		.procname       = "sched_rr_timeslice_ms",
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 649eb9ec0657..515eb4cffd5e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -105,7 +105,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
 extern void call_trace_sched_update_nr_running(struct rq *rq, int count);
 
-extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 extern int sched_rr_timeslice;
 
-- 
cgit v1.2.3


From 7bc263840bc3377186cb06b003ac287bb2f18ce2 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 9 Oct 2023 12:36:16 +0200
Subject: sched/topology: Consolidate and clean up access to a CPU's max
 compute capacity

Remove the rq::cpu_capacity_orig field and use arch_scale_cpu_capacity()
instead.

The scheduler uses 3 methods to get access to a CPU's max compute capacity:

 - arch_scale_cpu_capacity(cpu) which is the default way to get a CPU's capacity.

 - cpu_capacity_orig field which is periodically updated with
   arch_scale_cpu_capacity().

 - capacity_orig_of(cpu) which encapsulates rq->cpu_capacity_orig.

There is no real need to save the value returned by arch_scale_cpu_capacity()
in struct rq. arch_scale_cpu_capacity() returns:

 - either a per_cpu variable.

 - or a const value for systems which have only one capacity.

Remove rq::cpu_capacity_orig and use arch_scale_cpu_capacity() everywhere.

No functional changes.

Some performance tests on Arm64:

  - small SMP device (hikey): no noticeable changes
  - HMP device (RB5):         hackbench shows minor improvement (1-2%)
  - large smp (thx2):         hackbench and tbench shows minor improvement (1%)

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20231009103621.374412-2-vincent.guittot@linaro.org
---
 Documentation/scheduler/sched-capacity.rst | 13 +++++++------
 kernel/sched/core.c                        |  2 +-
 kernel/sched/cpudeadline.c                 |  2 +-
 kernel/sched/deadline.c                    |  4 ++--
 kernel/sched/fair.c                        | 18 ++++++++----------
 kernel/sched/rt.c                          |  2 +-
 kernel/sched/sched.h                       |  6 ------
 kernel/sched/topology.c                    |  7 +++++--
 8 files changed, 25 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/scheduler/sched-capacity.rst b/Documentation/scheduler/sched-capacity.rst
index e2c1cf743158..de414b33dd2a 100644
--- a/Documentation/scheduler/sched-capacity.rst
+++ b/Documentation/scheduler/sched-capacity.rst
@@ -39,14 +39,15 @@ per Hz, leading to::
 -------------------
 
 Two different capacity values are used within the scheduler. A CPU's
-``capacity_orig`` is its maximum attainable capacity, i.e. its maximum
-attainable performance level. A CPU's ``capacity`` is its ``capacity_orig`` to
-which some loss of available performance (e.g. time spent handling IRQs) is
-subtracted.
+``original capacity`` is its maximum attainable capacity, i.e. its maximum
+attainable performance level. This original capacity is returned by
+the function arch_scale_cpu_capacity(). A CPU's ``capacity`` is its ``original
+capacity`` to which some loss of available performance (e.g. time spent
+handling IRQs) is subtracted.
 
 Note that a CPU's ``capacity`` is solely intended to be used by the CFS class,
-while ``capacity_orig`` is class-agnostic. The rest of this document will use
-the term ``capacity`` interchangeably with ``capacity_orig`` for the sake of
+while ``original capacity`` is class-agnostic. The rest of this document will use
+the term ``capacity`` interchangeably with ``original capacity`` for the sake of
 brevity.
 
 1.3 Platform examples
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cf6d3fdd4eb5..a3f9cd52eec5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9929,7 +9929,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
-		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
 		rq->balance_callback = &balance_push_callback;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 57c92d751bcd..95baa12a1029 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 			if (!dl_task_fits_capacity(p, cpu)) {
 				cpumask_clear_cpu(cpu, later_mask);
 
-				cap = capacity_orig_of(cpu);
+				cap = arch_scale_cpu_capacity(cpu);
 
 				if (cap > max_cap ||
 				    (cpu == task_cpu(p) && cap == max_cap)) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d98408a274e5..7039a8d5ae9b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
 	int i;
 
 	for_each_cpu_and(i, mask, cpu_active_mask)
-		cap += capacity_orig_of(i);
+		cap += arch_scale_cpu_capacity(i);
 
 	return cap;
 }
@@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask)
 static inline unsigned long dl_bw_capacity(int i)
 {
 	if (!sched_asym_cpucap_active() &&
-	    capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+	    arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) {
 		return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
 	} else {
 		RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 19bb4ac94146..e7c1bafc0460 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4669,7 +4669,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * To avoid overestimation of actual task utilization, skip updates if
 	 * we cannot grant there is idle time in this CPU.
 	 */
-	if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 
 	/*
@@ -4717,14 +4717,14 @@ static inline int util_fits_cpu(unsigned long util,
 		return fits;
 
 	/*
-	 * We must use capacity_orig_of() for comparing against uclamp_min and
+	 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
 	 * uclamp_max. We only care about capacity pressure (by using
 	 * capacity_of()) for comparing against the real util.
 	 *
 	 * If a task is boosted to 1024 for example, we don't want a tiny
 	 * pressure to skew the check whether it fits a CPU or not.
 	 *
-	 * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+	 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
 	 * should fit a little cpu even if there's some pressure.
 	 *
 	 * Only exception is for thermal pressure since it has a direct impact
@@ -4736,7 +4736,7 @@ static inline int util_fits_cpu(unsigned long util,
 	 * For uclamp_max, we can tolerate a drop in performance level as the
 	 * goal is to cap the task. So it's okay if it's getting less.
 	 */
-	capacity_orig = capacity_orig_of(cpu);
+	capacity_orig = arch_scale_cpu_capacity(cpu);
 	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
 	/*
@@ -7217,7 +7217,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		 * Look for the CPU with best capacity.
 		 */
 		else if (fits < 0)
-			cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+			cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
 
 		/*
 		 * First, select CPU which fits better (-1 being better than 0).
@@ -7459,7 +7459,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 		util = max(util, util_est);
 	}
 
-	return min(util, capacity_orig_of(cpu));
+	return min(util, arch_scale_cpu_capacity(cpu));
 }
 
 unsigned long cpu_util_cfs(int cpu)
@@ -9250,8 +9250,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	unsigned long capacity = scale_rt_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
 
-	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
 	if (!capacity)
 		capacity = 1;
 
@@ -9327,7 +9325,7 @@ static inline int
 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 {
 	return ((rq->cpu_capacity * sd->imbalance_pct) <
-				(rq->cpu_capacity_orig * 100));
+				(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
 }
 
 /*
@@ -9338,7 +9336,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 {
 	return rq->misfit_task_load &&
-		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+		(arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
 		 check_cpu_capacity(rq, sd));
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 76d82a096e03..e93b69ef919b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -471,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
 	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
 	max_cap = uclamp_eff_value(p, UCLAMP_MAX);
 
-	cpu_cap = capacity_orig_of(cpu);
+	cpu_cap = arch_scale_cpu_capacity(cpu);
 
 	return cpu_cap >= min(min_cap, max_cap);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 515eb4cffd5e..7e7fedcfc580 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1033,7 +1033,6 @@ struct rq {
 	struct sched_domain __rcu	*sd;
 
 	unsigned long		cpu_capacity;
-	unsigned long		cpu_capacity_orig;
 
 	struct balance_callback *balance_callback;
 
@@ -2967,11 +2966,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif
 
 #ifdef CONFIG_SMP
-static inline unsigned long capacity_orig_of(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 /**
  * enum cpu_util_type - CPU utilization type
  * @FREQUENCY_UTIL:	Utilization used to select frequency
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a7b50bba7829..1cc595907363 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2488,12 +2488,15 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		unsigned long capacity;
+
 		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 
+		capacity = arch_scale_cpu_capacity(i);
 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
-		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
-			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+		if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
 
 		cpu_attach_domain(sd, d.rd, i);
 	}
-- 
cgit v1.2.3


From 5b77261c5510f1e6f4d359e97dd3e39ee7259c3d Mon Sep 17 00:00:00 2001
From: Pierre Gondois <Pierre.Gondois@arm.com>
Date: Mon, 9 Oct 2023 11:30:36 +0530
Subject: sched/topology: Remove the EM_MAX_COMPLEXITY limit

The Energy Aware Scheduler (EAS) estimates the energy consumption
of placing a task on different CPUs. The goal is to minimize this
energy consumption. Estimating the energy of different task placements
is increasingly complex with the size of the platform.

To avoid having a slow wake-up path, EAS is only enabled if this
complexity is low enough.

The current complexity limit was set in:

  b68a4c0dba3b1 ("sched/topology: Disable EAS on inappropriate platforms")

... based on the first implementation of EAS, which was re-computing
the power of the whole platform for each task placement scenario, see:

  390031e4c309 ("sched/fair: Introduce an energy estimation helper function")

... but the complexity of EAS was reduced in:

  eb92692b2544d ("sched/fair: Speed-up energy-aware wake-ups")

... and find_energy_efficient_cpu() (feec) algorithm was updated in:

  3e8c6c9aac42 ("sched/fair: Remove task_util from effective utilization in feec()")

find_energy_efficient_cpu() (feec) is now doing:

	feec()
	\_ for_each_pd(pd) [0]
	  // get max_spare_cap_cpu and compute_prev_delta
	  \_ for_each_cpu(pd) [1]

	  \_ eenv_pd_busy_time(pd) [2]
		\_ for_each_cpu(pd)

	  // compute_energy(pd) without the task
	  \_ eenv_pd_max_util(pd, -1) [3.0]
	    \_ for_each_cpu(pd)
	  \_ em_cpu_energy(pd, -1)
	    \_ for_each_ps(pd)

	  // compute_energy(pd) with the task on prev_cpu
	  \_ eenv_pd_max_util(pd, prev_cpu) [3.1]
	    \_ for_each_cpu(pd)
	  \_ em_cpu_energy(pd, prev_cpu)
	    \_ for_each_ps(pd)

	  // compute_energy(pd) with the task on max_spare_cap_cpu
	  \_ eenv_pd_max_util(pd, max_spare_cap_cpu) [3.2]
	    \_ for_each_cpu(pd)
	  \_ em_cpu_energy(pd, max_spare_cap_cpu)
	    \_ for_each_ps(pd)

	[3.1] happens only once since prev_cpu is unique. With the same
	      definitions for nr_pd, nr_cpus and nr_ps, the complexity is of:

		nr_pd * (2 * [nr_cpus in pd] + 2 * ([nr_cpus in pd] + [nr_ps in pd]))
		+ ([nr_cpus in pd] + [nr_ps in pd])

		 [0]  * (     [1] + [2]      +       [3.0] + [3.2]                  )
		+ [3.1]

		= nr_pd * (4 * [nr_cpus in pd] + 2 * [nr_ps in pd])
		+ [nr_cpus in prev pd] + nr_ps

The complexity limit was set to 2048 in:

  b68a4c0dba3b1 ("sched/topology: Disable EAS on inappropriate platforms")

... to make "EAS usable up to 16 CPUs with per-CPU DVFS and less than 8
performance states each". For the same platform, the complexity would
actually be of:

  16 * (4 + 2 * 7) + 1 + 7 = 296

Since the EAS complexity was greatly reduced since the limit was
introduced, bigger platforms can handle EAS.

For instance, a platform with 112 CPUs with 7 performance states
each would not reach it:

  112 * (4 + 2 * 7) + 1 + 7 = 2024

To reflect this improvement in the underlying EAS code, remove
the EAS complexity check.

Note that a limit on the number of CPUs still holds against
EM_MAX_NUM_CPUS to avoid overflows during the energy estimation.

[ mingo: Updates to the changelog. ]

Signed-off-by: Pierre Gondois <Pierre.Gondois@arm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20231009060037.170765-2-sshegde@linux.vnet.ibm.com
---
 Documentation/scheduler/sched-energy.rst | 29 +++---------------------
 kernel/sched/topology.c                  | 39 +++-----------------------------
 2 files changed, 6 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/scheduler/sched-energy.rst b/Documentation/scheduler/sched-energy.rst
index fc853c8cc346..70e2921ef725 100644
--- a/Documentation/scheduler/sched-energy.rst
+++ b/Documentation/scheduler/sched-energy.rst
@@ -359,32 +359,9 @@ in milli-Watts or in an 'abstract scale'.
 6.3 - Energy Model complexity
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The task wake-up path is very latency-sensitive. When the EM of a platform is
-too complex (too many CPUs, too many performance domains, too many performance
-states, ...), the cost of using it in the wake-up path can become prohibitive.
-The energy-aware wake-up algorithm has a complexity of:
-
-	C = Nd * (Nc + Ns)
-
-with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
-total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
-
-A complexity check is performed at the root domain level, when scheduling
-domains are built. EAS will not start on a root domain if its C happens to be
-higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
-time of writing).
-
-If you really want to use EAS but the complexity of your platform's Energy
-Model is too high to be used with a single root domain, you're left with only
-two possible options:
-
-    1. split your system into separate, smaller, root domains using exclusive
-       cpusets and enable EAS locally on each of them. This option has the
-       benefit to work out of the box but the drawback of preventing load
-       balance between root domains, which can result in an unbalanced system
-       overall;
-    2. submit patches to reduce the complexity of the EAS wake-up algorithm,
-       hence enabling it to cope with larger EMs in reasonable time.
+EAS does not impose any complexity limit on the number of PDs/OPPs/CPUs but
+restricts the number of CPUs to EM_MAX_NUM_CPUS to prevent overflows during
+the energy estimation.
 
 
 6.4 - Schedutil governor
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1cc595907363..fcda3f066eec 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -348,32 +348,13 @@ static void sched_energy_set(bool has_eas)
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
  *    3. no SMT is detected.
- *    4. the EM complexity is low enough to keep scheduling overheads low;
- *    5. schedutil is driving the frequency of all CPUs of the rd;
- *    6. frequency invariance support is present;
- *
- * The complexity of the Energy Model is defined as:
- *
- *              C = nr_pd * (nr_cpus + nr_ps)
- *
- * with parameters defined as:
- *  - nr_pd:    the number of performance domains
- *  - nr_cpus:  the number of CPUs
- *  - nr_ps:    the sum of the number of performance states of all performance
- *              domains (for example, on a system with 2 performance domains,
- *              with 10 performance states each, nr_ps = 2 * 10 = 20).
- *
- * It is generally not a good idea to use such a model in the wake-up path on
- * very complex platforms because of the associated scheduling overheads. The
- * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 performance states each, for example.
+ *    4. schedutil is driving the frequency of all CPUs of the rd;
+ *    5. frequency invariance support is present;
  */
-#define EM_MAX_COMPLEXITY 2048
-
 extern struct cpufreq_governor schedutil_gov;
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i;
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
@@ -431,20 +412,6 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
 			goto free;
 		tmp->next = pd;
 		pd = tmp;
-
-		/*
-		 * Count performance domains and performance states for the
-		 * complexity check.
-		 */
-		nr_pd++;
-		nr_ps += em_pd_nr_perf_states(pd->em_pd);
-	}
-
-	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
-		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
-						cpumask_pr_args(cpu_map));
-		goto free;
 	}
 
 	perf_domain_debug(cpu_map, pd);
-- 
cgit v1.2.3


From e03dc9fa0663bc303383170e961561462ff00c93 Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Mon, 9 Oct 2023 20:24:28 +0800
Subject: sched/psi: Change update_triggers() to a 'void' function

Update_triggers() always returns now + group->rtpoll_min_period, and the
return value is only used by psi_rtpoll_work(), so change update_triggers()
to a void function, let group->rtpoll_next_update = now +
group->rtpoll_min_period directly.

This will avoid unnecessary function return value passing & simplifies
the function.

[ mingo: Updated changelog ]

Suggested-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/202310092024289721617@zte.com.cn
---
 kernel/sched/psi.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1d0f634725a6..be853f227e40 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -434,7 +434,7 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
 	return growth;
 }
 
-static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+static void update_triggers(struct psi_group *group, u64 now, bool *update_total,
 						   enum psi_aggregators aggregator)
 {
 	struct psi_trigger *t;
@@ -503,8 +503,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
 		/* Reset threshold breach flag once event got generated */
 		t->pending_event = false;
 	}
-
-	return now + group->rtpoll_min_period;
 }
 
 static u64 update_averages(struct psi_group *group, u64 now)
@@ -706,7 +704,8 @@ static void psi_rtpoll_work(struct psi_group *group)
 	}
 
 	if (now >= group->rtpoll_next_update) {
-		group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
+		update_triggers(group, now, &update_total, PSI_POLL);
+		group->rtpoll_next_update = now + group->rtpoll_min_period;
 		if (update_total)
 			memcpy(group->rtpoll_total, group->total[PSI_POLL],
 				   sizeof(group->rtpoll_total));
-- 
cgit v1.2.3


From 8ceea12d183cf29f28072dede218a04eda2a789c Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Fri, 15 Sep 2023 08:22:38 -0700
Subject: alarmtimer: Use maximum alarm time for suspend

Some userspace applications use timerfd_create() to request wakeups after
a long period of time. For example, a backup application may request a
wakeup once per week. This is perfectly fine as long as the system does
not try to suspend. However, if the system tries to suspend and the
system's RTC does not support the required alarm timeout, the suspend
operation will fail with an error such as

rtc_cmos 00:01: Alarms can be up to one day in the future
PM: dpm_run_callback(): platform_pm_suspend+0x0/0x4a returns -22
alarmtimer alarmtimer.4.auto: platform_pm_suspend+0x0/0x4a returned -22 after 117 usecs
PM: Device alarmtimer.4.auto failed to suspend: error -22

This results in a refusal to suspend the system, causing substantial
battery drain on affected systems.

To fix the problem, use the maximum alarm time offset as reported by RTC
drivers to set the maximum alarm time. While this may result in early
wakeups from suspend, it is still much better than not suspending at all.

Standardize system behavior if the requested alarm timeout is larger than
the alarm timeout supported by the rtc chip. Currently, in this situation,
the RTC driver will do one of the following:

  - It may return an error.
  - It may limit the alarm timeout to the maximum supported by the rtc chip.
  - It may mask the timeout by the maximum alarm timeout supported by the RTC
    chip (i.e. a requested timeout of 1 day + 1 minute may result in a 1
    minute timeout).

With this in place, if the RTC driver reports the maximum alarm timeout
supported by the RTC chip, the system will always limit the alarm timeout
to the maximum supported by the RTC chip.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/r/20230915152238.1144706-3-linux@roeck-us.net
---
 kernel/time/alarmtimer.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8d9f13d847f0..4657cb8e8b1f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -290,6 +290,17 @@ static int alarmtimer_suspend(struct device *dev)
 	rtc_timer_cancel(rtc, &rtctimer);
 	rtc_read_time(rtc, &tm);
 	now = rtc_tm_to_ktime(tm);
+
+	/*
+	 * If the RTC alarm timer only supports a limited time offset, set the
+	 * alarm time to the maximum supported value.
+	 * The system may wake up earlier (possibly much earlier) than expected
+	 * when the alarmtimer runs. This is the best the kernel can do if
+	 * the alarmtimer exceeds the time that the rtc device can be programmed
+	 * for.
+	 */
+	min = rtc_bound_alarmtime(rtc, min);
+
 	now = ktime_add(now, min);
 
 	/* Set alarm, if in the past reject suspend briefly to handle */
-- 
cgit v1.2.3


From 1765bb61bb18a7b81f68806de6e8b8f5000f65bf Mon Sep 17 00:00:00 2001
From: Tero Kristo <tero.kristo@linux.intel.com>
Date: Wed, 13 Sep 2023 15:59:56 +0300
Subject: perf/core: Allow reading package events from perf_event_read_local

Per-package perf events are typically registered with a single CPU only,
however they can be read across all the CPUs within the package.
Currently perf_event_read maps the event CPU according to the topology
information to avoid an unnecessary SMP call, however
perf_event_read_local deals with hard values and rejects a read with a
failure if the CPU is not the one exactly registered. Allow similar
mapping within the perf_event_read_local if the perf event in question
can support this.

This allows users like BPF code to read the package perf events properly
across different CPUs within a package.

Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230913125956.3652667-1-tero.kristo@linux.intel.com
---
 kernel/events/core.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index af569196d760..708d474c2ede 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4425,6 +4425,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
 {
 	u16 local_pkg, event_pkg;
 
+	if ((unsigned)event_cpu >= nr_cpu_ids)
+		return event_cpu;
+
 	if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
 		int local_cpu = smp_processor_id();
 
@@ -4527,6 +4530,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 			  u64 *enabled, u64 *running)
 {
 	unsigned long flags;
+	int event_oncpu;
+	int event_cpu;
 	int ret = 0;
 
 	/*
@@ -4551,15 +4556,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 		goto out;
 	}
 
+	/*
+	 * Get the event CPU numbers, and adjust them to local if the event is
+	 * a per-package event that can be read locally
+	 */
+	event_oncpu = __perf_event_read_cpu(event, event->oncpu);
+	event_cpu = __perf_event_read_cpu(event, event->cpu);
+
 	/* If this is a per-CPU event, it must be for this CPU */
 	if (!(event->attach_state & PERF_ATTACH_TASK) &&
-	    event->cpu != smp_processor_id()) {
+	    event_cpu != smp_processor_id()) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	/* If this is a pinned event it must be running on this CPU */
-	if (event->attr.pinned && event->oncpu != smp_processor_id()) {
+	if (event->attr.pinned && event_oncpu != smp_processor_id()) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -4569,7 +4581,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id())
+	if (event_oncpu == smp_processor_id())
 		event->pmu->read(event);
 
 	*value = local64_read(&event->count);
-- 
cgit v1.2.3


From d6247ecb6c1e17d7a33317090627f5bfe563cbb2 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 4 Oct 2023 11:23:38 -0500
Subject: bpf: Add ability to pin bpf timer to calling CPU

BPF supports creating high resolution timers using bpf_timer_* helper
functions. Currently, only the BPF_F_TIMER_ABS flag is supported, which
specifies that the timeout should be interpreted as absolute time. It
would also be useful to be able to pin that timer to a core. For
example, if you wanted to make a subset of cores run without timer
interrupts, and only have the timer be invoked on a single core.

This patch adds support for this with a new BPF_F_TIMER_CPU_PIN flag.
When specified, the HRTIMER_MODE_PINNED flag is passed to
hrtimer_start(). A subsequent patch will update selftests to validate.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20231004162339.200702-2-void@manifault.com
---
 include/uapi/linux/bpf.h       | 4 ++++
 kernel/bpf/helpers.c           | 5 ++++-
 tools/include/uapi/linux/bpf.h | 4 ++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70bfa997e896..a7d4a1a69f21 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5096,6 +5096,8 @@ union bpf_attr {
  *		**BPF_F_TIMER_ABS**
  *			Start the timer in absolute expire value instead of the
  *			default relative one.
+ *		**BPF_F_TIMER_CPU_PIN**
+ *			Timer will be pinned to the CPU of the caller.
  *
  *	Return
  *		0 on success.
@@ -7309,9 +7311,11 @@ struct bpf_core_relo {
  * Flags to control bpf_timer_start() behaviour.
  *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
  *       relative to current time.
+ *     - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller.
  */
 enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
+	BPF_F_TIMER_CPU_PIN = (1ULL << 1),
 };
 
 /* BPF numbers iterator state */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index dd1c69ee3375..d2840dd5b00d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1272,7 +1272,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	if (flags > BPF_F_TIMER_ABS)
+	if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN))
 		return -EINVAL;
 	__bpf_spin_lock_irqsave(&timer->lock);
 	t = timer->timer;
@@ -1286,6 +1286,9 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 	else
 		mode = HRTIMER_MODE_REL_SOFT;
 
+	if (flags & BPF_F_TIMER_CPU_PIN)
+		mode |= HRTIMER_MODE_PINNED;
+
 	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 70bfa997e896..a7d4a1a69f21 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5096,6 +5096,8 @@ union bpf_attr {
  *		**BPF_F_TIMER_ABS**
  *			Start the timer in absolute expire value instead of the
  *			default relative one.
+ *		**BPF_F_TIMER_CPU_PIN**
+ *			Timer will be pinned to the CPU of the caller.
  *
  *	Return
  *		0 on success.
@@ -7309,9 +7311,11 @@ struct bpf_core_relo {
  * Flags to control bpf_timer_start() behaviour.
  *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
  *       relative to current time.
+ *     - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller.
  */
 enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
+	BPF_F_TIMER_CPU_PIN = (1ULL << 1),
 };
 
 /* BPF numbers iterator state */
-- 
cgit v1.2.3


From 8f833c82cdab7b4049bcfe88311d35fa5f24e422 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Date: Mon, 9 Oct 2023 11:30:37 +0530
Subject: sched/topology: Change behaviour of the 'sched_energy_aware' sysctl,
 based on the platform

The 'sched_energy_aware' sysctl is available for the admin to disable/enable
energy aware scheduling(EAS). EAS is enabled only if few conditions are
met by the platform. They are, asymmetric CPU capacity, no SMT,
schedutil CPUfreq governor, frequency invariant load tracking etc.
A platform may boot without EAS capability, but could gain such
capability at runtime. For example, changing/registering the cpufreq
governor to schedutil.

At present, though platform doesn't support EAS, this sysctl returns 1
and it ends up calling build_perf_domains on write to 1 and
NOP when writing to 0. That is confusing and un-necessary.

Desired behavior would be to have this sysctl to enable/disable the EAS
on supported platform. On non-supported platform write to the sysctl
would return not supported error and read of the sysctl would return
empty. So sched_energy_aware returns empty - EAS is not possible at this moment
This will include EAS capable platforms which have at least one EAS
condition false during startup, e.g. not using the schedutil cpufreq governor
sched_energy_aware returns 0 - EAS is supported but disabled by admin.
sched_energy_aware returns 1 - EAS is supported and enabled.

User can find out the reason why EAS is not possible by checking
info messages. sched_is_eas_possible returns true if the platform
can do EAS at this moment.

Signed-off-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lore.kernel.org/r/20231009060037.170765-3-sshegde@linux.vnet.ibm.com
---
 Documentation/admin-guide/sysctl/kernel.rst |   3 +-
 kernel/sched/topology.c                     | 112 ++++++++++++++++++----------
 2 files changed, 76 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index cf33de56da27..d89ac2bd8dc4 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1182,7 +1182,8 @@ automatically on platforms where it can run (that is,
 platforms with asymmetric CPU topologies and having an Energy
 Model available). If your platform happens to meet the
 requirements for EAS but you do not want to use it, change
-this value to 0.
+this value to 0. On Non-EAS platforms, write operation fails and
+read doesn't return anything.
 
 task_delayacct
 ===============
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index fcda3f066eec..4cbbdacafe9e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,6 +212,70 @@ static unsigned int sysctl_sched_energy_aware = 1;
 static DEFINE_MUTEX(sched_energy_mutex);
 static bool sched_energy_update;
 
+extern struct cpufreq_governor schedutil_gov;
+static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
+{
+	bool any_asym_capacity = false;
+	struct cpufreq_policy *policy;
+	struct cpufreq_governor *gov;
+	int i;
+
+	/* EAS is enabled for asymmetric CPU capacity topologies. */
+	for_each_cpu(i, cpu_mask) {
+		if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) {
+			any_asym_capacity = true;
+			break;
+		}
+	}
+	if (!any_asym_capacity) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS, SMT is not supported\n",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	if (!arch_scale_freq_invariant()) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported",
+				cpumask_pr_args(cpu_mask));
+		}
+		return false;
+	}
+
+	/* Do not attempt EAS if schedutil is not being used. */
+	for_each_cpu(i, cpu_mask) {
+		policy = cpufreq_cpu_get(i);
+		if (!policy) {
+			if (sched_debug()) {
+				pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
+					cpumask_pr_args(cpu_mask), i);
+			}
+			return false;
+		}
+		gov = policy->governor;
+		cpufreq_cpu_put(policy);
+		if (gov != &schedutil_gov) {
+			if (sched_debug()) {
+				pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
+					cpumask_pr_args(cpu_mask));
+			}
+			return false;
+		}
+	}
+
+	return true;
+}
+
 void rebuild_sched_domains_energy(void)
 {
 	mutex_lock(&sched_energy_mutex);
@@ -230,6 +294,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write,
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (!sched_is_eas_possible(cpu_active_mask)) {
+		if (write) {
+			return -EOPNOTSUPP;
+		} else {
+			*lenp = 0;
+			return 0;
+		}
+	}
+
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		state = static_branch_unlikely(&sched_energy_present);
@@ -351,61 +424,24 @@ static void sched_energy_set(bool has_eas)
  *    4. schedutil is driving the frequency of all CPUs of the rd;
  *    5. frequency invariance support is present;
  */
-extern struct cpufreq_governor schedutil_gov;
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
 	int i;
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
-	struct cpufreq_policy *policy;
-	struct cpufreq_governor *gov;
 
 	if (!sysctl_sched_energy_aware)
 		goto free;
 
-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
-		if (sched_debug()) {
-			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
-					cpumask_pr_args(cpu_map));
-		}
-		goto free;
-	}
-
-	/* EAS definitely does *not* handle SMT */
-	if (sched_smt_active()) {
-		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
-			cpumask_pr_args(cpu_map));
-		goto free;
-	}
-
-	if (!arch_scale_freq_invariant()) {
-		if (sched_debug()) {
-			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
-				cpumask_pr_args(cpu_map));
-		}
+	if (!sched_is_eas_possible(cpu_map))
 		goto free;
-	}
 
 	for_each_cpu(i, cpu_map) {
 		/* Skip already covered CPUs. */
 		if (find_pd(pd, i))
 			continue;
 
-		/* Do not attempt EAS if schedutil is not being used. */
-		policy = cpufreq_cpu_get(i);
-		if (!policy)
-			goto free;
-		gov = policy->governor;
-		cpufreq_cpu_put(policy);
-		if (gov != &schedutil_gov) {
-			if (rd->pd)
-				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
-						cpumask_pr_args(cpu_map));
-			goto free;
-		}
-
 		/* Create the new pd and add it to the local list. */
 		tmp = pd_init(i);
 		if (!tmp)
-- 
cgit v1.2.3


From f2273f4e19e29f7d0be6a2393f18369cd1b496c8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 9 Oct 2023 17:31:26 +0200
Subject: sched/topology: Move the declaration of 'schedutil_gov' to
 kernel/sched/sched.h

Move it out of the .c file into the shared scheduler-internal header file,
to gain type-checking.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Cc: Valentin Schneider <vschneid@redhat.com>
Link: https://lore.kernel.org/r/20231009060037.170765-3-sshegde@linux.vnet.ibm.com
---
 kernel/sched/sched.h    | 2 ++
 kernel/sched/topology.c | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7e7fedcfc580..faf9031422e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3203,6 +3203,8 @@ static inline bool sched_energy_enabled(void)
 	return static_branch_unlikely(&sched_energy_present);
 }
 
+extern struct cpufreq_governor schedutil_gov;
+
 #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
 #define perf_domain_span(pd) NULL
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4cbbdacafe9e..d9508617f7f8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -212,7 +212,6 @@ static unsigned int sysctl_sched_energy_aware = 1;
 static DEFINE_MUTEX(sched_energy_mutex);
 static bool sched_energy_update;
 
-extern struct cpufreq_governor schedutil_gov;
 static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
 {
 	bool any_asym_capacity = false;
-- 
cgit v1.2.3


From f843249cb6a151bfe7b955dfb93ff29663c258fb Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Thu, 28 Sep 2023 12:43:34 +0200
Subject: tracing/eprobe: drop unneeded breaks

Drop break after return.

Link: https://lore.kernel.org/all/20230928104334.41215-1-Julia.Lawall@inria.fr/

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_eprobe.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 72714cbf475c..03c851f57969 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -788,12 +788,9 @@ find_and_get_event(const char *system, const char *event_name)
 		name = trace_event_name(tp_event);
 		if (!name || strcmp(event_name, name))
 			continue;
-		if (!trace_event_try_get_ref(tp_event)) {
+		if (!trace_event_try_get_ref(tp_event))
 			return NULL;
-			break;
-		}
 		return tp_event;
-		break;
 	}
 	return NULL;
 }
-- 
cgit v1.2.3


From 1ca0b605150501b7dc59f3016271da4eb3e96fce Mon Sep 17 00:00:00 2001
From: Michal Koutný <mkoutny@suse.com>
Date: Mon, 9 Oct 2023 15:58:11 +0200
Subject: cgroup: Remove duplicates in cgroup v1 tasks file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One PID may appear multiple times in a preloaded pidlist.
(Possibly due to PID recycling but we have reports of the same
task_struct appearing with different PIDs, thus possibly involving
transfer of PID via de_thread().)

Because v1 seq_file iterator uses PIDs as position, it leads to
a message:
> seq_file: buggy .next function kernfs_seq_next did not update position index

Conservative and quick fix consists of removing duplicates from `tasks`
file (as opposed to removing pidlists altogether). It doesn't affect
correctness (it's sufficient to show a PID once), performance impact
would be hidden by unconditional sorting of the pidlist already in place
(asymptotically).

Link: https://lore.kernel.org/r/20230823174804.23632-1-mkoutny@suse.com/
Suggested-by: Firo Yang <firo.yang@suse.com>
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/cgroup/cgroup-v1.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index c487ffef6652..76db6c67e39a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -360,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	}
 	css_task_iter_end(&it);
 	length = n;
-	/* now sort & (if procs) strip out duplicates */
+	/* now sort & strip out duplicates (tgids or recycled thread PIDs) */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
-	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(array, length);
+	length = pidlist_uniq(array, length);
 
 	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
-- 
cgit v1.2.3


From 27a6c5c50c4bb0c56296f01a3142db796bb01da1 Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Date: Fri, 6 Oct 2023 17:20:32 +0530
Subject: cgroup: use legacy_name for cgroup v1 disable info

cgroup v1 or v2 or both controller names can be passed as arguments to
the 'cgroup_no_v1' kernel parameter, though most of the controller's
names are the same for both cgroup versions. This can be confusing when
both versions are used interchangeably, i.e., passing cgroup_no_v1=io

$ sudo dmesg |grep cgroup
...
cgroup: Disabling io control group subsystem in v1 mounts
cgroup: Disabled controller 'blkio'

Make it consistent across the pr_info()'s, by using ss->legacy_name, as
the subsystem name, while printing the cgroup v1 controller disabling
information in cgroup_init().

Signed-off-by: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 059cd5651d41..c09531bace38 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6127,7 +6127,7 @@ int __init cgroup_init(void)
 
 		if (cgroup1_ssid_disabled(ssid))
 			pr_info("Disabling %s control group subsystem in v1 mounts\n",
-				ss->name);
+				ss->legacy_name);
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-- 
cgit v1.2.3


From 829955981c557c7fc7416581c4cd68a8a0c28620 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 9 Oct 2023 11:14:13 -0500
Subject: bpf: Fix verifier log for async callback return values

The verifier, as part of check_return_code(), verifies that async
callbacks such as from e.g. timers, will return 0. It does this by
correctly checking that R0->var_off is in tnum_const(0), which
effectively checks that it's in a range of 0. If this condition fails,
however, it prints an error message which says that the value should
have been in (0x0; 0x1). This results in possibly confusing output such
as the following in which an async callback returns 1:

  At async callback the register R0 has value (0x1; 0x0) should have been in (0x0; 0x1)

The fix is easy -- we should just pass the tnum_const(0) as the correct
range to verbose_invalid_scalar(), which will then print the following:

  At async callback the register R0 has value (0x1; 0x0) should have been in (0x0; 0x0)

Fixes: bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.")
Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231009161414.235829-1-void@manifault.com
---
 kernel/bpf/verifier.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c0c7d137066a..873ade146f3d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14479,7 +14479,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	struct tnum enforce_attach_type_range = tnum_unknown;
 	const struct bpf_prog *prog = env->prog;
 	struct bpf_reg_state *reg;
-	struct tnum range = tnum_range(0, 1);
+	struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0);
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	int err;
 	struct bpf_func_state *frame = env->cur_state->frame[0];
@@ -14527,8 +14527,8 @@ static int check_return_code(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
 
-		if (!tnum_in(tnum_const(0), reg->var_off)) {
-			verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+		if (!tnum_in(const_0, reg->var_off)) {
+			verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0");
 			return -EINVAL;
 		}
 		return 0;
-- 
cgit v1.2.3


From f3a6c97940fbd25d6c84c2d5642338fc99a9b35b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:39 +0100
Subject: sched/numa: Rename vma_numab_state::access_pids[] => ::pids_active[],
 ::next_pid_reset => ::pids_active_reset

The access_pids[] field name is somewhat ambiguous as no PIDs are accessed.
Similarly, it's not clear that next_pid_reset is related to access_pids[].
Rename the fields to more accurately reflect their purpose.

[ mingo: Rename in the comments too. ]

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-3-mgorman@techsingularity.net
---
 include/linux/mm.h       |  4 ++--
 include/linux/mm_types.h |  6 +++---
 kernel/sched/fair.c      | 12 ++++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf5d0b1b16f4..19fc73b02c9f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 	unsigned int pid_bit;
 
 	pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
-	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
-		__set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+	if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+		__set_bit(pid_bit, &vma->numab_state->pids_active[1]);
 	}
 }
 #else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d7f042ec1f33..e7571eca1131 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -559,10 +559,10 @@ struct vma_numab_state {
 	unsigned long next_scan;
 
 	/*
-	 * Time in jiffies when access_pids[] is reset to
+	 * Time in jiffies when pids_active[] is reset to
 	 * detect phase change behaviour:
 	 */
-	unsigned long next_pid_reset;
+	unsigned long pids_active_reset;
 
 	/*
 	 * Approximate tracking of PIDs that trapped a NUMA hinting
@@ -574,7 +574,7 @@ struct vma_numab_state {
 	 * Window moves after next_pid_reset has expired approximately
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
-	unsigned long access_pids[2];
+	unsigned long pids_active[2];
 };
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7c1bafc0460..6b47edcbe834 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3125,7 +3125,7 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
 		return true;
 
-	pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
 	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
 }
 
@@ -3241,7 +3241,7 @@ static void task_numa_work(struct callback_head *work)
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
 			/* Reset happens after 4 times scan delay of scan start */
-			vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
 		}
 
@@ -3262,11 +3262,11 @@ static void task_numa_work(struct callback_head *work)
 		 * vma for recent access to avoid clearing PID info before access..
 		 */
 		if (mm->numa_scan_seq &&
-				time_after(jiffies, vma->numab_state->next_pid_reset)) {
-			vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+				time_after(jiffies, vma->numab_state->pids_active_reset)) {
+			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-			vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
-			vma->numab_state->access_pids[1] = 0;
+			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+			vma->numab_state->pids_active[1] = 0;
 		}
 
 		do {
-- 
cgit v1.2.3


From ed2da8b725b932b1e2b2f4835bb664d47ed03031 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:40 +0100
Subject: sched/numa: Trace decisions related to skipping VMAs

NUMA balancing skips or scans VMAs for a variety of reasons. In preparation
for completing scans of VMAs regardless of PID access, trace the reasons
why a VMA was skipped. In a later patch, the tracing will be used to track
if a VMA was forcibly scanned.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net
---
 include/linux/sched/numa_balancing.h |  8 ++++++
 include/trace/events/sched.h         | 50 ++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c                  | 17 +++++++++---
 3 files changed, 71 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..c127a1509e2f 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -15,6 +15,14 @@
 #define TNF_FAULT_LOCAL	0x08
 #define TNF_MIGRATE_FAIL 0x10
 
+enum numa_vmaskip_reason {
+	NUMAB_SKIP_UNSUITABLE,
+	NUMAB_SKIP_SHARED_RO,
+	NUMAB_SKIP_INACCESSIBLE,
+	NUMAB_SKIP_SCAN_DELAY,
+	NUMAB_SKIP_PID_INACTIVE,
+};
+
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index a13d5d06be9d..d82a04d6a1bc 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -664,6 +664,56 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
 );
 
+#ifdef CONFIG_NUMA_BALANCING
+#define NUMAB_SKIP_REASON					\
+	EM( NUMAB_SKIP_UNSUITABLE,		"unsuitable" )	\
+	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
+	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
+	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
+	EMe(NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )
+
+/* Redefine for export. */
+#undef EM
+#undef EMe
+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
+
+NUMAB_SKIP_REASON
+
+/* Redefine for symbolic printing. */
+#undef EM
+#undef EMe
+#define EM(a, b)	{ a, b },
+#define EMe(a, b)	{ a, b }
+
+TRACE_EVENT(sched_skip_vma_numa,
+
+	TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
+		 enum numa_vmaskip_reason reason),
+
+	TP_ARGS(mm, vma, reason),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, numa_scan_offset)
+		__field(unsigned long, vm_start)
+		__field(unsigned long, vm_end)
+		__field(enum numa_vmaskip_reason, reason)
+	),
+
+	TP_fast_assign(
+		__entry->numa_scan_offset	= mm->numa_scan_offset;
+		__entry->vm_start		= vma->vm_start;
+		__entry->vm_end			= vma->vm_end;
+		__entry->reason			= reason;
+	),
+
+	TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
+		  __entry->numa_scan_offset,
+		  __entry->vm_start,
+		  __entry->vm_end,
+		  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
+);
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * Tracepoint for waking a polling cpu without an IPI.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b47edcbe834..31cfdb0794fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3210,6 +3210,7 @@ static void task_numa_work(struct callback_head *work)
 	do {
 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
 			continue;
 		}
 
@@ -3220,15 +3221,19 @@ static void task_numa_work(struct callback_head *work)
 		 * as migrating the pages will be of marginal benefit.
 		 */
 		if (!vma->vm_mm ||
-		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
 			continue;
+		}
 
 		/*
 		 * Skip inaccessible VMAs to avoid any confusion between
 		 * PROT_NONE and NUMA hinting ptes
 		 */
-		if (!vma_is_accessible(vma))
+		if (!vma_is_accessible(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
 			continue;
+		}
 
 		/* Initialise new per-VMA NUMAB state. */
 		if (!vma->numab_state) {
@@ -3250,12 +3255,16 @@ static void task_numa_work(struct callback_head *work)
 		 * delay the scan for new VMAs.
 		 */
 		if (mm->numa_scan_seq && time_before(jiffies,
-						vma->numab_state->next_scan))
+						vma->numab_state->next_scan)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
 			continue;
+		}
 
 		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(vma))
+		if (!vma_is_accessed(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
 			continue;
+		}
 
 		/*
 		 * RESET access PIDs regularly for old VMAs. Resetting after checking
-- 
cgit v1.2.3


From 2e2675db1906ac04809f5399bf1f5e30d56a6f3e Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Tue, 10 Oct 2023 09:31:41 +0100
Subject: sched/numa: Move up the access pid reset logic

Recent NUMA hinting faulting activity is reset approximately every
VMA_PID_RESET_PERIOD milliseconds. However, if the current task has not
accessed a VMA then the reset check is missed and the reset is potentially
deferred forever. Check if the PID activity information should be reset
before checking if the current task recently trapped a NUMA hinting fault.

[ mgorman@techsingularity.net: Rewrite changelog ]

Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010083143.19593-5-mgorman@techsingularity.net
---
 kernel/sched/fair.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 31cfdb0794fb..ce36969625bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3260,16 +3260,7 @@ static void task_numa_work(struct callback_head *work)
 			continue;
 		}
 
-		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(vma)) {
-			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
-			continue;
-		}
-
-		/*
-		 * RESET access PIDs regularly for old VMAs. Resetting after checking
-		 * vma for recent access to avoid clearing PID info before access..
-		 */
+		/* RESET access PIDs regularly for old VMAs. */
 		if (mm->numa_scan_seq &&
 				time_after(jiffies, vma->numab_state->pids_active_reset)) {
 			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
@@ -3278,6 +3269,12 @@ static void task_numa_work(struct callback_head *work)
 			vma->numab_state->pids_active[1] = 0;
 		}
 
+		/* Do not scan the VMA if task has not accessed */
+		if (!vma_is_accessed(vma)) {
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+			continue;
+		}
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
-- 
cgit v1.2.3


From d91bdd96b55cc3ce98d883a60f133713821b80a6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Aug 2023 10:18:27 +0200
Subject: cpu/SMT: Make SMT control more robust against enumeration failures

The SMT control mechanism got added as speculation attack vector
mitigation. The implemented logic relies on the primary thread mask to
be set up properly.

This turns out to be an issue with XEN/PV guests because their CPU hotplug
mechanics do not enumerate APICs and therefore the mask is never correctly
populated.

This went unnoticed so far because by chance XEN/PV ends up with
smp_num_siblings == 2. So smt_hotplug_control stays at its default value
CPU_SMT_ENABLED and the primary thread mask is never evaluated in the
context of CPU hotplug.

This stopped "working" with the upcoming overhaul of the topology
evaluation which legitimately provides a fake topology for XEN/PV. That
sets smp_num_siblings to 1, which causes the core CPU hot-plug core to
refuse to bring up the APs.

This happens because smt_hotplug_control is set to CPU_SMT_NOT_SUPPORTED
which causes cpu_smt_allowed() to evaluate the unpopulated primary thread
mask with the conclusion that all non-boot CPUs are not valid to be
plugged.

Make cpu_smt_allowed() more robust and take CPU_SMT_NOT_SUPPORTED and
CPU_SMT_NOT_IMPLEMENTED into account. Rename it to cpu_bootable() while at
it as that makes it more clear what the function is about.

The primary mask issue on x86 XEN/PV needs to be addressed separately as
there are users outside of the CPU hotplug code too.

Fixes: 05736e4ac13c ("cpu/hotplug: Provide knobs to control SMT")
Reported-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Sohil Mehta <sohil.mehta@intel.com>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230814085112.149440843@linutronix.de
---
 kernel/cpu.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..1a189da3bdac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -659,11 +659,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu)
 #endif
 }
 
-static inline bool cpu_smt_allowed(unsigned int cpu)
+static inline bool cpu_bootable(unsigned int cpu)
 {
 	if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
 		return true;
 
+	/* All CPUs are bootable if controls are not configured */
+	if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
+		return true;
+
+	/* All CPUs are bootable if CPU is not SMT capable */
+	if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+		return true;
+
 	if (topology_is_primary_thread(cpu))
 		return true;
 
@@ -685,7 +693,7 @@ bool cpu_smt_possible(void)
 EXPORT_SYMBOL_GPL(cpu_smt_possible);
 
 #else
-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+static inline bool cpu_bootable(unsigned int cpu) { return true; }
 #endif
 
 static inline enum cpuhp_state
@@ -788,10 +796,10 @@ static int bringup_wait_for_ap_online(unsigned int cpu)
 	 * SMT soft disabling on X86 requires to bring the CPU out of the
 	 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
 	 * CPU marked itself as booted_once in notify_cpu_starting() so the
-	 * cpu_smt_allowed() check will now return false if this is not the
+	 * cpu_bootable() check will now return false if this is not the
 	 * primary sibling.
 	 */
-	if (!cpu_smt_allowed(cpu))
+	if (!cpu_bootable(cpu))
 		return -ECANCELED;
 	return 0;
 }
@@ -1741,7 +1749,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
 		err = -EBUSY;
 		goto out;
 	}
-	if (!cpu_smt_allowed(cpu)) {
+	if (!cpu_bootable(cpu)) {
 		err = -EPERM;
 		goto out;
 	}
-- 
cgit v1.2.3


From 4a8e65b0c348e42107c64381e692e282900be361 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 4 Oct 2023 01:28:59 +0200
Subject: srcu: Fix callbacks acceleration mishandling

SRCU callbacks acceleration might fail if the preceding callbacks
advance also fails. This can happen when the following steps are met:

1) The RCU_WAIT_TAIL segment has callbacks (say for gp_num 8) and the
   RCU_NEXT_READY_TAIL also has callbacks (say for gp_num 12).

2) The grace period for RCU_WAIT_TAIL is observed as started but not yet
   completed so rcu_seq_current() returns 4 + SRCU_STATE_SCAN1 = 5.

3) This value is passed to rcu_segcblist_advance() which can't move
   any segment forward and fails.

4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
   But then the call to rcu_seq_snap() observes the grace period for the
   RCU_WAIT_TAIL segment (gp_num 8) as completed and the subsequent one
   for the RCU_NEXT_READY_TAIL segment as started
   (ie: 8 + SRCU_STATE_SCAN1 = 9) so it returns a snapshot of the
   next grace period, which is 16.

5) The value of 16 is passed to rcu_segcblist_accelerate() but the
   freshly enqueued callback in RCU_NEXT_TAIL can't move to
   RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
   period (gp_num = 12). So acceleration fails.

6) Note in all these steps, srcu_invoke_callbacks() hadn't had a chance
   to run srcu_invoke_callbacks().

Then some very bad outcome may happen if the following happens:

7) Some other CPU races and starts the grace period number 16 before the
   CPU handling previous steps had a chance. Therefore srcu_gp_start()
   isn't called on the latter sdp to fix the acceleration leak from
   previous steps with a new pair of call to advance/accelerate.

8) The grace period 16 completes and srcu_invoke_callbacks() is finally
   called. All the callbacks from previous grace periods (8 and 12) are
   correctly advanced and executed but callbacks in RCU_NEXT_READY_TAIL
   still remain. Then rcu_segcblist_accelerate() is called with a
   snaphot of 20.

9) Since nothing started the grace period number 20, callbacks stay
   unhandled.

This has been reported in real load:

	[3144162.608392] INFO: task kworker/136:12:252684 blocked for more
	than 122 seconds.
	[3144162.615986]       Tainted: G           O  K   5.4.203-1-tlinux4-0011.1 #1
	[3144162.623053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
	disables this message.
	[3144162.631162] kworker/136:12  D    0 252684      2 0x90004000
	[3144162.631189] Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm]
	[3144162.631192] Call Trace:
	[3144162.631202]  __schedule+0x2ee/0x660
	[3144162.631206]  schedule+0x33/0xa0
	[3144162.631209]  schedule_timeout+0x1c4/0x340
	[3144162.631214]  ? update_load_avg+0x82/0x660
	[3144162.631217]  ? raw_spin_rq_lock_nested+0x1f/0x30
	[3144162.631218]  wait_for_completion+0x119/0x180
	[3144162.631220]  ? wake_up_q+0x80/0x80
	[3144162.631224]  __synchronize_srcu.part.19+0x81/0xb0
	[3144162.631226]  ? __bpf_trace_rcu_utilization+0x10/0x10
	[3144162.631227]  synchronize_srcu+0x5f/0xc0
	[3144162.631236]  irqfd_shutdown+0x3c/0xb0 [kvm]
	[3144162.631239]  ? __schedule+0x2f6/0x660
	[3144162.631243]  process_one_work+0x19a/0x3a0
	[3144162.631244]  worker_thread+0x37/0x3a0
	[3144162.631247]  kthread+0x117/0x140
	[3144162.631247]  ? process_one_work+0x3a0/0x3a0
	[3144162.631248]  ? __kthread_cancel_work+0x40/0x40
	[3144162.631250]  ret_from_fork+0x1f/0x30

Fix this with taking the snapshot for acceleration _before_ the read
of the current grace period number.

The only side effect of this solution is that callbacks advancing happen
then _after_ the full barrier in rcu_seq_snap(). This is not a problem
because that barrier only cares about:

1) Ordering accesses of the update side before call_srcu() so they don't
   bleed.
2) See all the accesses prior to the grace period of the current gp_num

The only things callbacks advancing need to be ordered against are
carried by snp locking.

Reported-by: Yong He <alexyonghe@tencent.com>
Co-developed-by:: Yong He <alexyonghe@tencent.com>
Signed-off-by: Yong He <alexyonghe@tencent.com>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by:  Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
Link: http://lore.kernel.org/CANZk6aR+CqZaqmMWrC2eRRPY12qAZnDZLwLnHZbNi=xXMB401g@mail.gmail.com
Fixes: da915ad5cf25 ("srcu: Parallelize callback handling")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/srcutree.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 5602042856b1..9fab9ac36996 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1244,10 +1244,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	spin_lock_irqsave_sdp_contention(sdp, &flags);
 	if (rhp)
 		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+	/*
+	 * The snapshot for acceleration must be taken _before_ the read of the
+	 * current gp sequence used for advancing, otherwise advancing may fail
+	 * and acceleration may then fail too.
+	 *
+	 * This could happen if:
+	 *
+	 *  1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
+	 *     RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
+	 *
+	 *  2) The grace period for RCU_WAIT_TAIL is seen as started but not
+	 *     completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
+	 *
+	 *  3) This value is passed to rcu_segcblist_advance() which can't move
+	 *     any segment forward and fails.
+	 *
+	 *  4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+	 *     But then the call to rcu_seq_snap() observes the grace period for the
+	 *     RCU_WAIT_TAIL segment as completed and the subsequent one for the
+	 *     RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
+	 *     so it returns a snapshot of the next grace period, which is X + 12.
+	 *
+	 *  5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+	 *     freshly enqueued callback in RCU_NEXT_TAIL can't move to
+	 *     RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+	 *     period (gp_num = X + 8). So acceleration fails.
+	 */
+	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
-	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
-	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+	WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp);
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
 		needgp = true;
-- 
cgit v1.2.3


From ff0712ea71f173f4cc9a1d33a498cb0d05debdb3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 24 Aug 2023 20:36:44 +0100
Subject: fork: Fix kernel-doc

Fix the various warnings from kernel-doc in kernel/fork.c

Signed-off-by: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Link: https://lore.kernel.org/r/20230824193644.3029141-1-willy@infradead.org
---
 kernel/fork.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..aabda7d9b2c7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1393,6 +1393,8 @@ EXPORT_SYMBOL_GPL(mmput_async);
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
  *
  * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  *
@@ -1432,6 +1434,8 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 
 /**
  * replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
  *
  * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  *
@@ -1483,6 +1487,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 
 /**
  * get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
  *
  * Returns %NULL if mm has no associated executable file.
  * User must release file via fput().
@@ -1501,6 +1506,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 
 /**
  * get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
  *
  * Returns %NULL if task's mm (if any) has no associated executable file or
  * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@ -1523,6 +1529,7 @@ struct file *get_task_exe_file(struct task_struct *task)
 
 /**
  * get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
  *
  * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
  * this kernel workthread has transiently adopted a user mm with use_mm,
@@ -2102,11 +2109,11 @@ const struct file_operations pidfd_fops = {
  * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid:   the struct pid for which to create a pidfd
  * @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the file for the pidfd.
  *
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
-
+ *
  * The helper doesn't perform checks on @pid which makes it useful for pidfds
  * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
  * pidfd file are prepared.
@@ -2153,7 +2160,7 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid:   the struct pid for which to create a pidfd
  * @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret: Where to return the pidfd.
  *
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -3181,7 +3188,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
 }
 
 /**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
  * @uargs: argument structure
  * @size:  size of @uargs
  *
-- 
cgit v1.2.3


From b7a5b537c55c088d891ae554103d1b281abef781 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:42 +0100
Subject: sched/numa: Complete scanning of partial VMAs regardless of PID
 activity

NUMA Balancing skips VMAs when the current task has not trapped a NUMA
fault within the VMA. If the VMA is skipped then mm->numa_scan_offset
advances and a task that is trapping faults within the VMA may never
fully update PTEs within the VMA.

Force tasks to update PTEs for partially scanned PTEs. The VMA will
be tagged for NUMA hints by some task but this removes some of the
benefit of tracking PID activity within a VMA. A follow-on patch
will mitigate this problem.

The test cases and machines evaluated did not trigger the corner case so
the performance results are neutral with only small changes within the
noise from normal test-to-test variance. However, the next patch makes
the corner case easier to trigger.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net
---
 include/linux/sched/numa_balancing.h |  1 +
 include/trace/events/sched.h         |  3 ++-
 kernel/sched/fair.c                  | 18 +++++++++++++++---
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index c127a1509e2f..7dcc0bdfddbb 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -21,6 +21,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_INACCESSIBLE,
 	NUMAB_SKIP_SCAN_DELAY,
 	NUMAB_SKIP_PID_INACTIVE,
+	NUMAB_SKIP_IGNORE_PID,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index d82a04d6a1bc..bfc07c10541a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
 	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
 	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
-	EMe(NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )
+	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
+	EMe(NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )
 
 /* Redefine for export. */
 #undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce36969625bd..ab79013f6e91 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3113,7 +3113,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
 	p->mm->numa_scan_offset = 0;
 }
 
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	unsigned long pids;
 	/*
@@ -3126,7 +3126,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
 		return true;
 
 	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
-	return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+		return true;
+
+	/*
+	 * Complete a scan that has already started regardless of PID access, or
+	 * some VMAs may never be scanned in multi-threaded applications:
+	 */
+	if (mm->numa_scan_offset > vma->vm_start) {
+		trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+		return true;
+	}
+
+	return false;
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
@@ -3270,7 +3282,7 @@ static void task_numa_work(struct callback_head *work)
 		}
 
 		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(vma)) {
+		if (!vma_is_accessed(mm, vma)) {
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
 			continue;
 		}
-- 
cgit v1.2.3


From f169c62ff7cd1acf8bac8ae17bfeafa307d9e6fa Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 10 Oct 2023 09:31:43 +0100
Subject: sched/numa: Complete scanning of inactive VMAs when there is no
 alternative

VMAs are skipped if there is no recent fault activity but this represents
a chicken-and-egg problem as there may be no fault activity if the PTEs
are never updated to trap NUMA hints. There is an indirect reliance on
scanning to be forced early in the lifetime of a task but this may fail
to detect changes in phase behaviour. Force inactive VMAs to be scanned
when all other eligible VMAs have been updated within the same scan
sequence.

Test results in general look good with some changes in performance, both
negative and positive, depending on whether the additional scanning and
faulting was beneficial or not to the workload. The autonuma benchmark
workload NUMA01_THREADLOCAL was picked for closer examination. The workload
creates two processes with numerous threads and thread-local storage that
is zero-filled in a loop. It exercises the corner case where unrelated
threads may skip VMAs that are thread-local to another thread and still
has some VMAs that inactive while the workload executes.

The VMA skipping activity frequency with and without the patch:

	6.6.0-rc2-sched-numabtrace-v1
	=============================
	    649 reason=scan_delay
	  9,094 reason=unsuitable
	 48,915 reason=shared_ro
	143,919 reason=inaccessible
	193,050 reason=pid_inactive

	6.6.0-rc2-sched-numabselective-v1
	=============================
	    146 reason=seq_completed
	    622 reason=ignore_pid_inactive

	    624 reason=scan_delay
	  6,570 reason=unsuitable
	 16,101 reason=shared_ro
	 27,608 reason=inaccessible
	 41,939 reason=pid_inactive

Note that with the patch applied, the PID activity is ignored
(ignore_pid_inactive) to ensure a VMA with some activity is completely
scanned. In addition, a small number of VMAs are scanned when no other
eligible VMA is available during a single scan window (seq_completed).
The number of times a VMA is skipped due to no PID activity from the
scanning task (pid_inactive) drops dramatically. It is expected that
this will increase the number of PTEs updated for NUMA hinting faults
as well as hinting faults but these represent PTEs that would otherwise
have been missed. The tradeoff is scan+fault overhead versus improving
locality due to migration.

On a 2-socket Cascade Lake test machine, the time to complete the
workload is as follows;

                                                 6.6.0-rc2              6.6.0-rc2
                                       sched-numabtrace-v1 sched-numabselective-v1
  Min       elsp-NUMA01_THREADLOCAL      174.22 (   0.00%)      117.64 (  32.48%)
  Amean     elsp-NUMA01_THREADLOCAL      175.68 (   0.00%)      123.34 *  29.79%*
  Stddev    elsp-NUMA01_THREADLOCAL        1.20 (   0.00%)        4.06 (-238.20%)
  CoeffVar  elsp-NUMA01_THREADLOCAL        0.68 (   0.00%)        3.29 (-381.70%)
  Max       elsp-NUMA01_THREADLOCAL      177.18 (   0.00%)      128.03 (  27.74%)

The time to complete the workload is reduced by almost 30%:

                     6.6.0-rc2   6.6.0-rc2
                  sched-numabtrace-v1 sched-numabselective-v1 /
  Duration User       91201.80    63506.64
  Duration System      2015.53     1819.78
  Duration Elapsed     1234.77      868.37

In this specific case, system CPU time was not increased but it's not
universally true.

From vmstat, the NUMA scanning and fault activity is as follows;

                                        6.6.0-rc2      6.6.0-rc2
                              sched-numabtrace-v1 sched-numabselective-v1
  Ops NUMA base-page range updates       64272.00    26374386.00
  Ops NUMA PTE updates                   36624.00       55538.00
  Ops NUMA PMD updates                      54.00       51404.00
  Ops NUMA hint faults                   15504.00       75786.00
  Ops NUMA hint local faults %           14860.00       56763.00
  Ops NUMA hint local percent               95.85          74.90
  Ops NUMA pages migrated                 1629.00     6469222.00

Both the number of PTE updates and hint faults is dramatically
increased. While this is superficially unfortunate, it represents
ranges that were simply skipped without the patch. As a result
of the scanning and hinting faults, many more pages were also
migrated but as the time to completion is reduced, the overhead
is offset by the gain.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Raghavendra K T <raghavendra.kt@amd.com>
Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net
---
 include/linux/mm_types.h             |  6 ++++
 include/linux/sched/numa_balancing.h |  1 +
 include/trace/events/sched.h         |  3 +-
 kernel/sched/fair.c                  | 55 ++++++++++++++++++++++++++++++++++--
 4 files changed, 61 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e7571eca1131..589f31ef2e84 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -575,6 +575,12 @@ struct vma_numab_state {
 	 * every VMA_PID_RESET_PERIOD jiffies:
 	 */
 	unsigned long pids_active[2];
+
+	/*
+	 * MM scan sequence ID when the VMA was last completely scanned.
+	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
+	 */
+	int prev_scan_seq;
 };
 
 /*
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 7dcc0bdfddbb..b69afb8630db 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -22,6 +22,7 @@ enum numa_vmaskip_reason {
 	NUMAB_SKIP_SCAN_DELAY,
 	NUMAB_SKIP_PID_INACTIVE,
 	NUMAB_SKIP_IGNORE_PID,
+	NUMAB_SKIP_SEQ_COMPLETED,
 };
 
 #ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index bfc07c10541a..6188ad0d9e0d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -671,7 +671,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	EM( NUMAB_SKIP_INACCESSIBLE,		"inaccessible" )	\
 	EM( NUMAB_SKIP_SCAN_DELAY,		"scan_delay" )	\
 	EM( NUMAB_SKIP_PID_INACTIVE,		"pid_inactive" )	\
-	EMe(NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )
+	EM( NUMAB_SKIP_IGNORE_PID,		"ignore_pid_inactive" )		\
+	EMe(NUMAB_SKIP_SEQ_COMPLETED,		"seq_completed" )
 
 /* Redefine for export. */
 #undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab79013f6e91..922905194c0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3158,6 +3158,8 @@ static void task_numa_work(struct callback_head *work)
 	unsigned long nr_pte_updates = 0;
 	long pages, virtpages;
 	struct vma_iterator vmi;
+	bool vma_pids_skipped;
+	bool vma_pids_forced = false;
 
 	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
@@ -3200,7 +3202,6 @@ static void task_numa_work(struct callback_head *work)
 	 */
 	p->node_stamp += 2 * TICK_NSEC;
 
-	start = mm->numa_scan_offset;
 	pages = sysctl_numa_balancing_scan_size;
 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
@@ -3210,6 +3211,16 @@ static void task_numa_work(struct callback_head *work)
 
 	if (!mmap_read_trylock(mm))
 		return;
+
+	/*
+	 * VMAs are skipped if the current PID has not trapped a fault within
+	 * the VMA recently. Allow scanning to be forced if there is no
+	 * suitable VMA remaining.
+	 */
+	vma_pids_skipped = false;
+
+retry_pids:
+	start = mm->numa_scan_offset;
 	vma_iter_init(&vmi, mm, start);
 	vma = vma_next(&vmi);
 	if (!vma) {
@@ -3260,6 +3271,13 @@ static void task_numa_work(struct callback_head *work)
 			/* Reset happens after 4 times scan delay of scan start */
 			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
 				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+			/*
+			 * Ensure prev_scan_seq does not match numa_scan_seq,
+			 * to prevent VMAs being skipped prematurely on the
+			 * first scan:
+			 */
+			 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
 		}
 
 		/*
@@ -3281,8 +3299,19 @@ static void task_numa_work(struct callback_head *work)
 			vma->numab_state->pids_active[1] = 0;
 		}
 
-		/* Do not scan the VMA if task has not accessed */
-		if (!vma_is_accessed(mm, vma)) {
+		/* Do not rescan VMAs twice within the same sequence. */
+		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+			mm->numa_scan_offset = vma->vm_end;
+			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
+			continue;
+		}
+
+		/*
+		 * Do not scan the VMA if task has not accessed it, unless no other
+		 * VMA candidate exists.
+		 */
+		if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+			vma_pids_skipped = true;
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
 			continue;
 		}
@@ -3311,8 +3340,28 @@ static void task_numa_work(struct callback_head *work)
 
 			cond_resched();
 		} while (end != vma->vm_end);
+
+		/* VMA scan is complete, do not scan until next sequence. */
+		vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+		/*
+		 * Only force scan within one VMA at a time, to limit the
+		 * cost of scanning a potentially uninteresting VMA.
+		 */
+		if (vma_pids_forced)
+			break;
 	} for_each_vma(vmi, vma);
 
+	/*
+	 * If no VMAs are remaining and VMAs were skipped due to the PID
+	 * not accessing the VMA previously, then force a scan to ensure
+	 * forward progress:
+	 */
+	if (!vma && !vma_pids_forced && vma_pids_skipped) {
+		vma_pids_forced = true;
+		goto retry_pids;
+	}
+
 out:
 	/*
 	 * It is possible to reach the end of the VMA list but the last few
-- 
cgit v1.2.3


From b19fdb16fb2167c6bc9ee8fbc0c1d2d4fd3e2eb8 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Tue, 10 Oct 2023 16:57:44 +0100
Subject: sched/headers: Remove comment referring to rq::cpu_load, since this
 has been removed

There is a comment that refers to cpu_load, however, this cpu_load was
removed with:

  55627e3cd22c ("sched/core: Remove rq->cpu_load[]")

... back in 2019. The comment does not make sense with respect to this
removed array, so remove the comment.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231010155744.1381065-1-colin.i.king@gmail.com
---
 kernel/sched/sched.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index faf9031422e1..65cad0e5729e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -948,10 +948,6 @@ struct rq {
 	/* runqueue lock: */
 	raw_spinlock_t		__lock;
 
-	/*
-	 * nr_running and cpu_load should be in the same cacheline because
-	 * remote CPUs use both these fields when doing load calculation.
-	 */
 	unsigned int		nr_running;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int		nr_numa_running;
-- 
cgit v1.2.3


From 29fda1ad2a64a62e1c51d61207396e2de1c67362 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 6 Oct 2023 10:21:51 +0200
Subject: printk: Reduce pr_flush() pooling time

pr_flush() does not guarantee that all messages would really get flushed
to the console. The best it could do is to wait with a given timeout.[*]

The current interval 100ms for checking the progress might seem too
long in some situations. For example, such delays are not appreciated
during suspend and resume especially when the consoles have been flushed
"long" time before the check.

On the other hand, the sleeping wait might be useful in other situations.
Especially, it would allow flushing the messages using printk kthreads
on the same CPU[*].

Use msleep(1) as a compromise.

Also measure the time using jiffies. msleep() does not guarantee
precise wakeup after the given delay. It might be much longer,
especially for times < 20s. See Documentation/timers/timers-howto.rst
for more details.

Note that msecs_to_jiffies() already translates a negative value into
an infinite timeout.

[*] console_unlock() does not guarantee flushing the consoles since
    the commit dbdda842fe96f893 ("printk: Add console owner and waiter
    logic to load balance console writes").

    It would be possible to guarantee it another way. For example,
    the spinning might be enabled only when the console_lock has been
    taken via console_trylock().

    But the load balancing is helpful. And more importantly, the flush
    with a timeout has been added as a preparation step for introducing
    printk kthreads.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Link: https://lore.kernel.org/r/20231006082151.6969-3-pmladek@suse.com
---
 kernel/printk/printk.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1919899b6897..80eb79177bab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3726,7 +3726,8 @@ late_initcall(printk_late_init);
 /* If @con is specified, only wait for that console. Otherwise wait for all. */
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
 {
-	int remaining = timeout_ms;
+	unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+	unsigned long remaining_jiffies = timeout_jiffies;
 	struct console *c;
 	u64 last_diff = 0;
 	u64 printk_seq;
@@ -3743,6 +3744,9 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 	console_unlock();
 
 	for (;;) {
+		unsigned long begin_jiffies;
+		unsigned long slept_jiffies;
+
 		diff = 0;
 
 		/*
@@ -3771,24 +3775,20 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		console_srcu_read_unlock(cookie);
 
 		if (diff != last_diff && reset_on_progress)
-			remaining = timeout_ms;
+			remaining_jiffies = timeout_jiffies;
 
 		console_unlock();
 
 		/* Note: @diff is 0 if there are no usable consoles. */
-		if (diff == 0 || remaining == 0)
+		if (diff == 0 || remaining_jiffies == 0)
 			break;
 
-		if (remaining < 0) {
-			/* no timeout limit */
-			msleep(100);
-		} else if (remaining < 100) {
-			msleep(remaining);
-			remaining = 0;
-		} else {
-			msleep(100);
-			remaining -= 100;
-		}
+		/* msleep(1) might sleep much longer. Check time by jiffies. */
+		begin_jiffies = jiffies;
+		msleep(1);
+		slept_jiffies = jiffies - begin_jiffies;
+
+		remaining_jiffies -= min(slept_jiffies, remaining_jiffies);
 
 		last_diff = diff;
 	}
-- 
cgit v1.2.3


From cefe8ce559b5ff301f59b44f97c5c313ec30c643 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Thu, 28 Sep 2023 10:06:11 +0300
Subject: locktorture: Check the correct variable for allocation failure

There is a typo so this checks the wrong variable.  "chains" plural vs
"chain" singular.  We already know that "chains" is non-zero.

Fixes: 7f993623e9eb ("locktorture: Add call_rcu_chains module parameter")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/locking/locktorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index a3abcd136f56..69d3cd2cfc3b 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1075,7 +1075,7 @@ static int call_rcu_chain_init(void)
 	if (call_rcu_chains <= 0)
 		return 0;
 	call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL);
-	if (!call_rcu_chains)
+	if (!call_rcu_chain)
 		return -ENOMEM;
 	for (i = 0; i < call_rcu_chains; i++) {
 		call_rcu_chain[i].crc_stop = false;
-- 
cgit v1.2.3


From 21ca59b365c091d583f36ac753eaa8baf947be6f Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 28 Oct 2021 12:31:14 +0200
Subject: binfmt_misc: enable sandboxed mounts

Enable unprivileged sandboxes to create their own binfmt_misc mounts.
This is based on Laurent's work in [1] but has been significantly
reworked to fix various issues we identified in earlier versions.

While binfmt_misc can currently only be mounted in the initial user
namespace, binary types registered in this binfmt_misc instance are
available to all sandboxes (Either by having them installed in the
sandbox or by registering the binary type with the F flag causing the
interpreter to be opened right away). So binfmt_misc binary types are
already delegated to sandboxes implicitly.

However, while a sandbox has access to all registered binary types in
binfmt_misc a sandbox cannot currently register its own binary types
in binfmt_misc. This has prevented various use-cases some of which were
already outlined in [1] but we have a range of issues associated with
this (cf. [3]-[5] below which are just a small sample).

Extend binfmt_misc to be mountable in non-initial user namespaces.
Similar to other filesystem such as nfsd, mqueue, and sunrpc we use
keyed superblock management. The key determines whether we need to
create a new superblock or can reuse an already existing one. We use the
user namespace of the mount as key. This means a new binfmt_misc
superblock is created once per user namespace creation. Subsequent
mounts of binfmt_misc in the same user namespace will mount the same
binfmt_misc instance. We explicitly do not create a new binfmt_misc
superblock on every binfmt_misc mount as the semantics for
load_misc_binary() line up with the keying model. This also allows us to
retrieve the relevant binfmt_misc instance based on the caller's user
namespace which can be done in a simple (bounded to 32 levels) loop.

Similar to the current binfmt_misc semantics allowing access to the
binary types in the initial binfmt_misc instance we do allow sandboxes
access to their parent's binfmt_misc mounts if they do not have created
a separate binfmt_misc instance.

Overall, this will unblock the use-cases mentioned below and in general
will also allow to support and harden execution of another
architecture's binaries in tight sandboxes. For instance, using the
unshare binary it possible to start a chroot of another architecture and
configure the binfmt_misc interpreter without being root to run the
binaries in this chroot and without requiring the host to modify its
binary type handlers.

Henning had already posted a few experiments in the cover letter at [1].
But here's an additional example where an unprivileged container
registers qemu-user-static binary handlers for various binary types in
its separate binfmt_misc mount and is then seamlessly able to start
containers with a different architecture without affecting the host:

root    [lxc monitor] /var/snap/lxd/common/lxd/containers f1
1000000  \_ /sbin/init
1000000      \_ /lib/systemd/systemd-journald
1000000      \_ /lib/systemd/systemd-udevd
1000100      \_ /lib/systemd/systemd-networkd
1000101      \_ /lib/systemd/systemd-resolved
1000000      \_ /usr/sbin/cron -f
1000103      \_ /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation --syslog-only
1000000      \_ /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1000104      \_ /usr/sbin/rsyslogd -n -iNONE
1000000      \_ /lib/systemd/systemd-logind
1000000      \_ /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1000107      \_ dnsmasq --conf-file=/dev/null -u lxc-dnsmasq --strict-order --bind-interfaces --pid-file=/run/lxc/dnsmasq.pid --liste
1000000      \_ [lxc monitor] /var/lib/lxc f1-s390x
1100000          \_ /usr/bin/qemu-s390x-static /sbin/init
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-journald
1100000              \_ /usr/bin/qemu-s390x-static /usr/sbin/cron -f
1100103              \_ /usr/bin/qemu-s390x-static /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-ac
1100000              \_ /usr/bin/qemu-s390x-static /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
1100104              \_ /usr/bin/qemu-s390x-static /usr/sbin/rsyslogd -n -iNONE
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-logind
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/0 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/1 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/2 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /sbin/agetty -o -p -- \u --noclear --keep-baud pts/3 115200,38400,9600 vt220
1100000              \_ /usr/bin/qemu-s390x-static /lib/systemd/systemd-udevd

[1]: https://lore.kernel.org/all/20191216091220.465626-1-laurent@vivier.eu
[2]: https://discuss.linuxcontainers.org/t/binfmt-misc-permission-denied
[3]: https://discuss.linuxcontainers.org/t/lxd-binfmt-support-for-qemu-static-interpreters
[4]: https://discuss.linuxcontainers.org/t/3-1-0-binfmt-support-service-in-unprivileged-guest-requires-write-access-on-hosts-proc-sys-fs-binfmt-misc
[5]: https://discuss.linuxcontainers.org/t/qemu-user-static-not-working-4-11

Link: https://lore.kernel.org/r/20191216091220.465626-2-laurent@vivier.eu (origin)
Link: https://lore.kernel.org/r/20211028103114.2849140-2-brauner@kernel.org (v1)
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Jann Horn <jannh@google.com>
Cc: Henning Schild <henning.schild@siemens.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Laurent Vivier <laurent@vivier.eu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Laurent Vivier <laurent@vivier.eu>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
/* v2 */
- Serge Hallyn <serge@hallyn.com>:
  - Use GFP_KERNEL_ACCOUNT for userspace triggered allocations when a
    new binary type handler is registered.
- Christian Brauner <christian.brauner@ubuntu.com>:
  - Switch authorship to me. I refused to do that earlier even though
    Laurent said I should do so because I think it's genuinely bad form.
    But by now I have changed so many things that it'd be unfair to
    blame Laurent for any potential bugs in here.
  - Add more comments that explain what's going on.
  - Rename functions while changing them to better reflect what they are
    doing to make the code easier to understand.
  - In the first version when a specific binary type handler was removed
    either through a write to the entry's file or all binary type
    handlers were removed by a write to the binfmt_misc mount's status
    file all cleanup work happened during inode eviction.
    That includes removal of the relevant entries from entry list. While
    that works fine I disliked that model after thinking about it for a
    bit. Because it means that there was a window were someone has
    already removed a or all binary handlers but they could still be
    safely reached from load_misc_binary() when it has managed to take
    the read_lock() on the entries list while inode eviction was already
    happening. Again, that perfectly benign but it's cleaner to remove
    the binary handler from the list immediately meaning that ones the
    write to then entry's file or the binfmt_misc status file returns
    the binary type cannot be executed anymore. That gives stronger
    guarantees to the user.
---
 fs/binfmt_misc.c               | 202 ++++++++++++++++++++++++++++++++++-------
 include/linux/binfmts.h        |  10 ++
 include/linux/user_namespace.h |   8 ++
 kernel/user.c                  |  13 +++
 kernel/user_namespace.c        |   3 +
 5 files changed, 202 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index cf5ed5cd4102..deacc105119d 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -40,9 +40,6 @@ enum {
 	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
 };
 
-static LIST_HEAD(entries);
-static int enabled = 1;
-
 enum {Enabled, Magic};
 #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
 #define MISC_FMT_OPEN_BINARY (1UL << 30)
@@ -63,7 +60,6 @@ typedef struct {
 	refcount_t users;		/* sync removal with load_misc_binary() */
 } Node;
 
-static DEFINE_RWLOCK(entries_lock);
 static struct file_system_type bm_fs_type;
 
 /*
@@ -91,13 +87,14 @@ static struct file_system_type bm_fs_type;
  *
  * Return: binary type list entry on success, NULL on failure
  */
-static Node *search_binfmt_handler(struct linux_binprm *bprm)
+static Node *search_binfmt_handler(struct binfmt_misc *misc,
+				   struct linux_binprm *bprm)
 {
 	char *p = strrchr(bprm->interp, '.');
 	Node *e;
 
 	/* Walk all the registered handlers. */
-	list_for_each_entry(e, &entries, list) {
+	list_for_each_entry(e, &misc->entries, list) {
 		char *s;
 		int j;
 
@@ -140,15 +137,16 @@ static Node *search_binfmt_handler(struct linux_binprm *bprm)
  *
  * Return: binary type list entry on success, NULL on failure
  */
-static Node *get_binfmt_handler(struct linux_binprm *bprm)
+static Node *get_binfmt_handler(struct binfmt_misc *misc,
+				struct linux_binprm *bprm)
 {
 	Node *e;
 
-	read_lock(&entries_lock);
-	e = search_binfmt_handler(bprm);
+	read_lock(&misc->entries_lock);
+	e = search_binfmt_handler(misc, bprm);
 	if (e)
 		refcount_inc(&e->users);
-	read_unlock(&entries_lock);
+	read_unlock(&misc->entries_lock);
 	return e;
 }
 
@@ -169,6 +167,35 @@ static void put_binfmt_handler(Node *e)
 	}
 }
 
+/**
+ * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+ *
+ * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+ * If a user namespace doesn't have its own binfmt_misc mount it can make use
+ * of its ancestor's binfmt_misc handlers. This mimicks the behavior of
+ * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where
+ * available to all user and user namespaces on the system.
+ *
+ * Return: the binfmt_misc instance of the caller's user namespace
+ */
+static struct binfmt_misc *load_binfmt_misc(void)
+{
+	const struct user_namespace *user_ns;
+	struct binfmt_misc *misc;
+
+	user_ns = current_user_ns();
+	while (user_ns) {
+		/* Pairs with smp_store_release() in bm_fill_super(). */
+		misc = smp_load_acquire(&user_ns->binfmt_misc);
+		if (misc)
+			return misc;
+
+		user_ns = user_ns->parent;
+	}
+
+	return &init_binfmt_misc;
+}
+
 /*
  * the loader itself
  */
@@ -176,13 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file *interp_file = NULL;
-	int retval;
+	int retval = -ENOEXEC;
+	struct binfmt_misc *misc;
 
-	retval = -ENOEXEC;
-	if (!enabled)
+	misc = load_binfmt_misc();
+	if (!misc->enabled)
 		return retval;
 
-	fmt = get_binfmt_handler(bprm);
+	fmt = get_binfmt_handler(misc, bprm);
 	if (!fmt)
 		return retval;
 
@@ -240,9 +268,9 @@ ret:
 	/*
 	 * If we actually put the node here all concurrent calls to
 	 * load_misc_binary() will have finished. We also know
-	 * that for the refcount to be zero ->evict_inode() must have removed
-	 * the node to be deleted from the list. All that is left for us is to
-	 * close and free.
+	 * that for the refcount to be zero someone must have concurently
+	 * removed the binary type handler from the list and it's our job to
+	 * free it.
 	 */
 	put_binfmt_handler(fmt);
 
@@ -334,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 	err = -ENOMEM;
 	memsize = sizeof(Node) + count + 8;
-	e = kmalloc(memsize, GFP_KERNEL);
+	e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
 	if (!e)
 		goto out;
 
@@ -446,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 			if (e->mask) {
 				int i;
-				char *masked = kmalloc(e->size, GFP_KERNEL);
+				char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
 
 				print_hex_dump_bytes(
 					KBUILD_MODNAME ": register:  mask[decoded]: ",
@@ -599,6 +627,22 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 	return inode;
 }
 
+/**
+ * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+ * @inode: inode of the relevant binfmt_misc instance
+ *
+ * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+ * be done without any memory barriers because we are guaranteed that
+ * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+ * binfmt_misc mount was first created.
+ *
+ * Return: struct binfmt_misc of the relevant binfmt_misc instance
+ */
+static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+{
+	return inode->i_sb->s_user_ns->binfmt_misc;
+}
+
 /**
  * bm_evict_inode - cleanup data associated with @inode
  * @inode: inode to which the data is attached
@@ -619,10 +663,13 @@ static void bm_evict_inode(struct inode *inode)
 	clear_inode(inode);
 
 	if (e) {
-		write_lock(&entries_lock);
+		struct binfmt_misc *misc;
+
+		misc = i_binfmt_misc(inode);
+		write_lock(&misc->entries_lock);
 		if (!list_empty(&e->list))
 			list_del_init(&e->list);
-		write_unlock(&entries_lock);
+		write_unlock(&misc->entries_lock);
 		put_binfmt_handler(e);
 	}
 }
@@ -677,11 +724,11 @@ static void unlink_binfmt_dentry(struct dentry *dentry)
  * to use writes to files in order to delete binary type handlers. But it has
  * worked for so long that it's not a pressing issue.
  */
-static void remove_binfmt_handler(Node *e)
+static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
 {
-	write_lock(&entries_lock);
+	write_lock(&misc->entries_lock);
 	list_del_init(&e->list);
-	write_unlock(&entries_lock);
+	write_unlock(&misc->entries_lock);
 	unlink_binfmt_dentry(e->dentry);
 }
 
@@ -737,7 +784,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 		 * actually remove the entry from the list.
 		 */
 		if (!list_empty(&e->list))
-			remove_binfmt_handler(e);
+			remove_binfmt_handler(i_binfmt_misc(inode), e);
 
 		inode_unlock(inode);
 		break;
@@ -763,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	struct inode *inode;
 	struct super_block *sb = file_inode(file)->i_sb;
 	struct dentry *root = sb->s_root, *dentry;
+	struct binfmt_misc *misc;
 	int err = 0;
 	struct file *f = NULL;
 
@@ -772,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 		return PTR_ERR(e);
 
 	if (e->flags & MISC_FMT_OPEN_FILE) {
+		const struct cred *old_cred;
+
+		/*
+		 * Now that we support unprivileged binfmt_misc mounts make
+		 * sure we use the credentials that the register @file was
+		 * opened with to also open the interpreter. Before that this
+		 * didn't matter much as only a privileged process could open
+		 * the register file.
+		 */
+		old_cred = override_creds(file->f_cred);
 		f = open_exec(e->interpreter);
+		revert_creds(old_cred);
 		if (IS_ERR(f)) {
 			pr_notice("register: failed to install interpreter file %s\n",
 				 e->interpreter);
@@ -804,9 +863,10 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
-	write_lock(&entries_lock);
-	list_add(&e->list, &entries);
-	write_unlock(&entries_lock);
+	misc = i_binfmt_misc(inode);
+	write_lock(&misc->entries_lock);
+	list_add(&e->list, &misc->entries);
+	write_unlock(&misc->entries_lock);
 
 	err = 0;
 out2:
@@ -833,26 +893,31 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled\n" : "disabled\n";
+	struct binfmt_misc *misc;
+	char *s;
 
+	misc = i_binfmt_misc(file_inode(file));
+	s = misc->enabled ? "enabled\n" : "disabled\n";
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
 
 static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		size_t count, loff_t *ppos)
 {
+	struct binfmt_misc *misc;
 	int res = parse_command(buffer, count);
 	Node *e, *next;
 	struct inode *inode;
 
+	misc = i_binfmt_misc(file_inode(file));
 	switch (res) {
 	case 1:
 		/* Disable all handlers. */
-		enabled = 0;
+		misc->enabled = false;
 		break;
 	case 2:
 		/* Enable all handlers. */
-		enabled = 1;
+		misc->enabled = true;
 		break;
 	case 3:
 		/* Delete all handlers. */
@@ -868,8 +933,8 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		 * read-only. So we only need to take the write lock when we
 		 * actually remove the entry from the list.
 		 */
-		list_for_each_entry_safe(e, next, &entries, list)
-			remove_binfmt_handler(e);
+		list_for_each_entry_safe(e, next, &misc->entries, list)
+			remove_binfmt_handler(misc, e);
 
 		inode_unlock(inode);
 		break;
@@ -888,32 +953,100 @@ static const struct file_operations bm_status_operations = {
 
 /* Superblock handling */
 
+static void bm_put_super(struct super_block *sb)
+{
+	struct user_namespace *user_ns = sb->s_fs_info;
+
+	sb->s_fs_info = NULL;
+	put_user_ns(user_ns);
+}
+
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
 	.evict_inode	= bm_evict_inode,
+	.put_super	= bm_put_super,
 };
 
 static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	int err;
+	struct user_namespace *user_ns = sb->s_user_ns;
+	struct binfmt_misc *misc;
 	static const struct tree_descr bm_files[] = {
 		[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
 
+	if (WARN_ON(user_ns != current_user_ns()))
+		return -EINVAL;
+
+	/*
+	 * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+	 * do it here during the first mount of binfmt_misc. We don't need to
+	 * waste memory for every user namespace allocation. It's likely much
+	 * more common to not mount a separate binfmt_misc instance than it is
+	 * to mount one.
+	 *
+	 * While multiple superblocks can exist they are keyed by userns in
+	 * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+	 * bm_fill_super() is called exactly once whenever a binfmt_misc
+	 * superblock for a userns is created. This in turn lets us conclude
+	 * that when a binfmt_misc superblock is created for the first time for
+	 * a userns there's no one racing us. Therefore we don't need any
+	 * barriers when we dereference binfmt_misc.
+	 */
+	misc = user_ns->binfmt_misc;
+	if (!misc) {
+		/*
+		 * If it turns out that most user namespaces actually want to
+		 * register their own binary type handler and therefore all
+		 * create their own separate binfm_misc mounts we should
+		 * consider turning this into a kmem cache.
+		 */
+		misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!misc)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&misc->entries);
+		rwlock_init(&misc->entries_lock);
+
+		/* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+		smp_store_release(&user_ns->binfmt_misc, misc);
+	}
+
+	/*
+	 * When the binfmt_misc superblock for this userns is shutdown
+	 * ->enabled might have been set to false and we don't reinitialize
+	 * ->enabled again in put_super() as someone might already be mounting
+	 * binfmt_misc again. It also would be pointless since by the time
+	 * ->put_super() is called we know that the binary type list for this
+	 * bintfmt_misc mount is empty making load_misc_binary() return
+	 * -ENOEXEC independent of whether ->enabled is true. Instead, if
+	 * someone mounts binfmt_misc for the first time or again we simply
+	 * reset ->enabled to true.
+	 */
+	misc->enabled = true;
+
 	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
 	if (!err)
 		sb->s_op = &s_ops;
 	return err;
 }
 
+static void bm_free(struct fs_context *fc)
+{
+	if (fc->s_fs_info)
+		put_user_ns(fc->s_fs_info);
+}
+
 static int bm_get_tree(struct fs_context *fc)
 {
-	return get_tree_single(fc, bm_fill_super);
+	return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
 }
 
 static const struct fs_context_operations bm_context_ops = {
+	.free		= bm_free,
 	.get_tree	= bm_get_tree,
 };
 
@@ -932,6 +1065,7 @@ static struct file_system_type bm_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "binfmt_misc",
 	.init_fs_context = bm_init_fs_context,
+	.fs_flags	= FS_USERNS_MOUNT,
 	.kill_sb	= kill_litter_super,
 };
 MODULE_ALIAS_FS("binfmt_misc");
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8d51f69f9f5e..70f97f685bff 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,6 +90,16 @@ struct linux_binfmt {
 #endif
 } __randomize_layout;
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc {
+	struct list_head entries;
+	rwlock_t entries_lock;
+	bool enabled;
+} __randomize_layout;
+
+extern struct binfmt_misc init_binfmt_misc;
+#endif
+
 extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
 
 /* Registration of default binfmt handlers */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..6030a8235617 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -65,6 +65,10 @@ enum rlimit_type {
 	UCOUNT_RLIMIT_COUNTS,
 };
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc;
+#endif
+
 struct user_namespace {
 	struct uid_gid_map	uid_map;
 	struct uid_gid_map	gid_map;
@@ -102,6 +106,10 @@ struct user_namespace {
 	struct ucounts		*ucounts;
 	long ucount_max[UCOUNT_COUNTS];
 	long rlimit_max[UCOUNT_RLIMIT_COUNTS];
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	struct binfmt_misc *binfmt_misc;
+#endif
 } __randomize_layout;
 
 struct ucounts {
diff --git a/kernel/user.c b/kernel/user.c
index d667debeafd6..03cedc366dc9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,8 +18,18 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/binfmts.h>
 #include <linux/proc_ns.h>
 
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc init_binfmt_misc = {
+	.entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
+	.enabled = true,
+	.entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
+};
+EXPORT_SYMBOL_GPL(init_binfmt_misc);
+#endif
+
 /*
  * userns count is 1 for root user, 1 for init_uts_ns,
  * and 1 for... ?
@@ -67,6 +77,9 @@ struct user_namespace init_user_ns = {
 	.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
 	.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
 #endif
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	.binfmt_misc = &init_binfmt_misc,
+#endif
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..d52a894ecf57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work)
 			kfree(ns->projid_map.forward);
 			kfree(ns->projid_map.reverse);
 		}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+		kfree(ns->binfmt_misc);
+#endif
 		retire_userns_sysctls(ns);
 		key_free_user_ns(ns);
 		ns_free_inum(&ns->ns);
-- 
cgit v1.2.3


From 80cc1d1d5ee35701daf11725ce06d8a240588973 Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Tue, 10 Oct 2023 16:41:07 +0800
Subject: sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there
 are no state changes

When psimon wakes up and there are no state changes for ->rtpoll_states,
it's unnecessary to update triggers and ->rtpoll_total because the pressures
being monitored by the user have not changed.

This will help to slightly reduce unnecessary computations of PSI.

[ mingo: Changelog updates ]

Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Ziljstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/202310101641075436843@zte.com.cn
---
 kernel/sched/psi.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index be853f227e40..79f8db0c6150 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -704,11 +704,12 @@ static void psi_rtpoll_work(struct psi_group *group)
 	}
 
 	if (now >= group->rtpoll_next_update) {
-		update_triggers(group, now, &update_total, PSI_POLL);
-		group->rtpoll_next_update = now + group->rtpoll_min_period;
-		if (update_total)
+		if (changed_states & group->rtpoll_states) {
+			update_triggers(group, now, &update_total, PSI_POLL);
 			memcpy(group->rtpoll_total, group->total[PSI_POLL],
 				   sizeof(group->rtpoll_total));
+		}
+		group->rtpoll_next_update = now + group->rtpoll_min_period;
 	}
 
 	psi_schedule_rtpoll_work(group,
-- 
cgit v1.2.3


From 3657680f38cd7df413d665f2b2f38e9a78130d8b Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang29@zte.com.cn>
Date: Tue, 10 Oct 2023 16:45:43 +0800
Subject: sched/psi: Delete the 'update_total' function parameter from
 update_triggers()

The 'update_total' parameter of update_triggers() is always true after the
previous commit:

  80cc1d1d5ee3 ("sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes")

If the 'changed_states & group->rtpoll_states' condition is true,
'new_stall' in update_triggers() will be true, and then 'update_total'
should also be true.

So update_total is redundant - remove it.

[ mingo: Changelog updates ]

Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Ziljstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/202310101645437859599@zte.com.cn
---
 kernel/sched/psi.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 79f8db0c6150..44a78774ae87 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
 	return growth;
 }
 
-static void update_triggers(struct psi_group *group, u64 now, bool *update_total,
+static void update_triggers(struct psi_group *group, u64 now,
 						   enum psi_aggregators aggregator)
 {
 	struct psi_trigger *t;
 	u64 *total = group->total[aggregator];
 	struct list_head *triggers;
 	u64 *aggregator_total;
-	*update_total = false;
 
 	if (aggregator == PSI_AVGS) {
 		triggers = &group->avg_triggers;
@@ -471,14 +470,6 @@ static void update_triggers(struct psi_group *group, u64 now, bool *update_total
 		 * events without dropping any).
 		 */
 		if (new_stall) {
-			/*
-			 * Multiple triggers might be looking at the same state,
-			 * remember to update group->polling_total[] once we've
-			 * been through all of them. Also remember to extend the
-			 * polling time if we see new stall activity.
-			 */
-			*update_total = true;
-
 			/* Calculate growth since last update */
 			growth = window_update(&t->win, now, total[t->state]);
 			if (!t->pending_event) {
@@ -563,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
-	bool update_total;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -582,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work)
 	 * go - see calc_avgs() and missed_periods.
 	 */
 	if (now >= group->avg_next_update) {
-		update_triggers(group, now, &update_total, PSI_AVGS);
+		update_triggers(group, now, PSI_AVGS);
 		group->avg_next_update = update_averages(group, now);
 	}
 
@@ -638,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group)
 {
 	bool force_reschedule = false;
 	u32 changed_states;
-	bool update_total;
 	u64 now;
 
 	mutex_lock(&group->rtpoll_trigger_lock);
@@ -705,7 +694,7 @@ static void psi_rtpoll_work(struct psi_group *group)
 
 	if (now >= group->rtpoll_next_update) {
 		if (changed_states & group->rtpoll_states) {
-			update_triggers(group, now, &update_total, PSI_POLL);
+			update_triggers(group, now, PSI_POLL);
 			memcpy(group->rtpoll_total, group->total[PSI_POLL],
 				   sizeof(group->rtpoll_total));
 		}
-- 
cgit v1.2.3


From fefba7d1ae198dcbf8b3b432de46a4e29f8dbd8c Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:04 +0200
Subject: bpf: Propagate modified uaddrlen from cgroup sockaddr programs

As prep for adding unix socket support to the cgroup sockaddr hooks,
let's propagate the sockaddr length back to the caller after running
a bpf cgroup sockaddr hook program. While not important for AF_INET or
AF_INET6, the sockaddr length is important when working with AF_UNIX
sockaddrs as the size of the sockaddr cannot be determined just from the
address family or the sockaddr's contents.

__cgroup_bpf_run_filter_sock_addr() is modified to take the uaddrlen as
an input/output argument. After running the program, the modified sockaddr
length is stored in the uaddrlen pointer.

Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-3-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup.h | 73 +++++++++++++++++++++++-----------------------
 include/linux/filter.h     |  1 +
 kernel/bpf/cgroup.c        | 17 +++++++++--
 net/ipv4/af_inet.c         |  7 +++--
 net/ipv4/ping.c            |  2 +-
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/udp.c             |  9 ++++--
 net/ipv6/af_inet6.c        |  9 +++---
 net/ipv6/ping.c            |  2 +-
 net/ipv6/tcp_ipv6.c        |  2 +-
 net/ipv6/udp.c             |  6 ++--
 11 files changed, 76 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..31561e789715 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -120,6 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags);
@@ -230,22 +231,22 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)				       \
 	BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
 
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)				       \
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))					       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, NULL);  \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)		       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx)	       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  t_ctx, NULL);	       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, t_ctx, NULL); \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -256,14 +257,14 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
  */
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)	       \
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \
 ({									       \
 	u32 __flags = 0;						       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled(atype))	{				       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
-							  NULL, &__flags);     \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
+							  atype, NULL, &__flags); \
 		release_sock(sk);					       \
 		if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)	       \
 			*bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;	       \
@@ -276,29 +277,29 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 	  cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&		       \
 	 (sk)->sk_prot->pre_connect)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)			       \
-	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen)			\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT)
 
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)			\
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
@@ -477,24 +478,24 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 }
 
 #define cgroup_bpf_enabled(atype) (0)
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ff7ecc89d3dd..bcd2bc15ff56 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1335,6 +1335,7 @@ struct bpf_sock_addr_kern {
 	 */
 	u64 tmp_reg;
 	void *t_ctx;	/* Attach type specific context. */
+	u32 uaddrlen;
 };
 
 struct bpf_sock_ops_kern {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03b3d4492980..ac37bd53aee0 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1450,6 +1450,9 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  *                                       provided by user sockaddr
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
+ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
+ *            read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
+ *            uaddr.
  * @atype: The type of program to be executed
  * @t_ctx: Pointer to attach type specific context
  * @flags: Pointer to u32 which contains higher bits of BPF program
@@ -1462,6 +1465,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
+				      int *uaddrlen,
 				      enum cgroup_bpf_attach_type atype,
 				      void *t_ctx,
 				      u32 *flags)
@@ -1473,6 +1477,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	};
 	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
+	int ret;
 
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
@@ -1483,11 +1488,19 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (!ctx.uaddr) {
 		memset(&unspec, 0, sizeof(unspec));
 		ctx.uaddr = (struct sockaddr *)&unspec;
+		ctx.uaddrlen = 0;
+	} else {
+		ctx.uaddrlen = *uaddrlen;
 	}
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
-				     0, flags);
+	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+				    0, flags);
+
+	if (!ret && uaddr)
+		*uaddrlen = ctx.uaddrlen;
+
+	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3d2e30e20473..7e27ad37b939 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -452,7 +452,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
 						 CGROUP_INET4_BIND, &flags);
 	if (err)
 		return err;
@@ -788,6 +788,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 	struct sock *sk		= sock->sk;
 	struct inet_sock *inet	= inet_sk(sk);
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+	int sin_addr_len = sizeof(*sin);
 
 	sin->sin_family = AF_INET;
 	lock_sock(sk);
@@ -800,7 +801,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		}
 		sin->sin_port = inet->inet_dport;
 		sin->sin_addr.s_addr = inet->inet_daddr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET4_GETPEERNAME);
 	} else {
 		__be32 addr = inet->inet_rcv_saddr;
@@ -808,7 +809,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 			addr = inet->inet_saddr;
 		sin->sin_port = inet->inet_sport;
 		sin->sin_addr.s_addr = addr;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET4_GETSOCKNAME);
 	}
 	release_sock(sk);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b188..2887177822c9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f13eb7e23d03..7c18dd3ce011 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -194,7 +194,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sock_owned_by_me(sk);
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
 }
 
 /* This will initiate an outgoing connection. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b6354..7b21a51dd25a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1143,7 +1143,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
-					    (struct sockaddr *)usin, &ipc.addr);
+					    (struct sockaddr *)usin,
+					    &msg->msg_namelen,
+					    &ipc.addr);
 		if (err)
 			goto out_free;
 		if (usin) {
@@ -1865,7 +1867,8 @@ try_again:
 		*addr_len = sizeof(*sin);
 
 		BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
-						      (struct sockaddr *)sin);
+						      (struct sockaddr *)sin,
+						      addr_len);
 	}
 
 	if (udp_test_bit(GRO_ENABLED, sk))
@@ -1904,7 +1907,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 EXPORT_SYMBOL(udp_pre_connect);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 6337fb4504fd..c35d302a3da9 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -454,7 +454,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
-	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+	err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
 						 CGROUP_INET6_BIND, &flags);
 	if (err)
 		return err;
@@ -520,6 +520,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		  int peer)
 {
 	struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+	int sin_addr_len = sizeof(*sin);
 	struct sock *sk = sock->sk;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -539,7 +540,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin6_addr = sk->sk_v6_daddr;
 		if (inet6_test_bit(SNDFLOW, sk))
 			sin->sin6_flowinfo = np->flow_label;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET6_GETPEERNAME);
 	} else {
 		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -547,13 +548,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		else
 			sin->sin6_addr = sk->sk_v6_rcv_saddr;
 		sin->sin6_port = inet->inet_sport;
-		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+		BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
 				       CGROUP_INET6_GETSOCKNAME);
 	}
 	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
 						 sk->sk_bound_dev_if);
 	release_sock(sk);
-	return sizeof(*sin);
+	return sin_addr_len;
 }
 EXPORT_SYMBOL(inet6_getname);
 
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index e8fb0d275cc2..d2098dd4ceae 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -56,7 +56,7 @@ static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 94afb8d0f2d0..3a1e76a2d33e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -135,7 +135,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sock_owned_by_me(sk);
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
 }
 
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5e9312eefed0..622b10a549f7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -410,7 +410,8 @@ try_again:
 		*addr_len = sizeof(*sin6);
 
 		BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
-						      (struct sockaddr *)sin6);
+						      (struct sockaddr *)sin6,
+						      addr_len);
 	}
 
 	if (udp_test_bit(GRO_ENABLED, sk))
@@ -1157,7 +1158,7 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
-	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
 }
 
 /**
@@ -1510,6 +1511,7 @@ do_udp_sendmsg:
 	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
 					   (struct sockaddr *)sin6,
+					   &addr_len,
 					   &fl6->saddr);
 		if (err)
 			goto out_no_dst;
-- 
cgit v1.2.3


From 53e380d21441909b12b6e0782b77187ae4b971c4 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:05 +0200
Subject: bpf: Add bpf_sock_addr_set_sun_path() to allow writing unix sockaddr
 from bpf

As prep for adding unix socket support to the cgroup sockaddr hooks,
let's add a kfunc bpf_sock_addr_set_sun_path() that allows modifying a unix
sockaddr from bpf. While this is already possible for AF_INET and AF_INET6,
we'll need this kfunc when we add unix socket support since modifying the
address for those requires modifying both the address and the sockaddr
length.

Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-4-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/btf.c  |  1 +
 net/core/filter.c | 35 ++++++++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 69101200c124..15d71d2986d3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7850,6 +7850,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_SYSCALL:
 		return BTF_KFUNC_HOOK_SYSCALL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		return BTF_KFUNC_HOOK_CGROUP_SKB;
 	case BPF_PROG_TYPE_SCHED_ACT:
 		return BTF_KFUNC_HOOK_SCHED_ACT;
diff --git a/net/core/filter.c b/net/core/filter.c
index 3880bf0b740d..ff0bd9f20b95 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -81,6 +81,7 @@
 #include <net/xdp.h>
 #include <net/mptcp.h>
 #include <net/netfilter/nf_conntrack_bpf.h>
+#include <linux/un.h>
 
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -11768,6 +11769,27 @@ __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
 
 	return 0;
 }
+
+__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
+					   const u8 *sun_path, u32 sun_path__sz)
+{
+	struct sockaddr_un *un;
+
+	if (sa_kern->sk->sk_family != AF_UNIX)
+		return -EINVAL;
+
+	/* We do not allow changing the address to unnamed or larger than the
+	 * maximum allowed address size for a unix sockaddr.
+	 */
+	if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
+		return -EINVAL;
+
+	un = (struct sockaddr_un *)sa_kern->uaddr;
+	memcpy(un->sun_path, sun_path, sun_path__sz);
+	sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
+
+	return 0;
+}
 __diag_pop();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11792,6 +11814,10 @@ BTF_SET8_START(bpf_kfunc_check_set_xdp)
 BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
 BTF_SET8_END(bpf_kfunc_check_set_xdp)
 
+BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
+BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
+BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_skb,
@@ -11802,6 +11828,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
 	.set = &bpf_kfunc_check_set_xdp,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_sock_addr,
+};
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -11816,7 +11847,9 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+						&bpf_kfunc_set_sock_addr);
 }
 late_initcall(bpf_kfunc_init);
 
-- 
cgit v1.2.3


From 859051dd165ec6cc915f0f2114699021144fd249 Mon Sep 17 00:00:00 2001
From: Daan De Meyer <daan.j.demeyer@gmail.com>
Date: Wed, 11 Oct 2023 20:51:06 +0200
Subject: bpf: Implement cgroup sockaddr hooks for unix sockets

These hooks allows intercepting connect(), getsockname(),
getpeername(), sendmsg() and recvmsg() for unix sockets. The unix
socket hooks get write access to the address length because the
address length is not fixed when dealing with unix sockets and
needs to be modified when a unix socket address is modified by
the hook. Because abstract socket unix addresses start with a
NUL byte, we cannot recalculate the socket address in kernelspace
after running the hook by calculating the length of the unix socket
path using strlen().

These hooks can be used when users want to multiplex syscall to a
single unix socket to multiple different processes behind the scenes
by redirecting the connect() and other syscalls to process specific
sockets.

We do not implement support for intercepting bind() because when
using bind() with unix sockets with a pathname address, this creates
an inode in the filesystem which must be cleaned up. If we rewrite
the address, the user might try to clean up the wrong file, leaking
the socket in the filesystem where it is never cleaned up. Until we
figure out a solution for this (and a use case for intercepting bind()),
we opt to not allow rewriting the sockaddr in bind() calls.

We also implement recvmsg() support for connected streams so that
after a connect() that is modified by a sockaddr hook, any corresponding
recmvsg() on the connected socket can also be modified to make the
connected program think it is connected to the "intended" remote.

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Link: https://lore.kernel.org/r/20231011185113.140426-5-daan.j.demeyer@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf-cgroup-defs.h |  5 +++++
 include/linux/bpf-cgroup.h      | 17 +++++++++++++++++
 include/uapi/linux/bpf.h        | 13 +++++++++----
 kernel/bpf/cgroup.c             | 11 +++++++++--
 kernel/bpf/syscall.c            | 15 +++++++++++++++
 kernel/bpf/verifier.c           |  5 ++++-
 net/core/filter.c               | 14 ++++++++++++--
 net/unix/af_unix.c              | 35 ++++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h  | 13 +++++++++----
 9 files changed, 114 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 7b121bd780eb..0985221d5478 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -28,19 +28,24 @@ enum cgroup_bpf_attach_type {
 	CGROUP_INET6_BIND,
 	CGROUP_INET4_CONNECT,
 	CGROUP_INET6_CONNECT,
+	CGROUP_UNIX_CONNECT,
 	CGROUP_INET4_POST_BIND,
 	CGROUP_INET6_POST_BIND,
 	CGROUP_UDP4_SENDMSG,
 	CGROUP_UDP6_SENDMSG,
+	CGROUP_UNIX_SENDMSG,
 	CGROUP_SYSCTL,
 	CGROUP_UDP4_RECVMSG,
 	CGROUP_UDP6_RECVMSG,
+	CGROUP_UNIX_RECVMSG,
 	CGROUP_GETSOCKOPT,
 	CGROUP_SETSOCKOPT,
 	CGROUP_INET4_GETPEERNAME,
 	CGROUP_INET6_GETPEERNAME,
+	CGROUP_UNIX_GETPEERNAME,
 	CGROUP_INET4_GETSOCKNAME,
 	CGROUP_INET6_GETSOCKNAME,
+	CGROUP_UNIX_GETSOCKNAME,
 	CGROUP_INET_SOCK_RELEASE,
 	CGROUP_LSM_START,
 	CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 31561e789715..98b8cea904fe 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -48,19 +48,24 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
 	CGROUP_ATYPE(CGROUP_INET6_BIND);
 	CGROUP_ATYPE(CGROUP_INET4_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET6_CONNECT);
+	CGROUP_ATYPE(CGROUP_UNIX_CONNECT);
 	CGROUP_ATYPE(CGROUP_INET4_POST_BIND);
 	CGROUP_ATYPE(CGROUP_INET6_POST_BIND);
 	CGROUP_ATYPE(CGROUP_UDP4_SENDMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_SENDMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_SENDMSG);
 	CGROUP_ATYPE(CGROUP_SYSCTL);
 	CGROUP_ATYPE(CGROUP_UDP4_RECVMSG);
 	CGROUP_ATYPE(CGROUP_UDP6_RECVMSG);
+	CGROUP_ATYPE(CGROUP_UNIX_RECVMSG);
 	CGROUP_ATYPE(CGROUP_GETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_SETSOCKOPT);
 	CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME);
 	CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
+	CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME);
 	CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
 	default:
 		return CGROUP_BPF_ATTACH_TYPE_INVALID;
@@ -289,18 +294,27 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx)
+
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
 
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
 
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen)		\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL)
+
 /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
  * fullsock and its parent fullsock cannot be traced by
  * sk_to_full_sk().
@@ -492,10 +506,13 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e0aa457f94a9..7ba61b75bc0e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1047,6 +1047,11 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_CGROUP_UNIX_CONNECT,
+	BPF_CGROUP_UNIX_SENDMSG,
+	BPF_CGROUP_UNIX_RECVMSG,
+	BPF_CGROUP_UNIX_GETPEERNAME,
+	BPF_CGROUP_UNIX_GETSOCKNAME,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2704,8 +2709,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **setsockopt()**.
  * 		It supports the following *level*\ s:
@@ -2943,8 +2948,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **getsockopt()**.
  * 		It supports the same set of *optname*\ s that is supported by
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ac37bd53aee0..74ad2215e1ba 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1458,7 +1458,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @flags: Pointer to u32 which contains higher bits of BPF program
  *         return value (OR'ed together).
  *
- * socket is expected to be of type INET or INET6.
+ * socket is expected to be of type INET, INET6 or UNIX.
  *
  * This function will return %-EPERM if an attached program is found and
  * returned value != 1 during execution. In all other cases, 0 is returned.
@@ -1482,7 +1482,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	/* Check socket family since not all sockets represent network
 	 * endpoint (e.g. AF_UNIX).
 	 */
-	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
+	    sk->sk_family != AF_UNIX)
 		return 0;
 
 	if (!ctx.uaddr) {
@@ -2533,10 +2534,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_SOCK_OPS:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return NULL;
 		default:
 			return &bpf_get_retval_proto;
@@ -2548,10 +2552,13 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_SOCK_OPS:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return NULL;
 		default:
 			return &bpf_set_retval_proto;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6b5280f14a53..8677837f3deb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2446,14 +2446,19 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -3678,14 +3683,19 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UNIX_CONNECT:
 	case BPF_CGROUP_INET4_GETPEERNAME:
 	case BPF_CGROUP_INET6_GETPEERNAME:
+	case BPF_CGROUP_UNIX_GETPEERNAME:
 	case BPF_CGROUP_INET4_GETSOCKNAME:
 	case BPF_CGROUP_INET6_GETSOCKNAME:
+	case BPF_CGROUP_UNIX_GETSOCKNAME:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UNIX_SENDMSG:
 	case BPF_CGROUP_UDP4_RECVMSG:
 	case BPF_CGROUP_UDP6_RECVMSG:
+	case BPF_CGROUP_UNIX_RECVMSG:
 		return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 	case BPF_CGROUP_SOCK_OPS:
 		return BPF_PROG_TYPE_SOCK_OPS;
@@ -3942,14 +3952,19 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UNIX_CONNECT:
 	case BPF_CGROUP_INET4_GETPEERNAME:
 	case BPF_CGROUP_INET6_GETPEERNAME:
+	case BPF_CGROUP_UNIX_GETPEERNAME:
 	case BPF_CGROUP_INET4_GETSOCKNAME:
 	case BPF_CGROUP_INET6_GETSOCKNAME:
+	case BPF_CGROUP_UNIX_GETSOCKNAME:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UNIX_SENDMSG:
 	case BPF_CGROUP_UDP4_RECVMSG:
 	case BPF_CGROUP_UDP6_RECVMSG:
+	case BPF_CGROUP_UNIX_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eed7350e15f4..e777f50401b6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14797,10 +14797,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
 		    env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
-		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
+		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
 			range = tnum_range(1, 1);
 		if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
diff --git a/net/core/filter.c b/net/core/filter.c
index ff0bd9f20b95..cc2e4babc85f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7875,14 +7875,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return &bpf_sock_addr_setsockopt_proto;
 		default:
 			return NULL;
@@ -7893,14 +7898,19 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UNIX_CONNECT:
 		case BPF_CGROUP_UDP4_RECVMSG:
 		case BPF_CGROUP_UDP6_RECVMSG:
+		case BPF_CGROUP_UNIX_RECVMSG:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UNIX_SENDMSG:
 		case BPF_CGROUP_INET4_GETPEERNAME:
 		case BPF_CGROUP_INET6_GETPEERNAME:
+		case BPF_CGROUP_UNIX_GETPEERNAME:
 		case BPF_CGROUP_INET4_GETSOCKNAME:
 		case BPF_CGROUP_INET6_GETSOCKNAME:
+		case BPF_CGROUP_UNIX_GETSOCKNAME:
 			return &bpf_sock_addr_getsockopt_proto;
 		default:
 			return NULL;
@@ -8948,8 +8958,8 @@ static bool sock_addr_is_valid_access(int off, int size,
 	if (off % size != 0)
 		return false;
 
-	/* Disallow access to IPv6 fields from IPv4 contex and vise
-	 * versa.
+	/* Disallow access to fields not belonging to the attach type's address
+	 * family.
 	 */
 	switch (off) {
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3e8a04a13668..e10d07c76044 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -116,6 +116,7 @@
 #include <linux/freezer.h>
 #include <linux/file.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf-cgroup.h>
 
 #include "scm.h"
 
@@ -1381,6 +1382,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 		if (err)
 			goto out;
 
+		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
+		if (err)
+			goto out;
+
 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
 		    !unix_sk(sk)->addr) {
@@ -1490,6 +1495,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	if (err)
 		goto out;
 
+	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
+	if (err)
+		goto out;
+
 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
 		err = unix_autobind(sk);
@@ -1770,6 +1779,13 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
 	} else {
 		err = addr->len;
 		memcpy(sunaddr, addr->name, addr->len);
+
+		if (peer)
+			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+					       CGROUP_UNIX_GETPEERNAME);
+		else
+			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
+					       CGROUP_UNIX_GETSOCKNAME);
 	}
 	sock_put(sk);
 out:
@@ -1922,6 +1938,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		err = unix_validate_addr(sunaddr, msg->msg_namelen);
 		if (err)
 			goto out;
+
+		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
+							    msg->msg_name,
+							    &msg->msg_namelen,
+							    NULL);
+		if (err)
+			goto out;
 	} else {
 		sunaddr = NULL;
 		err = -ENOTCONN;
@@ -2390,9 +2413,14 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
 						EPOLLOUT | EPOLLWRNORM |
 						EPOLLWRBAND);
 
-	if (msg->msg_name)
+	if (msg->msg_name) {
 		unix_copy_addr(msg, skb->sk);
 
+		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+						      msg->msg_name,
+						      &msg->msg_namelen);
+	}
+
 	if (size > skb->len - skip)
 		size = skb->len - skip;
 	else if (size < skb->len - skip)
@@ -2744,6 +2772,11 @@ unlock:
 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
 					 state->msg->msg_name);
 			unix_copy_addr(state->msg, skb->sk);
+
+			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
+							      state->msg->msg_name,
+							      &state->msg->msg_namelen);
+
 			sunaddr = NULL;
 		}
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0aa457f94a9..7ba61b75bc0e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1047,6 +1047,11 @@ enum bpf_attach_type {
 	BPF_TCX_INGRESS,
 	BPF_TCX_EGRESS,
 	BPF_TRACE_UPROBE_MULTI,
+	BPF_CGROUP_UNIX_CONNECT,
+	BPF_CGROUP_UNIX_SENDMSG,
+	BPF_CGROUP_UNIX_RECVMSG,
+	BPF_CGROUP_UNIX_GETPEERNAME,
+	BPF_CGROUP_UNIX_GETSOCKNAME,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2704,8 +2709,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **setsockopt()**.
  * 		It supports the following *level*\ s:
@@ -2943,8 +2948,8 @@ union bpf_attr {
  * 		*bpf_socket* should be one of the following:
  *
  * 		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
- * 		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
- * 		  and **BPF_CGROUP_INET6_CONNECT**.
+ *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**,
+ *		  **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**.
  *
  * 		This helper actually implements a subset of **getsockopt()**.
  * 		It supports the same set of *optname*\ s that is supported by
-- 
cgit v1.2.3


From f577cd57bfaa889cf0718e30e92c08c7f78c9d85 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 12 Jul 2023 16:10:56 +0200
Subject: sched/topology: Rename 'DIE' domain to 'PKG'

While reworking the x86 topology code Thomas tripped over creating a 'DIE' domain
for the package mask. :-)

Since these names are CONFIG_SCHED_DEBUG=y only, rename them to make the
name less ambiguous.

[ Shrikanth Hegde: rename on s390 as well. ]
[ Valentin Schneider: also rename it in the comments. ]
[ mingo: port to recent kernels & find all remaining occurances. ]

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Valentin Schneider <vschneid@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20230712141056.GI3100107@hirez.programming.kicks-ass.net
---
 arch/powerpc/kernel/smp.c   | 4 ++--
 arch/s390/kernel/topology.c | 2 +-
 arch/x86/kernel/smpboot.c   | 4 ++--
 kernel/sched/fair.c         | 2 +-
 kernel/sched/topology.c     | 8 ++++----
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5826f5108a12..4e4870031265 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1051,7 +1051,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 #endif
 	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
 	{ cpu_mc_mask, SD_INIT_NAME(MC) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
@@ -1595,7 +1595,7 @@ static void add_cpu_to_masks(int cpu)
 	/* Skip all CPUs already part of current CPU core mask */
 	cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu));
 
-	/* If chip_id is -1; limit the cpu_core_mask to within DIE*/
+	/* If chip_id is -1; limit the cpu_core_mask to within PKG */
 	if (chip_id == -1)
 		cpumask_and(mask, mask, cpu_cpu_mask(cpu));
 
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 68adf1de8888..66bda6a8f918 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -522,7 +522,7 @@ static struct sched_domain_topology_level s390_topology[] = {
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
 	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 02765d9da682..e3b3e806bf61 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -641,13 +641,13 @@ static void __init build_sched_topology(void)
 	};
 #endif
 	/*
-	 * When there is NUMA topology inside the package skip the DIE domain
+	 * When there is NUMA topology inside the package skip the PKG domain
 	 * since the NUMA domains will auto-magically create the right spanning
 	 * domains based on the SLIT.
 	 */
 	if (!x86_has_numa_in_package) {
 		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(DIE)
+			cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
 		};
 	}
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 922905194c0c..a751e552f253 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9555,7 +9555,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
  * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
  * imbalances in the number of CPUS are dealt with in find_busiest_group().
  *
- * If we are balancing load within an SMT core, or at DIE domain level, always
+ * If we are balancing load within an SMT core, or at PKG domain level, always
  * proceed.
  *
  * Return: true if @env::dst_cpu can do with asym_packing load balance. False
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d9508617f7f8..a63729f87c21 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1119,7 +1119,7 @@ fail:
  *
  *  - Simultaneous multithreading (SMT)
  *  - Multi-Core Cache (MC)
- *  - Package (DIE)
+ *  - Package (PKG)
  *
  * Where the last one more or less denotes everything up to a NUMA node.
  *
@@ -1141,13 +1141,13 @@ fail:
  *
  * CPU   0   1   2   3   4   5   6   7
  *
- * DIE  [                             ]
+ * PKG  [                             ]
  * MC   [             ] [             ]
  * SMT  [     ] [     ] [     ] [     ]
  *
  *  - or -
  *
- * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * PKG  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
  * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
  * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
  *
@@ -1681,7 +1681,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
 	{ NULL, },
 };
 
-- 
cgit v1.2.3


From f06cc667f79909e9175460b167c277b7c64d3df0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 9 Oct 2023 23:04:25 +0200
Subject: perf: Optimize perf_cgroup_switch()

Namhyung reported that bd2756811766 ("perf: Rewrite core context handling")
regresses context switch overhead when perf-cgroup is in use together
with 'slow' PMUs like uncore.

Specifically, perf_cgroup_switch()'s perf_ctx_disable() /
ctx_sched_out() etc.. all iterate the full list of active PMUs for
that CPU, even if they don't have cgroup events.

Previously there was cgrp_cpuctx_list which linked the relevant PMUs
together, but that got lost in the rework. Instead of re-instruducing
a similar list, let the perf_event_pmu_context iteration skip those
that do not have cgroup events. This avoids growing multiple versions
of the perf_event_pmu_context iteration.

Measured performance (on a slightly different patch):

Before)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.901 [sec]

        90.128700 usecs/op
            11095 ops/sec

After)

  $ taskset -c 0 ./perf bench sched pipe -l 10000 -G AAA,BBB
  # Running 'sched/pipe' benchmark:
  # Executed 10000 pipe operations between two processes

       Total time: 0.065 [sec]

         6.560100 usecs/op
           152436 ops/sec

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Debugged-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231009210425.GC6307@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h |   1 +
 kernel/events/core.c       | 115 +++++++++++++++++++++++----------------------
 2 files changed, 61 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f31f962a6445..0367d748fae0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -878,6 +878,7 @@ struct perf_event_pmu_context {
 	unsigned int			embedded : 1;
 
 	unsigned int			nr_events;
+	unsigned int			nr_cgroups;
 
 	atomic_t			refcount; /* event <-> epc */
 	struct rcu_head			rcu_head;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 708d474c2ede..3eb26c2c6e65 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -375,6 +375,7 @@ enum event_type_t {
 	EVENT_TIME = 0x4,
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
+	EVENT_CGROUP = 0x10,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -684,20 +685,26 @@ do {									\
 	___p;								\
 })
 
-static void perf_ctx_disable(struct perf_event_context *ctx)
+static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		perf_pmu_disable(pmu_ctx->pmu);
+	}
 }
 
-static void perf_ctx_enable(struct perf_event_context *ctx)
+static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		perf_pmu_enable(pmu_ctx->pmu);
+	}
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
@@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
 		return;
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-	perf_ctx_disable(&cpuctx->ctx);
+	perf_ctx_disable(&cpuctx->ctx, true);
 
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 	/*
 	 * must not be done before ctxswout due
 	 * to update_cgrp_time_from_cpuctx() in
@@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
 	 * perf_cgroup_set_timestamp() in ctx_sched_in()
 	 * to not have to pass task around
 	 */
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
+	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
 
-	perf_ctx_enable(&cpuctx->ctx);
+	perf_ctx_enable(&cpuctx->ctx, true);
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
@@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
 	if (!is_cgroup_event(event))
 		return;
 
+	event->pmu_ctx->nr_cgroups++;
+
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * @ctx == &cpuctx->ctx.
@@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
 	if (!is_cgroup_event(event))
 		return;
 
+	event->pmu_ctx->nr_cgroups--;
+
 	/*
 	 * Because cgroup events are always per-cpu events,
 	 * @ctx == &cpuctx->ctx.
@@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	event_type &= EVENT_ALL;
 
-	perf_ctx_disable(&cpuctx->ctx);
+	perf_ctx_disable(&cpuctx->ctx, false);
 	if (task_ctx) {
-		perf_ctx_disable(task_ctx);
+		perf_ctx_disable(task_ctx, false);
 		task_ctx_sched_out(task_ctx, event_type);
 	}
 
@@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
 	perf_event_sched_in(cpuctx, task_ctx);
 
-	perf_ctx_enable(&cpuctx->ctx);
+	perf_ctx_enable(&cpuctx->ctx, false);
 	if (task_ctx)
-		perf_ctx_enable(task_ctx);
+		perf_ctx_enable(task_ctx, false);
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	struct perf_event_pmu_context *pmu_ctx;
 	int is_active = ctx->is_active;
+	bool cgroup = event_type & EVENT_CGROUP;
+
+	event_type &= ~EVENT_CGROUP;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 
 	is_active ^= ctx->is_active; /* changed bits */
 
-	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
 		__pmu_ctx_sched_out(pmu_ctx, is_active);
+	}
 }
 
 /*
@@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
 
-			perf_ctx_disable(ctx);
+			perf_ctx_disable(ctx, false);
 
 			/* PMIs are disabled; ctx->nr_pending is stable. */
 			if (local_read(&ctx->nr_pending) ||
@@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
 			perf_ctx_sched_task_cb(ctx, false);
 			perf_event_swap_task_ctx_data(ctx, next_ctx);
 
-			perf_ctx_enable(ctx);
+			perf_ctx_enable(ctx, false);
 
 			/*
 			 * RCU_INIT_POINTER here is safe because we've not
@@ -3526,13 +3543,13 @@ unlock:
 
 	if (do_switch) {
 		raw_spin_lock(&ctx->lock);
-		perf_ctx_disable(ctx);
+		perf_ctx_disable(ctx, false);
 
 inside_switch:
 		perf_ctx_sched_task_cb(ctx, false);
 		task_ctx_sched_out(ctx, EVENT_ALL);
 
-		perf_ctx_enable(ctx);
+		perf_ctx_enable(ctx, false);
 		raw_spin_unlock(&ctx->lock);
 	}
 }
@@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	return 0;
 }
 
-static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void pmu_groups_sched_in(struct perf_event_context *ctx,
+				struct perf_event_groups *groups,
+				struct pmu *pmu)
 {
-	struct perf_event_pmu_context *pmu_ctx;
 	int can_add_hw = 1;
-
-	if (pmu) {
-		visit_groups_merge(ctx, &ctx->pinned_groups,
-				   smp_processor_id(), pmu,
-				   merge_sched_in, &can_add_hw);
-	} else {
-		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-			can_add_hw = 1;
-			visit_groups_merge(ctx, &ctx->pinned_groups,
-					   smp_processor_id(), pmu_ctx->pmu,
-					   merge_sched_in, &can_add_hw);
-		}
-	}
+	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
+			   merge_sched_in, &can_add_hw);
 }
 
-static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void ctx_groups_sched_in(struct perf_event_context *ctx,
+				struct perf_event_groups *groups,
+				bool cgroup)
 {
 	struct perf_event_pmu_context *pmu_ctx;
-	int can_add_hw = 1;
 
-	if (pmu) {
-		visit_groups_merge(ctx, &ctx->flexible_groups,
-				   smp_processor_id(), pmu,
-				   merge_sched_in, &can_add_hw);
-	} else {
-		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-			can_add_hw = 1;
-			visit_groups_merge(ctx, &ctx->flexible_groups,
-					   smp_processor_id(), pmu_ctx->pmu,
-					   merge_sched_in, &can_add_hw);
-		}
+	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+		if (cgroup && !pmu_ctx->nr_cgroups)
+			continue;
+		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
 	}
 }
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
+			       struct pmu *pmu)
 {
-	ctx_flexible_sched_in(ctx, pmu);
+	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
 }
 
 static void
@@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 	int is_active = ctx->is_active;
+	bool cgroup = event_type & EVENT_CGROUP;
+
+	event_type &= ~EVENT_CGROUP;
 
 	lockdep_assert_held(&ctx->lock);
 
@@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 	 * in order to give them the best chance of going on.
 	 */
 	if (is_active & EVENT_PINNED)
-		ctx_pinned_sched_in(ctx, NULL);
+		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
 
 	/* Then walk through the lower prio flexible groups */
 	if (is_active & EVENT_FLEXIBLE)
-		ctx_flexible_sched_in(ctx, NULL);
+		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
 
 	if (cpuctx->task_ctx == ctx) {
 		perf_ctx_lock(cpuctx, ctx);
-		perf_ctx_disable(ctx);
+		perf_ctx_disable(ctx, false);
 
 		perf_ctx_sched_task_cb(ctx, true);
 
-		perf_ctx_enable(ctx);
+		perf_ctx_enable(ctx, false);
 		perf_ctx_unlock(cpuctx, ctx);
 		goto rcu_unlock;
 	}
@@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	if (!ctx->nr_events)
 		goto unlock;
 
-	perf_ctx_disable(ctx);
+	perf_ctx_disable(ctx, false);
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	 * events, no need to flip the cpuctx's events around.
 	 */
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
-		perf_ctx_disable(&cpuctx->ctx);
+		perf_ctx_disable(&cpuctx->ctx, false);
 		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
 	}
 
@@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
 	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
-		perf_ctx_enable(&cpuctx->ctx);
+		perf_ctx_enable(&cpuctx->ctx, false);
 
-	perf_ctx_enable(ctx);
+	perf_ctx_enable(ctx, false);
 
 unlock:
 	perf_ctx_unlock(cpuctx, ctx);
-- 
cgit v1.2.3


From 7b42f401fc6571b6604441789d892d440829e33c Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Wed, 11 Oct 2023 16:27:59 +0800
Subject: workqueue: Use the kmem_cache_free() instead of kfree() to release
 pwq

Currently, the kfree() be used for pwq objects allocated with
kmem_cache_alloc() in alloc_and_link_pwqs(), this isn't wrong.
but usually, use "trace_kmem_cache_alloc/trace_kmem_cache_free"
to track memory allocation and free. this commit therefore use
kmem_cache_free() instead of kfree() in alloc_and_link_pwqs()
and also consistent with release of the pwq in rcu_free_pwq().

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ebe24a5e1435..6f74cab2bd5a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4610,8 +4610,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 
 enomem:
 	if (wq->cpu_pwq) {
-		for_each_possible_cpu(cpu)
-			kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
+		for_each_possible_cpu(cpu) {
+			struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+
+			if (pwq)
+				kmem_cache_free(pwq_cache, pwq);
+		}
 		free_percpu(wq->cpu_pwq);
 		wq->cpu_pwq = NULL;
 	}
-- 
cgit v1.2.3


From ac8b60be078abebc3ab8836f3f0ecac6980e0b4f Mon Sep 17 00:00:00 2001
From: Lucy Mielke <lucymielke@icloud.com>
Date: Thu, 12 Oct 2023 12:44:32 +0200
Subject: locking/lockdep: Fix string sizing bug that triggers a
 format-truncation compiler-warning

On an allyesconfig, with "treat warnings as errors" unset, GCC emits
these warnings:

	kernel/locking/lockdep_proc.c:438:32: Warning: Format specifier '%lld' may
		be truncated when writing 1 to 17 bytes into a region
		of size 15 [-Wformat-truncation=]

	kernel/locking/lockdep_proc.c:438:31: Note: Format directive argument is
		in the range [-9223372036854775, 9223372036854775]

	kernel/locking/lockdep_proc.c:438:9: Note: 'snprintf' has output
		between 5 and 22 bytes into a target of size 15

In seq_time(), the longest s64 is "-9223372036854775808"-ish, which
converted to the fixed-point float format is "-9223372036854775.80": 21 bytes,
plus termination is another byte: 22. Therefore, a larger buffer size
of 22 is needed here - not 15. The code was safe due to the snprintf().

Fix it.

Signed-off-by: Lucy Mielke <lucymielke@icloud.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/ZSfOEHRkZAWaQr3U@fedora.fritz.box
---
 kernel/locking/lockdep_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 15fdc7fa5c68..e2bfb1db589d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 
 static void seq_time(struct seq_file *m, s64 time)
 {
-	char num[15];
+	char num[22];
 
 	snprint_time(num, sizeof(num), time);
 	seq_printf(m, " %14s", num);
-- 
cgit v1.2.3


From ca10d851b9ad0338c19e8e3089e24d565ebfffd7 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 10 Oct 2023 22:48:42 -0400
Subject: workqueue: Override implicit ordered attribute in
 workqueue_apply_unbound_cpumask()

Commit 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1
to be ordered") enabled implicit ordered attribute to be added to
WQ_UNBOUND workqueues with max_active of 1. This prevented the changing
of attributes to these workqueues leading to fix commit 0a94efb5acbb
("workqueue: implicit ordered attribute should be overridable").

However, workqueue_apply_unbound_cpumask() was not updated at that time.
So sysfs changes to wq_unbound_cpumask has no effect on WQ_UNBOUND
workqueues with implicit ordered attribute. Since not all WQ_UNBOUND
workqueues are visible on sysfs, we are not able to make all the
necessary cpumask changes even if we iterates all the workqueue cpumasks
in sysfs and changing them one by one.

Fix this problem by applying the corresponding change made
to apply_workqueue_attrs_locked() in the fix commit to
workqueue_apply_unbound_cpumask().

Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered")
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6f74cab2bd5a..51177ffe16f1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5792,9 +5792,13 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
 	list_for_each_entry(wq, &workqueues, list) {
 		if (!(wq->flags & WQ_UNBOUND))
 			continue;
+
 		/* creating multiple pwqs breaks ordering guarantee */
-		if (wq->flags & __WQ_ORDERED)
-			continue;
+		if (!list_empty(&wq->pwqs)) {
+			if (wq->flags & __WQ_ORDERED_EXPLICIT)
+				continue;
+			wq->flags &= ~__WQ_ORDERED;
+		}
 
 		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
 		if (IS_ERR(ctx)) {
-- 
cgit v1.2.3


From 5d9c7a1e3e8e18db8e10c546de648cda2a57be52 Mon Sep 17 00:00:00 2001
From: Lucy Mielke <lucymielke@icloud.com>
Date: Mon, 9 Oct 2023 19:09:46 +0200
Subject: workqueue: fix -Wformat-truncation in create_worker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling with W=1 emitted the following warning
(Compiler: gcc (x86-64, ver. 13.2.1, .config: result of make allyesconfig,
"Treat warnings as errors" turned off):

kernel/workqueue.c:2188:54: warning: ‘%d’ directive output may be
	truncated writing between 1 and 10 bytes into a region of size
	between 5 and 14 [-Wformat-truncation=]
kernel/workqueue.c:2188:50: note: directive argument in the range
	[0, 2147483647]
kernel/workqueue.c:2188:17: note: ‘snprintf’ output between 4 and 23 bytes
	into a destination of size 16

setting "id_buf" to size 23 will silence the warning, since GCC
determines snprintf's output to be max. 23 bytes in line 2188.

Please let me know if there are any mistakes in my patch!

Signed-off-by: Lucy Mielke <lucymielke@icloud.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 51177ffe16f1..a3522b70218d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2166,7 +2166,7 @@ static struct worker *create_worker(struct worker_pool *pool)
 {
 	struct worker *worker;
 	int id;
-	char id_buf[16];
+	char id_buf[23];
 
 	/* ID is needed to determine kthread name */
 	id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
-- 
cgit v1.2.3


From 0c2924079f5a83ed715630680e338b3685a0bf7d Mon Sep 17 00:00:00 2001
From: Haifeng Xu <haifeng.xu@shopee.com>
Date: Tue, 26 Sep 2023 11:57:22 +0000
Subject: sched/psi: Bail out early from irq time accounting

We could bail out early when psi was disabled.

Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20230926115722.467833-1-haifeng.xu@shopee.com
---
 kernel/sched/psi.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 44a78774ae87..519bc922a960 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -998,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 	struct psi_group_cpu *groupc;
 	u64 now;
 
+	if (static_branch_likely(&psi_disabled))
+		return;
+
 	if (!task->pid)
 		return;
 
-- 
cgit v1.2.3


From f0498d2a54e7966ce23cd7c7ff42c64fa0059b07 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 Oct 2023 20:57:39 +0200
Subject: sched: Fix stop_one_cpu_nowait() vs hotplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kuyo reported sporadic failures on a sched_setaffinity() vs CPU
hotplug stress-test -- notably affine_move_task() remains stuck in
wait_for_completion(), leading to a hung-task detector warning.

Specifically, it was reported that stop_one_cpu_nowait(.fn =
migration_cpu_stop) returns false -- this stopper is responsible for
the matching complete().

The race scenario is:

	CPU0					CPU1

					// doing _cpu_down()

  __set_cpus_allowed_ptr()
    task_rq_lock();
					takedown_cpu()
					  stop_machine_cpuslocked(take_cpu_down..)

					<PREEMPT: cpu_stopper_thread()
					  MULTI_STOP_PREPARE
					  ...
    __set_cpus_allowed_ptr_locked()
      affine_move_task()
        task_rq_unlock();

  <PREEMPT: cpu_stopper_thread()\>
    ack_state()
					  MULTI_STOP_RUN
					    take_cpu_down()
					      __cpu_disable();
					      stop_machine_park();
						stopper->enabled = false;
					 />
   />
	stop_one_cpu_nowait(.fn = migration_cpu_stop);
          if (stopper->enabled) // false!!!

That is, by doing stop_one_cpu_nowait() after dropping rq-lock, the
stopper thread gets a chance to preempt and allows the cpu-down for
the target CPU to complete.

OTOH, since stop_one_cpu_nowait() / cpu_stop_queue_work() needs to
issue a wakeup, it must not be ran under the scheduler locks.

Solve this apparent contradiction by keeping preemption disabled over
the unlock + queue_stopper combination:

	preempt_disable();
	task_rq_unlock(...);
	if (!stop_pending)
	  stop_one_cpu_nowait(...)
	preempt_enable();

This respects the lock ordering contraints while still avoiding the
above race. That is, if we find the CPU is online under rq-lock, the
targeted stop_one_cpu_nowait() must succeed.

Apply this pattern to all similar stop_one_cpu_nowait() invocations.

Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
Reported-by: "Kuyo Chang (張建文)" <Kuyo.Chang@mediatek.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Kuyo Chang (張建文)" <Kuyo.Chang@mediatek.com>
Link: https://lkml.kernel.org/r/20231010200442.GA16515@noisy.programming.kicks-ass.net
---
 kernel/sched/core.c     | 10 ++++++++--
 kernel/sched/deadline.c |  2 ++
 kernel/sched/fair.c     |  4 +++-
 kernel/sched/rt.c       |  4 ++++
 4 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a3f9cd52eec5..264c2eb380d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2645,9 +2645,11 @@ static int migration_cpu_stop(void *data)
 		 * it.
 		 */
 		WARN_ON_ONCE(!pending->stop_pending);
+		preempt_disable();
 		task_rq_unlock(rq, p, &rf);
 		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
 				    &pending->arg, &pending->stop_work);
+		preempt_enable();
 		return 0;
 	}
 out:
@@ -2967,12 +2969,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 			complete = true;
 		}
 
+		preempt_disable();
 		task_rq_unlock(rq, p, rf);
-
 		if (push_task) {
 			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
 					    p, &rq->push_work);
 		}
+		preempt_enable();
 
 		if (complete)
 			complete_all(&pending->done);
@@ -3038,12 +3041,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 		if (flags & SCA_MIGRATE_ENABLE)
 			p->migration_flags &= ~MDF_PUSH;
 
+		preempt_disable();
 		task_rq_unlock(rq, p, rf);
-
 		if (!stop_pending) {
 			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
 					    &pending->arg, &pending->stop_work);
 		}
+		preempt_enable();
 
 		if (flags & SCA_MIGRATE_ENABLE)
 			return 0;
@@ -9421,9 +9425,11 @@ static void balance_push(struct rq *rq)
 	 * Temporarily drop rq->lock such that we can wake-up the stop task.
 	 * Both preemption and IRQs are still disabled.
 	 */
+	preempt_disable();
 	raw_spin_rq_unlock(rq);
 	stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
 			    this_cpu_ptr(&push_work));
+	preempt_enable();
 	/*
 	 * At this point need_resched() is true and we'll take the loop in
 	 * schedule(). The next pick is obviously going to be the stop task
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7039a8d5ae9b..b28114478b82 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2420,9 +2420,11 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 
 		if (push_task) {
+			preempt_disable();
 			raw_spin_rq_unlock(this_rq);
 			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
 					    push_task, &src_rq->push_work);
+			preempt_enable();
 			raw_spin_rq_lock(this_rq);
 		}
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a751e552f253..38d757c35004 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11254,13 +11254,15 @@ more_balance:
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
 			}
-			raw_spin_rq_unlock_irqrestore(busiest, flags);
 
+			preempt_disable();
+			raw_spin_rq_unlock_irqrestore(busiest, flags);
 			if (active_balance) {
 				stop_one_cpu_nowait(cpu_of(busiest),
 					active_load_balance_cpu_stop, busiest,
 					&busiest->active_balance_work);
 			}
+			preempt_enable();
 		}
 	} else {
 		sd->nr_balance_failed = 0;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e93b69ef919b..6aaf0a3d6081 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2063,9 +2063,11 @@ retry:
 		 */
 		push_task = get_push_task(rq);
 		if (push_task) {
+			preempt_disable();
 			raw_spin_rq_unlock(rq);
 			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
 					    push_task, &rq->push_work);
+			preempt_enable();
 			raw_spin_rq_lock(rq);
 		}
 
@@ -2402,9 +2404,11 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 
 		if (push_task) {
+			preempt_disable();
 			raw_spin_rq_unlock(this_rq);
 			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
 					    push_task, &src_rq->push_work);
+			preempt_enable();
 			raw_spin_rq_lock(this_rq);
 		}
 	}
-- 
cgit v1.2.3


From 1b8a955dd338dfbf39831d4687c25263e885a9cb Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 12 Oct 2023 08:58:24 -0400
Subject: sched: Make PELT acronym definition searchable

The PELT acronym definition can be found right at the top of
kernel/sched/pelt.c (of course), but it cannot be found through use of

grep -r PELT kernel/sched/

Add the acronym "(PELT)" after "Per Entity Load Tracking" at the top of
the source file.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231012125824.1260774-1-mathieu.desnoyers@efficios.com
---
 kernel/sched/pelt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 0f310768260c..63b6cf898220 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Per Entity Load Tracking
+ * Per Entity Load Tracking (PELT)
  *
  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
-- 
cgit v1.2.3


From 021a8ca2ba23c01487a98ad23b68ac062e14cf32 Mon Sep 17 00:00:00 2001
From: Keguang Zhang <keguang.zhang@gmail.com>
Date: Mon, 25 Sep 2023 20:17:34 +0800
Subject: genirq/generic-chip: Fix the irq_chip name for /proc/interrupts

irq_init_generic_chip() only sets the name for the first chip type, which
leads to empty names for other chip types.  Eventually, these names will be
shown as "-" /proc/interrupts.

Set the name for all chip types by default.

Signed-off-by: Keguang Zhang <keguang.zhang@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230925121734.93017-1-keguang.zhang@gmail.com
---
 kernel/irq/generic-chip.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c653cd31548d..81ecca08caad 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -219,11 +219,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
 			   int num_ct, unsigned int irq_base,
 			   void __iomem *reg_base, irq_flow_handler_t handler)
 {
+	struct irq_chip_type *ct = gc->chip_types;
+	int i;
+
 	raw_spin_lock_init(&gc->lock);
 	gc->num_ct = num_ct;
 	gc->irq_base = irq_base;
 	gc->reg_base = reg_base;
-	gc->chip_types->chip.name = name;
+	for (i = 0; i < num_ct; i++)
+		ct[i].chip.name = name;
 	gc->chip_types->handler = handler;
 }
 
-- 
cgit v1.2.3


From 8a77f38bcd28d3c22ab7dd8eff3f299d43c00411 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 4 Oct 2023 01:29:00 +0200
Subject: srcu: Only accelerate on enqueue time

Acceleration in SRCU happens on enqueue time for each new callback. This
operation is expected not to fail and therefore any similar attempt
from other places shouldn't find any remaining callbacks to accelerate.

Moreover accelerations performed beyond enqueue time are error prone
because rcu_seq_snap() then may return the snapshot for a new grace
period that is not going to be started.

Remove these dangerous and needless accelerations and introduce instead
assertions reporting leaking unaccelerated callbacks beyond enqueue
time.

Co-developed-by: Yong He <alexyonghe@tencent.com>
Signed-off-by: Yong He <alexyonghe@tencent.com>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
Reviewed-by: Like Xu <likexu@tencent.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/srcutree.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9fab9ac36996..560e99ec5333 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -784,8 +784,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
 	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
-	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
+	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
 	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
 	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
 	WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
@@ -1721,6 +1720,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	ssp = sdp->ssp;
 	rcu_cblist_init(&ready_cbs);
 	spin_lock_irq_rcu_node(sdp);
+	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 	if (sdp->srcu_cblist_invoking ||
@@ -1750,8 +1750,6 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	 */
 	spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
-	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
-				       rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
 	sdp->srcu_cblist_invoking = false;
 	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
 	spin_unlock_irq_rcu_node(sdp);
-- 
cgit v1.2.3


From 03adc61edad49e1bbecfb53f7ea5d78f398fe368 Mon Sep 17 00:00:00 2001
From: Dan Clash <daclash@linux.microsoft.com>
Date: Thu, 12 Oct 2023 14:55:18 -0700
Subject: audit,io_uring: io_uring openat triggers audit reference count
 underflow

An io_uring openat operation can update an audit reference count
from multiple threads resulting in the call trace below.

A call to io_uring_submit() with a single openat op with a flag of
IOSQE_ASYNC results in the following reference count updates.

These first part of the system call performs two increments that do not race.

do_syscall_64()
  __do_sys_io_uring_enter()
    io_submit_sqes()
      io_openat_prep()
        __io_openat_prep()
          getname()
            getname_flags()       /* update 1 (increment) */
              __audit_getname()   /* update 2 (increment) */

The openat op is queued to an io_uring worker thread which starts the
opportunity for a race.  The system call exit performs one decrement.

do_syscall_64()
  syscall_exit_to_user_mode()
    syscall_exit_to_user_mode_prepare()
      __audit_syscall_exit()
        audit_reset_context()
           putname()              /* update 3 (decrement) */

The io_uring worker thread performs one increment and two decrements.
These updates can race with the system call decrement.

io_wqe_worker()
  io_worker_handle_work()
    io_wq_submit_work()
      io_issue_sqe()
        io_openat()
          io_openat2()
            do_filp_open()
              path_openat()
                __audit_inode()   /* update 4 (increment) */
            putname()             /* update 5 (decrement) */
        __audit_uring_exit()
          audit_reset_context()
            putname()             /* update 6 (decrement) */

The fix is to change the refcnt member of struct audit_names
from int to atomic_t.

kernel BUG at fs/namei.c:262!
Call Trace:
...
 ? putname+0x68/0x70
 audit_reset_context.part.0.constprop.0+0xe1/0x300
 __audit_uring_exit+0xda/0x1c0
 io_issue_sqe+0x1f3/0x450
 ? lock_timer_base+0x3b/0xd0
 io_wq_submit_work+0x8d/0x2b0
 ? __try_to_del_timer_sync+0x67/0xa0
 io_worker_handle_work+0x17c/0x2b0
 io_wqe_worker+0x10a/0x350

Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Signed-off-by: Dan Clash <daclash@linux.microsoft.com>
Link: https://lore.kernel.org/r/20231012215518.GA4048@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c         | 9 +++++----
 include/linux/fs.h | 2 +-
 kernel/auditsc.c   | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..94565bd7e73f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
 		}
 	}
 
-	result->refcnt = 1;
+	atomic_set(&result->refcnt, 1);
 	/* The empty path is special. */
 	if (unlikely(!len)) {
 		if (empty)
@@ -249,7 +249,7 @@ getname_kernel(const char * filename)
 	memcpy((char *)result->name, filename, len);
 	result->uptr = NULL;
 	result->aname = NULL;
-	result->refcnt = 1;
+	atomic_set(&result->refcnt, 1);
 	audit_getname(result);
 
 	return result;
@@ -261,9 +261,10 @@ void putname(struct filename *name)
 	if (IS_ERR(name))
 		return;
 
-	BUG_ON(name->refcnt <= 0);
+	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
+		return;
 
-	if (--name->refcnt > 0)
+	if (!atomic_dec_and_test(&name->refcnt))
 		return;
 
 	if (name->name != name->iname) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b528f063e8ff..4a40823c3c67 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2403,7 +2403,7 @@ struct audit_names;
 struct filename {
 	const char		*name;	/* pointer to actual string */
 	const __user char	*uptr;	/* original userland pointer */
-	int			refcnt;
+	atomic_t		refcnt;
 	struct audit_names	*aname;
 	const char		iname[];
 };
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21d2fa815e78..6f0d6fb6523f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2212,7 +2212,7 @@ __audit_reusename(const __user char *uptr)
 		if (!n->name)
 			continue;
 		if (n->name->uptr == uptr) {
-			n->name->refcnt++;
+			atomic_inc(&n->name->refcnt);
 			return n->name;
 		}
 	}
@@ -2241,7 +2241,7 @@ void __audit_getname(struct filename *name)
 	n->name = name;
 	n->name_len = AUDIT_NAME_FULL;
 	name->aname = n;
-	name->refcnt++;
+	atomic_inc(&name->refcnt);
 }
 
 static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2373,7 +2373,7 @@ out_alloc:
 		return;
 	if (name) {
 		n->name = name;
-		name->refcnt++;
+		atomic_inc(&name->refcnt);
 	}
 
 out:
@@ -2500,7 +2500,7 @@ void __audit_inode_child(struct inode *parent,
 		if (found_parent) {
 			found_child->name = found_parent->name;
 			found_child->name_len = AUDIT_NAME_FULL;
-			found_child->name->refcnt++;
+			atomic_inc(&found_child->name->refcnt);
 		}
 	}
 
-- 
cgit v1.2.3


From ba8ea72388a192c10f1ee5f5a4a32332e7cced76 Mon Sep 17 00:00:00 2001
From: Artem Savkov <asavkov@redhat.com>
Date: Fri, 13 Oct 2023 07:42:19 +0200
Subject: bpf: Change syscall_nr type to int in struct syscall_tp_t

linux-rt-devel tree contains a patch (b1773eac3f29c ("sched: Add support
for lazy preemption")) that adds an extra member to struct trace_entry.
This causes the offset of args field in struct trace_event_raw_sys_enter
be different from the one in struct syscall_trace_enter:

struct trace_event_raw_sys_enter {
        struct trace_entry         ent;                  /*     0    12 */

        /* XXX last struct has 3 bytes of padding */
        /* XXX 4 bytes hole, try to pack */

        long int                   id;                   /*    16     8 */
        long unsigned int          args[6];              /*    24    48 */
        /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
        char                       __data[];             /*    72     0 */

        /* size: 72, cachelines: 2, members: 4 */
        /* sum members: 68, holes: 1, sum holes: 4 */
        /* paddings: 1, sum paddings: 3 */
        /* last cacheline: 8 bytes */
};

struct syscall_trace_enter {
        struct trace_entry         ent;                  /*     0    12 */

        /* XXX last struct has 3 bytes of padding */

        int                        nr;                   /*    12     4 */
        long unsigned int          args[];               /*    16     0 */

        /* size: 16, cachelines: 1, members: 3 */
        /* paddings: 1, sum paddings: 3 */
        /* last cacheline: 16 bytes */
};

This, in turn, causes perf_event_set_bpf_prog() fail while running bpf
test_profiler testcase because max_ctx_offset is calculated based on the
former struct, while off on the latter:

  10488         if (is_tracepoint || is_syscall_tp) {
  10489                 int off = trace_event_get_offsets(event->tp_event);
  10490
  10491                 if (prog->aux->max_ctx_offset > off)
  10492                         return -EACCES;
  10493         }

What bpf program is actually getting is a pointer to struct
syscall_tp_t, defined in kernel/trace/trace_syscalls.c. This patch fixes
the problem by aligning struct syscall_tp_t with struct
syscall_trace_(enter|exit) and changing the tests to use these structs
to dereference context.

Signed-off-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/bpf/20231013054219.172920-1-asavkov@redhat.com
---
 kernel/trace/trace_syscalls.c                    | 4 ++--
 tools/testing/selftests/bpf/progs/profiler.inc.h | 2 +-
 tools/testing/selftests/bpf/progs/test_vmlinux.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index de753403cdaf..9c581d6da843 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -556,7 +556,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
-		unsigned long syscall_nr;
+		int syscall_nr;
 		unsigned long args[SYSCALL_DEFINE_MAXARGS];
 	} __aligned(8) param;
 	int i;
@@ -661,7 +661,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
-		unsigned long syscall_nr;
+		int syscall_nr;
 		unsigned long ret;
 	} __aligned(8) param;
 
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index f799d87e8700..897061930cb7 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -609,7 +609,7 @@ out:
 }
 
 SEC("tracepoint/syscalls/sys_enter_kill")
-int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
+int tracepoint__syscalls__sys_enter_kill(struct syscall_trace_enter* ctx)
 {
 	struct bpf_func_stats_ctx stats_ctx;
 
diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c
index 4b8e37f7fd06..78b23934d9f8 100644
--- a/tools/testing/selftests/bpf/progs/test_vmlinux.c
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -16,12 +16,12 @@ bool kprobe_called = false;
 bool fentry_called = false;
 
 SEC("tp/syscalls/sys_enter_nanosleep")
-int handle__tp(struct trace_event_raw_sys_enter *args)
+int handle__tp(struct syscall_trace_enter *args)
 {
 	struct __kernel_timespec *ts;
 	long tv_nsec;
 
-	if (args->id != __NR_nanosleep)
+	if (args->nr != __NR_nanosleep)
 		return 0;
 
 	ts = (void *)args->args[0];
-- 
cgit v1.2.3


From f10ca5da5bd71e5cefed7995e75a7c873ce3816e Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 13 Oct 2023 13:44:22 -0700
Subject: bpf: Don't explicitly emit BTF for struct btf_iter_num

Commit 6018e1f407cc ("bpf: implement numbers iterator") added the
BTF_TYPE_EMIT line that this patch is modifying. The struct btf_iter_num
doesn't exist, so only a forward declaration is emitted in BTF:

  FWD 'btf_iter_num' fwd_kind=struct

That commit was probably hoping to ensure that struct bpf_iter_num is
emitted in vmlinux BTF. A previous version of this patch changed the
line to emit the correct type, but Yonghong confirmed that it would
definitely be emitted regardless in [0], so this patch simply removes
the line.

This isn't marked "Fixes" because the extraneous btf_iter_num FWD wasn't
causing any issues that I noticed, aside from mild confusion when I
looked through the code.

  [0]: https://lore.kernel.org/bpf/25d08207-43e6-36a8-5e0f-47a913d4cda5@linux.dev/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231013204426.1074286-2-davemarchevsky@fb.com
---
 kernel/bpf/bpf_iter.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 96856f130cbf..833faa04461b 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -793,8 +793,6 @@ __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
 	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
 	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
 
-	BTF_TYPE_EMIT(struct btf_iter_num);
-
 	/* start == end is legit, it's an empty range and we'll just get NULL
 	 * on first (and any subsequent) bpf_iter_num_next() call
 	 */
-- 
cgit v1.2.3


From 4ac4546821584736798aaa9e97da9f6eaf689ea3 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 13 Oct 2023 13:44:24 -0700
Subject: bpf: Introduce task_vma open-coded iterator kfuncs

This patch adds kfuncs bpf_iter_task_vma_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_task_vma in open-coded
iterator style. BPF programs can use these kfuncs directly or through
bpf_for_each macro for natural-looking iteration of all task vmas.

The implementation borrows heavily from bpf_find_vma helper's locking -
differing only in that it holds the mmap_read lock for all iterations
while the helper only executes its provided callback on a maximum of 1
vma. Aside from locking, struct vma_iterator and vma_next do all the
heavy lifting.

A pointer to an inner data struct, struct bpf_iter_task_vma_data, is the
only field in struct bpf_iter_task_vma. This is because the inner data
struct contains a struct vma_iterator (not ptr), whose size is likely to
change under us. If bpf_iter_task_vma_kern contained vma_iterator directly
such a change would require change in opaque bpf_iter_task_vma struct's
size. So better to allocate vma_iterator using BPF allocator, and since
that alloc must already succeed, might as well allocate all iter fields,
thereby freezing struct bpf_iter_task_vma size.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231013204426.1074286-4-davemarchevsky@fb.com
---
 kernel/bpf/helpers.c   |  3 ++
 kernel/bpf/task_iter.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index d2840dd5b00d..62a53ebfedf9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2552,6 +2552,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 7473068ed313..fef17628341f 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -7,7 +7,9 @@
 #include <linux/fs.h>
 #include <linux/fdtable.h>
 #include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
 #include <linux/btf_ids.h>
+#include <linux/mm_types.h>
 #include "mmap_unlock_work.h"
 
 static const char * const iter_task_type_names[] = {
@@ -803,6 +805,95 @@ const struct bpf_func_proto bpf_find_vma_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_iter_task_vma_kern_data {
+	struct task_struct *task;
+	struct mm_struct *mm;
+	struct mmap_unlock_irq_work *work;
+	struct vma_iterator vmi;
+};
+
+struct bpf_iter_task_vma {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+/* Non-opaque version of bpf_iter_task_vma */
+struct bpf_iter_task_vma_kern {
+	struct bpf_iter_task_vma_kern_data *data;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
+				      struct task_struct *task, u64 addr)
+{
+	struct bpf_iter_task_vma_kern *kit = (void *)it;
+	bool irq_work_busy = false;
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+
+	/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
+	 * before, so non-NULL kit->data doesn't point to previously
+	 * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
+	 */
+	kit->data = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_iter_task_vma_kern_data));
+	if (!kit->data)
+		return -ENOMEM;
+
+	kit->data->task = get_task_struct(task);
+	kit->data->mm = task->mm;
+	if (!kit->data->mm) {
+		err = -ENOENT;
+		goto err_cleanup_iter;
+	}
+
+	/* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+	irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
+	if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+		err = -EBUSY;
+		goto err_cleanup_iter;
+	}
+
+	vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+	return 0;
+
+err_cleanup_iter:
+	if (kit->data->task)
+		put_task_struct(kit->data->task);
+	bpf_mem_free(&bpf_global_ma, kit->data);
+	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
+	kit->data = NULL;
+	return err;
+}
+
+__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
+{
+	struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+	if (!kit->data) /* bpf_iter_task_vma_new failed */
+		return NULL;
+	return vma_next(&kit->data->vmi);
+}
+
+__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
+{
+	struct bpf_iter_task_vma_kern *kit = (void *)it;
+
+	if (kit->data) {
+		bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+		put_task_struct(kit->data->task);
+		bpf_mem_free(&bpf_global_ma, kit->data);
+	}
+}
+
+__diag_pop();
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
-- 
cgit v1.2.3


From dd712d3d45807db9fcae28a522deee85c1f2fde6 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 22 Aug 2023 13:19:46 -0700
Subject: kgdb: Flush console before entering kgdb on panic

When entering kdb/kgdb on a kernel panic, it was be observed that the
console isn't flushed before the `kdb` prompt came up. Specifically,
when using the buddy lockup detector on arm64 and running:
  echo HARDLOCKUP > /sys/kernel/debug/provoke-crash/DIRECT

I could see:
  [   26.161099] lkdtm: Performing direct entry HARDLOCKUP
  [   32.499881] watchdog: Watchdog detected hard LOCKUP on cpu 6
  [   32.552865] Sending NMI from CPU 5 to CPUs 6:
  [   32.557359] NMI backtrace for cpu 6
  ... [backtrace for cpu 6] ...
  [   32.558353] NMI backtrace for cpu 5
  ... [backtrace for cpu 5] ...
  [   32.867471] Sending NMI from CPU 5 to CPUs 0-4,7:
  [   32.872321] NMI backtrace forP cpuANC: Hard LOCKUP

  Entering kdb (current=..., pid 0) on processor 5 due to Keyboard Entry
  [5]kdb>

As you can see, backtraces for the other CPUs start printing and get
interleaved with the kdb PANIC print.

Let's replicate the commands to flush the console in the kdb panic
entry point to avoid this.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20230822131945.1.I5b460ae8f954e4c4f628a373d6e74713c06dd26f@changeid
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 kernel/debug/debug_core.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 621037a0aa87..ce1bb2301c06 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1006,6 +1006,9 @@ void kgdb_panic(const char *msg)
 	if (panic_timeout)
 		return;
 
+	debug_locks_off();
+	console_flush_on_panic(CONSOLE_FLUSH_PENDING);
+
 	if (dbg_kdb_mode)
 		kdb_printf("PANIC: %s\n", msg);
 
-- 
cgit v1.2.3


From 60c6946675fc06dd2fd2b7a4b6fd1c1f046f1056 Mon Sep 17 00:00:00 2001
From: Xabier Marquiegui <reibax@gmail.com>
Date: Thu, 12 Oct 2023 00:39:53 +0200
Subject: posix-clock: introduce posix_clock_context concept

Add the necessary structure to support custom private-data per
posix-clock user.

The previous implementation of posix-clock assumed all file open
instances need access to the same clock structure on private_data.

The need for individual data structures per file open instance has been
identified when developing support for multiple timestamp event queue
users for ptp_clock.

Signed-off-by: Xabier Marquiegui <reibax@gmail.com>
Suggested-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_chardev.c   | 21 +++++++++++++--------
 drivers/ptp/ptp_private.h   | 16 +++++++++-------
 include/linux/posix-clock.h | 35 +++++++++++++++++++++++++++--------
 kernel/time/posix-clock.c   | 36 +++++++++++++++++++++++++++---------
 4 files changed, 76 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 362bf756e6b7..0ba3e7064df2 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -101,14 +101,16 @@ int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
 	return 0;
 }
 
-int ptp_open(struct posix_clock *pc, fmode_t fmode)
+int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode)
 {
 	return 0;
 }
 
-long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
+long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
+	       unsigned long arg)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 	struct ptp_sys_offset_extended *extoff = NULL;
 	struct ptp_sys_offset_precise precise_offset;
 	struct system_device_crosststamp xtstamp;
@@ -432,9 +434,11 @@ out:
 	return err;
 }
 
-__poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
+__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
+		  poll_table *wait)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 
 	poll_wait(fp, &ptp->tsev_wq, wait);
 
@@ -443,10 +447,11 @@ __poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
 
 #define EXTTS_BUFSIZE (PTP_BUF_TIMESTAMPS * sizeof(struct ptp_extts_event))
 
-ssize_t ptp_read(struct posix_clock *pc,
-		 uint rdflags, char __user *buf, size_t cnt)
+ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags,
+		 char __user *buf, size_t cnt)
 {
-	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
+	struct ptp_clock *ptp =
+		container_of(pccontext->clk, struct ptp_clock, clock);
 	struct timestamp_event_queue *queue = &ptp->tsevq;
 	struct ptp_extts_event *event;
 	unsigned long flags;
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 75f58fc468a7..a3110c85f694 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -117,16 +117,18 @@ extern struct class *ptp_class;
 int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin,
 		    enum ptp_pin_function func, unsigned int chan);
 
-long ptp_ioctl(struct posix_clock *pc,
-	       unsigned int cmd, unsigned long arg);
+long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
+	       unsigned long arg);
 
-int ptp_open(struct posix_clock *pc, fmode_t fmode);
+int ptp_open(struct posix_clock_context *pccontext, fmode_t fmode);
 
-ssize_t ptp_read(struct posix_clock *pc,
-		 uint flags, char __user *buf, size_t cnt);
+int ptp_release(struct posix_clock_context *pccontext);
 
-__poll_t ptp_poll(struct posix_clock *pc,
-	      struct file *fp, poll_table *wait);
+ssize_t ptp_read(struct posix_clock_context *pccontext, uint flags, char __user *buf,
+		 size_t cnt);
+
+__poll_t ptp_poll(struct posix_clock_context *pccontext, struct file *fp,
+		  poll_table *wait);
 
 /*
  * see ptp_sysfs.c
diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index 468328b1e1dd..ef8619f48920 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -14,6 +14,7 @@
 #include <linux/rwsem.h>
 
 struct posix_clock;
+struct posix_clock_context;
 
 /**
  * struct posix_clock_operations - functional interface to the clock
@@ -50,18 +51,18 @@ struct posix_clock_operations {
 	/*
 	 * Optional character device methods:
 	 */
-	long    (*ioctl)   (struct posix_clock *pc,
-			    unsigned int cmd, unsigned long arg);
+	long (*ioctl)(struct posix_clock_context *pccontext, unsigned int cmd,
+		      unsigned long arg);
 
-	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
+	int (*open)(struct posix_clock_context *pccontext, fmode_t f_mode);
 
-	__poll_t (*poll)   (struct posix_clock *pc,
-			    struct file *file, poll_table *wait);
+	__poll_t (*poll)(struct posix_clock_context *pccontext, struct file *file,
+			 poll_table *wait);
 
-	int     (*release) (struct posix_clock *pc);
+	int (*release)(struct posix_clock_context *pccontext);
 
-	ssize_t (*read)    (struct posix_clock *pc,
-			    uint flags, char __user *buf, size_t cnt);
+	ssize_t (*read)(struct posix_clock_context *pccontext, uint flags,
+			char __user *buf, size_t cnt);
 };
 
 /**
@@ -90,6 +91,24 @@ struct posix_clock {
 	bool zombie;
 };
 
+/**
+ * struct posix_clock_context - represents clock file operations context
+ *
+ * @clk:              Pointer to the clock
+ * @private_clkdata:  Pointer to user data
+ *
+ * Drivers should use struct posix_clock_context during specific character
+ * device file operation methods to access the posix clock.
+ *
+ * Drivers can store a private data structure during the open operation
+ * if they have specific information that is required in other file
+ * operations.
+ */
+struct posix_clock_context {
+	struct posix_clock *clk;
+	void *private_clkdata;
+};
+
 /**
  * posix_clock_register() - register a new clock
  * @clk:   Pointer to the clock. Caller must provide 'ops' field
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 77c0c2370b6d..9de66bbbb3d1 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,8 @@
  */
 static struct posix_clock *get_posix_clock(struct file *fp)
 {
-	struct posix_clock *clk = fp->private_data;
+	struct posix_clock_context *pccontext = fp->private_data;
+	struct posix_clock *clk = pccontext->clk;
 
 	down_read(&clk->rwsem);
 
@@ -39,6 +40,7 @@ static void put_posix_clock(struct posix_clock *clk)
 static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 				size_t count, loff_t *ppos)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -EINVAL;
 
@@ -46,7 +48,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 		return -ENODEV;
 
 	if (clk->ops.read)
-		err = clk->ops.read(clk, fp->f_flags, buf, count);
+		err = clk->ops.read(pccontext, fp->f_flags, buf, count);
 
 	put_posix_clock(clk);
 
@@ -55,6 +57,7 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
 
 static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	__poll_t result = 0;
 
@@ -62,7 +65,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 		return EPOLLERR;
 
 	if (clk->ops.poll)
-		result = clk->ops.poll(clk, fp, wait);
+		result = clk->ops.poll(pccontext, fp, wait);
 
 	put_posix_clock(clk);
 
@@ -72,6 +75,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 static long posix_clock_ioctl(struct file *fp,
 			      unsigned int cmd, unsigned long arg)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -ENOTTY;
 
@@ -79,7 +83,7 @@ static long posix_clock_ioctl(struct file *fp,
 		return -ENODEV;
 
 	if (clk->ops.ioctl)
-		err = clk->ops.ioctl(clk, cmd, arg);
+		err = clk->ops.ioctl(pccontext, cmd, arg);
 
 	put_posix_clock(clk);
 
@@ -90,6 +94,7 @@ static long posix_clock_ioctl(struct file *fp,
 static long posix_clock_compat_ioctl(struct file *fp,
 				     unsigned int cmd, unsigned long arg)
 {
+	struct posix_clock_context *pccontext = fp->private_data;
 	struct posix_clock *clk = get_posix_clock(fp);
 	int err = -ENOTTY;
 
@@ -97,7 +102,7 @@ static long posix_clock_compat_ioctl(struct file *fp,
 		return -ENODEV;
 
 	if (clk->ops.ioctl)
-		err = clk->ops.ioctl(clk, cmd, arg);
+		err = clk->ops.ioctl(pccontext, cmd, arg);
 
 	put_posix_clock(clk);
 
@@ -110,6 +115,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 	int err;
 	struct posix_clock *clk =
 		container_of(inode->i_cdev, struct posix_clock, cdev);
+	struct posix_clock_context *pccontext;
 
 	down_read(&clk->rwsem);
 
@@ -117,14 +123,20 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 		err = -ENODEV;
 		goto out;
 	}
+	pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+	if (!pccontext) {
+		err = -ENOMEM;
+		goto out;
+	}
+	pccontext->clk = clk;
+	fp->private_data = pccontext;
 	if (clk->ops.open)
-		err = clk->ops.open(clk, fp->f_mode);
+		err = clk->ops.open(pccontext, fp->f_mode);
 	else
 		err = 0;
 
 	if (!err) {
 		get_device(clk->dev);
-		fp->private_data = clk;
 	}
 out:
 	up_read(&clk->rwsem);
@@ -133,14 +145,20 @@ out:
 
 static int posix_clock_release(struct inode *inode, struct file *fp)
 {
-	struct posix_clock *clk = fp->private_data;
+	struct posix_clock_context *pccontext = fp->private_data;
+	struct posix_clock *clk;
 	int err = 0;
 
+	if (!pccontext)
+		return -ENODEV;
+	clk = pccontext->clk;
+
 	if (clk->ops.release)
-		err = clk->ops.release(clk);
+		err = clk->ops.release(pccontext);
 
 	put_device(clk->dev);
 
+	kfree(pccontext);
 	fp->private_data = NULL;
 
 	return err;
-- 
cgit v1.2.3


From 7b3d8df549390e797f883efa16224fa0dfe35e55 Mon Sep 17 00:00:00 2001
From: Fan Yu <fan.yu9@zte.com.cn>
Date: Mon, 16 Oct 2023 19:20:39 +0800
Subject: sched/psi: Update poll => rtpoll in relevant comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PSI trigger code is now making a distinction between privileged and
unprivileged triggers, after the following commit:

 65457b74aa94 ("sched/psi: Rename existing poll members in preparation")

But some comments have not been modified along with the code, so they
need to be updated.

This will help readers better understand the code.

Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Peter Ziljstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/202310161920399921184@zte.com.cn
---
 kernel/sched/psi.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 519bc922a960..7b4aa5809c0f 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -596,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now)
 	group->rtpoll_next_update = now + group->rtpoll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled or forced. */
+/* Schedule rtpolling if it's not already scheduled or forced. */
 static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
 				   bool force)
 {
@@ -636,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group)
 
 	if (now > group->rtpoll_until) {
 		/*
-		 * We are either about to start or might stop polling if no
-		 * state change was recorded. Resetting poll_scheduled leaves
+		 * We are either about to start or might stop rtpolling if no
+		 * state change was recorded. Resetting rtpoll_scheduled leaves
 		 * a small window for psi_group_change to sneak in and schedule
-		 * an immediate poll_work before we get to rescheduling. One
-		 * potential extra wakeup at the end of the polling window
-		 * should be negligible and polling_next_update still keeps
+		 * an immediate rtpoll_work before we get to rescheduling. One
+		 * potential extra wakeup at the end of the rtpolling window
+		 * should be negligible and rtpoll_next_update still keeps
 		 * updates correctly on schedule.
 		 */
 		atomic_set(&group->rtpoll_scheduled, 0);
 		/*
-		 * A task change can race with the poll worker that is supposed to
+		 * A task change can race with the rtpoll worker that is supposed to
 		 * report on it. To avoid missing events, ensure ordering between
-		 * poll_scheduled and the task state accesses, such that if the poll
-		 * worker misses the state update, the task change is guaranteed to
-		 * reschedule the poll worker:
+		 * rtpoll_scheduled and the task state accesses, such that if the
+		 * rtpoll worker misses the state update, the task change is
+		 * guaranteed to reschedule the rtpoll worker:
 		 *
-		 * poll worker:
-		 *   atomic_set(poll_scheduled, 0)
+		 * rtpoll worker:
+		 *   atomic_set(rtpoll_scheduled, 0)
 		 *   smp_mb()
 		 *   LOAD states
 		 *
 		 * task change:
 		 *   STORE states
-		 *   if atomic_xchg(poll_scheduled, 1) == 0:
-		 *     schedule poll worker
+		 *   if atomic_xchg(rtpoll_scheduled, 1) == 0:
+		 *     schedule rtpoll worker
 		 *
 		 * The atomic_xchg() implies a full barrier.
 		 */
 		smp_mb();
 	} else {
-		/* Polling window is not over, keep rescheduling */
+		/* The rtpolling window is not over, keep rescheduling */
 		force_reschedule = true;
 	}
 
@@ -674,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group)
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
 	if (changed_states & group->rtpoll_states) {
-		/* Initialize trigger windows when entering polling mode */
+		/* Initialize trigger windows when entering rtpolling mode */
 		if (now > group->rtpoll_until)
 			init_rtpoll_triggers(group, now);
 
-- 
cgit v1.2.3


From 72f8a1de4a7ecb23393a920dface58d5a96f42d8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 11 Oct 2023 15:37:27 -0700
Subject: bpf: Disambiguate SCALAR register state output in verifier logs

Currently the way that verifier prints SCALAR_VALUE register state (and
PTR_TO_PACKET, which can have var_off and ranges info as well) is very
ambiguous.

In the name of brevity we are trying to eliminate "unnecessary" output
of umin/umax, smin/smax, u32_min/u32_max, and s32_min/s32_max values, if
possible. Current rules are that if any of those have their default
value (which for mins is the minimal value of its respective types: 0,
S32_MIN, or S64_MIN, while for maxs it's U32_MAX, S32_MAX, S64_MAX, or
U64_MAX) *OR* if there is another min/max value that as matching value.
E.g., if smin=100 and umin=100, we'll emit only umin=10, omitting smin
altogether. This approach has a few problems, being both ambiguous and
sort-of incorrect in some cases.

Ambiguity is due to missing value could be either default value or value
of umin/umax or smin/smax. This is especially confusing when we mix
signed and unsigned ranges. Quite often, umin=0 and smin=0, and so we'll
have only `umin=0` leaving anyone reading verifier log to guess whether
smin is actually 0 or it's actually -9223372036854775808 (S64_MIN). And
often times it's important to know, especially when debugging tricky
issues.

"Sort-of incorrectness" comes from mixing negative and positive values.
E.g., if umin is some large positive number, it can be equal to smin
which is, interpreted as signed value, is actually some negative value.
Currently, that smin will be omitted and only umin will be emitted with
a large positive value, giving an impression that smin is also positive.

Anyway, ambiguity is the biggest issue making it impossible to have an
exact understanding of register state, preventing any sort of automated
testing of verifier state based on verifier log. This patch is
attempting to rectify the situation by removing ambiguity, while
minimizing the verboseness of register state output.

The rules are straightforward:
  - if some of the values are missing, then it definitely has a default
  value. I.e., `umin=0` means that umin is zero, but smin is actually
  S64_MIN;
  - all the various boundaries that happen to have the same value are
  emitted in one equality separated sequence. E.g., if umin and smin are
  both 100, we'll emit `smin=umin=100`, making this explicit;
  - we do not mix negative and positive values together, and even if
  they happen to have the same bit-level value, they will be emitted
  separately with proper sign. I.e., if both umax and smax happen to be
  0xffffffffffffffff, we'll emit them both separately as
  `smax=-1,umax=18446744073709551615`;
  - in the name of a bit more uniformity and consistency,
  {u32,s32}_{min,max} are renamed to {s,u}{min,max}32, which seems to
  improve readability.

The above means that in case of all 4 ranges being, say, [50, 100] range,
we'd previously see hugely ambiguous:

    R1=scalar(umin=50,umax=100)

Now, we'll be more explicit:

    R1=scalar(smin=umin=smin32=umin32=50,smax=umax=smax32=umax32=100)

This is slightly more verbose, but distinct from the case when we don't
know anything about signed boundaries and 32-bit boundaries, which under
new rules will match the old case:

    R1=scalar(umin=50,umax=100)

Also, in the name of simplicity of implementation and consistency, order
for {s,u}32_{min,max} are emitted *before* var_off. Previously they were
emitted afterwards, for unclear reasons.

This patch also includes a few fixes to selftests that expect exact
register state to accommodate slight changes to verifier format. You can
see that the changes are pretty minimal in common cases.

Note, the special case when SCALAR_VALUE register is a known constant
isn't changed, we'll emit constant value once, interpreted as signed
value.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20231011223728.3188086-5-andrii@kernel.org
---
 kernel/bpf/verifier.c                              | 67 +++++++++++++++-------
 .../selftests/bpf/progs/exceptions_assert.c        | 18 +++---
 tools/testing/selftests/bpf/progs/verifier_ldsx.c  |  2 +-
 3 files changed, 55 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e777f50401b6..82ae14f32a01 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1342,6 +1342,50 @@ static void scrub_spilled_slot(u8 *stype)
 		*stype = STACK_MISC;
 }
 
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+				const struct bpf_reg_state *reg,
+				const char **sep)
+{
+	struct {
+		const char *name;
+		u64 val;
+		bool omit;
+	} minmaxs[] = {
+		{"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
+		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
+		{"umin",   reg->umin_value,         reg->umin_value == 0},
+		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
+		{"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+		{"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
+		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
+	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+	bool neg1, neg2;
+
+	for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+		if (m1->omit)
+			continue;
+
+		neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+		verbose(env, "%s%s=", *sep, m1->name);
+		*sep = ",";
+
+		for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+			if (m2->omit || m2->val != m1->val)
+				continue;
+			/* don't mix negatives with positives */
+			neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+			if (neg2 != neg1)
+				continue;
+			m2->omit = true;
+			verbose(env, "%s=", m2->name);
+		}
+
+		verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
+	}
+}
+
 static void print_verifier_state(struct bpf_verifier_env *env,
 				 const struct bpf_func_state *state,
 				 bool print_all)
@@ -1405,34 +1449,13 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 				 */
 				verbose_a("imm=%llx", reg->var_off.value);
 			} else {
-				if (reg->smin_value != reg->umin_value &&
-				    reg->smin_value != S64_MIN)
-					verbose_a("smin=%lld", (long long)reg->smin_value);
-				if (reg->smax_value != reg->umax_value &&
-				    reg->smax_value != S64_MAX)
-					verbose_a("smax=%lld", (long long)reg->smax_value);
-				if (reg->umin_value != 0)
-					verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
-				if (reg->umax_value != U64_MAX)
-					verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
+				print_scalar_ranges(env, reg, &sep);
 				if (!tnum_is_unknown(reg->var_off)) {
 					char tn_buf[48];
 
 					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 					verbose_a("var_off=%s", tn_buf);
 				}
-				if (reg->s32_min_value != reg->smin_value &&
-				    reg->s32_min_value != S32_MIN)
-					verbose_a("s32_min=%d", (int)(reg->s32_min_value));
-				if (reg->s32_max_value != reg->smax_value &&
-				    reg->s32_max_value != S32_MAX)
-					verbose_a("s32_max=%d", (int)(reg->s32_max_value));
-				if (reg->u32_min_value != reg->umin_value &&
-				    reg->u32_min_value != U32_MIN)
-					verbose_a("u32_min=%d", (int)(reg->u32_min_value));
-				if (reg->u32_max_value != reg->umax_value &&
-				    reg->u32_max_value != U32_MAX)
-					verbose_a("u32_max=%d", (int)(reg->u32_max_value));
 			}
 #undef verbose_a
 
diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c
index fa35832e6748..e1e5c54a6a11 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_assert.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c
@@ -31,35 +31,35 @@ check_assert(s64, eq, llong_max, LLONG_MAX);
 
 __msg(": R0_w=scalar(smax=2147483646) R10=fp0")
 check_assert(s64, lt, pos, INT_MAX);
-__msg(": R0_w=scalar(umin=9223372036854775808,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smax=-1,umin=9223372036854775808,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
 check_assert(s64, lt, zero, 0);
-__msg(": R0_w=scalar(umin=9223372036854775808,umax=18446744071562067967,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smax=-2147483649,umin=9223372036854775808,umax=18446744071562067967,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
 check_assert(s64, lt, neg, INT_MIN);
 
 __msg(": R0_w=scalar(smax=2147483647) R10=fp0")
 check_assert(s64, le, pos, INT_MAX);
 __msg(": R0_w=scalar(smax=0) R10=fp0")
 check_assert(s64, le, zero, 0);
-__msg(": R0_w=scalar(umin=9223372036854775808,umax=18446744071562067968,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smax=-2147483648,umin=9223372036854775808,umax=18446744071562067968,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
 check_assert(s64, le, neg, INT_MIN);
 
-__msg(": R0_w=scalar(umin=2147483648,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smin=umin=2147483648,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
 check_assert(s64, gt, pos, INT_MAX);
-__msg(": R0_w=scalar(umin=1,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smin=umin=1,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
 check_assert(s64, gt, zero, 0);
 __msg(": R0_w=scalar(smin=-2147483647) R10=fp0")
 check_assert(s64, gt, neg, INT_MIN);
 
-__msg(": R0_w=scalar(umin=2147483647,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smin=umin=2147483647,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
 check_assert(s64, ge, pos, INT_MAX);
-__msg(": R0_w=scalar(umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0")
+__msg(": R0_w=scalar(smin=0,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0")
 check_assert(s64, ge, zero, 0);
 __msg(": R0_w=scalar(smin=-2147483648) R10=fp0")
 check_assert(s64, ge, neg, INT_MIN);
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R0=0 R1=ctx(off=0,imm=0) R2=scalar(smin=-2147483646,smax=2147483645) R10=fp0")
+__msg(": R0=0 R1=ctx(off=0,imm=0) R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0")
 int check_assert_range_s64(struct __sk_buff *ctx)
 {
 	struct bpf_sock *sk = ctx->sk;
@@ -75,7 +75,7 @@ int check_assert_range_s64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R1=ctx(off=0,imm=0) R2=scalar(umin=4096,umax=8192,var_off=(0x0; 0x3fff))")
+__msg(": R1=ctx(off=0,imm=0) R2=scalar(smin=umin=smin32=umin32=4096,smax=umax=smax32=umax32=8192,var_off=(0x0; 0x3fff))")
 int check_assert_range_u64(struct __sk_buff *ctx)
 {
 	u64 num = ctx->len;
diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c
index f90016a57eec..375525329637 100644
--- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c
+++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c
@@ -64,7 +64,7 @@ __naked void ldsx_s32(void)
 SEC("socket")
 __description("LDSX, S8 range checking, privileged")
 __log_level(2) __success __retval(1)
-__msg("R1_w=scalar(smin=-128,smax=127)")
+__msg("R1_w=scalar(smin=smin32=-128,smax=smax32=127)")
 __naked void ldsx_s8_range_priv(void)
 {
 	asm volatile (
-- 
cgit v1.2.3


From 1a8a315f008a58f54fecb012b928aa6a494435b3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 11 Oct 2023 15:37:28 -0700
Subject: bpf: Ensure proper register state printing for cond jumps

Verifier emits relevant register state involved in any given instruction
next to it after `;` to the right, if possible. Or, worst case, on the
separate line repeating instruction index.

E.g., a nice and simple case would be:

  2: (d5) if r0 s<= 0x0 goto pc+1       ; R0_w=0

But if there is some intervening extra output (e.g., precision
backtracking log) involved, we are supposed to see the state after the
precision backtrack log:

  4: (75) if r0 s>= 0x0 goto pc+1
  mark_precise: frame0: last_idx 4 first_idx 0 subseq_idx -1
  mark_precise: frame0: regs=r0 stack= before 2: (d5) if r0 s<= 0x0 goto pc+1
  mark_precise: frame0: regs=r0 stack= before 1: (b7) r0 = 0
  6: R0_w=0

First off, note that in `6: R0_w=0` instruction index corresponds to the
next instruction, not to the conditional jump instruction itself, which
is wrong and we'll get to that.

But besides that, the above is a happy case that does work today. Yet,
if it so happens that precision backtracking had to traverse some of the
parent states, this `6: R0_w=0` state output would be missing.

This is due to a quirk of print_verifier_state() routine, which performs
mark_verifier_state_clean(env) at the end. This marks all registers as
"non-scratched", which means that subsequent logic to print *relevant*
registers (that is, "scratched ones") fails and doesn't see anything
relevant to print and skips the output altogether.

print_verifier_state() is used both to print instruction context, but
also to print an **entire** verifier state indiscriminately, e.g.,
during precision backtracking (and in a few other situations, like
during entering or exiting subprogram).  Which means if we have to print
entire parent state before getting to printing instruction context
state, instruction context is marked as clean and is omitted.

Long story short, this is definitely not intentional. So we fix this
behavior in this patch by teaching print_verifier_state() to clear
scratch state only if it was used to print instruction state, not the
parent/callback state. This is determined by print_all option, so if
it's not set, we don't clear scratch state. This fixes missing
instruction state for these cases.

As for the mismatched instruction index, we fix that by making sure we
call print_insn_state() early inside check_cond_jmp_op() before we
adjusted insn_idx based on jump branch taken logic. And with that we get
desired correct information:

  9: (16) if w4 == 0x1 goto pc+9
  mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1
  mark_precise: frame0: parent state regs=r4 stack=: R2_w=1944 R4_rw=P1 R10=fp0
  mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9
  mark_precise: frame0: regs=r4 stack= before 8: (66) if w4 s> 0x3 goto pc+5
  mark_precise: frame0: regs=r4 stack= before 7: (b7) r4 = 1
  9: R4=1

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20231011223728.3188086-6-andrii@kernel.org
---
 kernel/bpf/verifier.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 82ae14f32a01..6d543a129406 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1539,7 +1539,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	if (state->in_async_callback_fn)
 		verbose(env, " async_cb");
 	verbose(env, "\n");
-	mark_verifier_state_clean(env);
+	if (!print_all)
+		mark_verifier_state_clean(env);
 }
 
 static inline u32 vlog_alignment(u32 pos)
@@ -14408,6 +14409,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		    !sanitize_speculative_path(env, insn, *insn_idx + 1,
 					       *insn_idx))
 			return -EFAULT;
+		if (env->log.level & BPF_LOG_LEVEL)
+			print_insn_state(env, this_branch->frame[this_branch->curframe]);
 		*insn_idx += insn->off;
 		return 0;
 	} else if (pred == 0) {
@@ -14420,6 +14423,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 					       *insn_idx + insn->off + 1,
 					       *insn_idx))
 			return -EFAULT;
+		if (env->log.level & BPF_LOG_LEVEL)
+			print_insn_state(env, this_branch->frame[this_branch->curframe]);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Thu, 21 Sep 2023 15:44:17 +0800
Subject: sched/numa, mm: make numa migrate functions to take a folio

The cpupid (or access time) is stored in the head page for THP, so it is
safely to make should_numa_migrate_memory() and numa_hint_fault_latency()
to take a folio.  This is in preparation for large folio numa balancing.

Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/numa_balancing.h |  6 +++---
 kernel/sched/fair.c                  | 12 ++++++------
 mm/mempolicy.c                       |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 3988762efe15..06a9d35650f0 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p, bool final);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
-					int src_nid, int dst_cpu);
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
+				int src_nid, int dst_cpu);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
-				struct page *page, int src_nid, int dst_cpu)
+				struct folio *folio, int src_nid, int dst_cpu)
 {
 	return true;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cb225921bbca..42aefe7e6fdc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1722,12 +1722,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
  * The smaller the hint page fault latency, the higher the possibility
  * for the page to be hot.
  */
-static int numa_hint_fault_latency(struct page *page)
+static int numa_hint_fault_latency(struct folio *folio)
 {
 	int last_time, time;
 
 	time = jiffies_to_msecs(jiffies);
-	last_time = xchg_page_access_time(page, time);
+	last_time = xchg_page_access_time(&folio->page, time);
 
 	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
@@ -1784,7 +1784,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
 	}
 }
 
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 				int src_nid, int dst_cpu)
 {
 	struct numa_group *ng = deref_curr_numa_group(p);
@@ -1814,16 +1814,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
 
 		th = pgdat->nbp_threshold ? : def_th;
-		latency = numa_hint_fault_latency(page);
+		latency = numa_hint_fault_latency(folio);
 		if (latency >= th)
 			return false;
 
 		return !numa_promotion_rate_limit(pgdat, rate_limit,
-						  thp_nr_pages(page));
+						  folio_nr_pages(folio));
 	}
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
-	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+	last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid);
 
 	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
 	    !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 69c0eac7292c..abd94f4c7f6b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2642,7 +2642,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
 	if (pol->flags & MPOL_F_MORON) {
 		polnid = thisnid;
 
-		if (!should_numa_migrate_memory(current, &folio->page, curnid,
+		if (!should_numa_migrate_memory(current, folio, curnid,
 						thiscpu))
 			goto out;
 	}
-- 
cgit v1.2.3


From 94b3f0b5af2c7af69e3d6e0cdd9b0ea535f22186 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Mon, 21 Aug 2023 16:04:09 -0400
Subject: smp,csd: Throw an error if a CSD lock is stuck for too long

The CSD lock seems to get stuck in 2 "modes". When it gets stuck
temporarily, it usually gets released in a few seconds, and sometimes
up to one or two minutes.

If the CSD lock stays stuck for more than several minutes, it never
seems to get unstuck, and gradually more and more things in the system
end up also getting stuck.

In the latter case, we should just give up, so the system can dump out
a little more information about what went wrong, and, with panic_on_oops
and a kdump kernel loaded, dump a whole bunch more information about what
might have gone wrong.  In addition, there is an smp.panic_on_ipistall
kernel boot parameter that by default retains the old behavior, but when
set enables the panic after the CSD lock has been stuck for more than
the specified number of milliseconds, as in 300,000 for five minutes.

[ paulmck: Apply Imran Khan feedback. ]
[ paulmck: Apply Leonardo Bras feedback. ]

Link: https://lore.kernel.org/lkml/bc7cc8b0-f587-4451-8bcd-0daae627bcc7@paulmck-laptop/
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Imran Khan <imran.f.khan@oracle.com>
Reviewed-by: Leonardo Bras <leobras@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Randy Dunlap <rdunlap@infradead.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 +++++++
 kernel/smp.c                                    | 13 ++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0a1731a0f0ef..41644336e358 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5858,6 +5858,13 @@
 			This feature may be more efficiently disabled
 			using the csdlock_debug- kernel parameter.
 
+	smp.panic_on_ipistall= [KNL]
+			If a csd_lock_timeout extends for more than
+			the specified number of milliseconds, panic the
+			system.  By default, let CSD-lock acquisition
+			take as long as they take.  Specifying 300,000
+			for this value provides a 5-minute timeout.
+
 	smsc-ircc2.nopnp	[HW] Don't use PNP to discover SMC devices
 	smsc-ircc2.ircc_cfg=	[HW] Device configuration I/O port
 	smsc-ircc2.ircc_sir=	[HW] SIR base I/O port
diff --git a/kernel/smp.c b/kernel/smp.c
index 8455a53465af..695eb13a276d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -170,6 +170,8 @@ static DEFINE_PER_CPU(void *, cur_csd_info);
 
 static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
 module_param(csd_lock_timeout, ulong, 0444);
+static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
+module_param(panic_on_ipistall, int, 0444);
 
 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 
@@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 	}
 
 	ts2 = sched_clock();
+	/* How long since we last checked for a stuck CSD lock.*/
 	ts_delta = ts2 - *ts1;
 	if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
 		return false;
@@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 	else
 		cpux = cpu;
 	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
+	/* How long since this CSD lock was stuck. */
+	ts_delta = ts2 - ts0;
 	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
-		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
+		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
 		 cpu, csd->func, csd->info);
+	/*
+	 * If the CSD lock is still stuck after 5 minutes, it is unlikely
+	 * to become unstuck. Use a signed comparison to avoid triggering
+	 * on underflows when the TSC is out of sync between sockets.
+	 */
+	BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
 	if (cpu_cur_csd && csd != cpu_cur_csd) {
 		pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
 			 *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
-- 
cgit v1.2.3


From 700b2b439766e8aab8a7174991198497345bd411 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 17 Oct 2023 08:49:45 +0900
Subject: fprobe: Fix to ensure the number of active retprobes is not zero

The number of active retprobes can be zero but it is not acceptable,
so return EINVAL error if detected.

Link: https://lore.kernel.org/all/169750018550.186853.11198884812017796410.stgit@devnote2/

Reported-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Closes: https://lore.kernel.org/all/20231016222103.cb9f426edc60220eabd8aa6a@kernel.org/
Fixes: 5b0ab78998e3 ("fprobe: Add exit_handler support")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/fprobe.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 3b21f4063258..881f90f0cbcf 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -189,7 +189,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 {
 	int i, size;
 
-	if (num < 0)
+	if (num <= 0)
 		return -EINVAL;
 
 	if (!fp->exit_handler) {
@@ -202,8 +202,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 		size = fp->nr_maxactive;
 	else
 		size = num * num_possible_cpus() * 2;
-	if (size < 0)
-		return -E2BIG;
+	if (size <= 0)
+		return -EINVAL;
 
 	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
 	if (!fp->rethook)
-- 
cgit v1.2.3


From 545a4f89cad5bd349522d17558b3a4208648e20e Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 12 Oct 2023 09:42:56 +0300
Subject: printk: Check valid console index for preferred console

Let's check for valid console index values for preferred console to avoid
bogus console index numbers from kernel command line.

Let's also return an error for negative index numbers for the preferred
console. Unlike for device drivers, a negative index is not valid for the
preferred console.

Let's also constify idx while at it.

Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231012064300.50221-1-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h |  2 +-
 kernel/printk/printk.c  | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 5ff6f11c47b1..1be13c9695e0 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -347,7 +347,7 @@ enum con_flush_mode {
 	CONSOLE_REPLAY_ALL,
 };
 
-extern int add_preferred_console(char *name, int idx, char *options);
+extern int add_preferred_console(char *name, const short idx, char *options);
 extern void console_force_preferred_locked(struct console *con);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 0b3af1529778..b16c0bab88c6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2404,12 +2404,20 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
 	console_set_on_cmdline = 1;
 }
 
-static int __add_preferred_console(char *name, int idx, char *options,
+static int __add_preferred_console(char *name, const short idx, char *options,
 				   char *brl_options, bool user_specified)
 {
 	struct console_cmdline *c;
 	int i;
 
+	/*
+	 * We use a signed short index for struct console for device drivers to
+	 * indicate a not yet assigned index or port. However, a negative index
+	 * value is not valid for preferred console.
+	 */
+	if (idx < 0)
+		return -EINVAL;
+
 	/*
 	 *	See if this tty is not yet registered, and
 	 *	if we have a slot free.
@@ -2513,7 +2521,7 @@ __setup("console=", console_setup);
  * commonly to provide a default console (ie from PROM variables) when
  * the user has not supplied one.
  */
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(char *name, const short idx, char *options)
 {
 	return __add_preferred_console(name, idx, options, NULL, false);
 }
-- 
cgit v1.2.3


From 1e3c8526918403a4cebbd67bcd18443bf68df939 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 12 Oct 2023 09:42:57 +0300
Subject: printk: Constify name for add_preferred_console()

While adding a preferred console handling for serial_core for serial port
hardware based device addressing, Jiri suggested we constify name for
add_preferred_console(). The name gets copied anyways. This allows serial
core to add a preferred console using serial drv->dev_name without copying
it.

Note that constifying options causes changes all over the place because of
struct console for match().

Suggested-by: Jiri Slaby <jirislaby@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Link: https://lore.kernel.org/r/20231012064300.50221-2-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/console.h | 2 +-
 kernel/printk/printk.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/console.h b/include/linux/console.h
index 1be13c9695e0..eddf4c5279b5 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -347,7 +347,7 @@ enum con_flush_mode {
 	CONSOLE_REPLAY_ALL,
 };
 
-extern int add_preferred_console(char *name, const short idx, char *options);
+extern int add_preferred_console(const char *name, const short idx, char *options);
 extern void console_force_preferred_locked(struct console *con);
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b16c0bab88c6..122cb6e83f42 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2404,7 +2404,7 @@ static void set_user_specified(struct console_cmdline *c, bool user_specified)
 	console_set_on_cmdline = 1;
 }
 
-static int __add_preferred_console(char *name, const short idx, char *options,
+static int __add_preferred_console(const char *name, const short idx, char *options,
 				   char *brl_options, bool user_specified)
 {
 	struct console_cmdline *c;
@@ -2521,7 +2521,7 @@ __setup("console=", console_setup);
  * commonly to provide a default console (ie from PROM variables) when
  * the user has not supplied one.
  */
-int add_preferred_console(char *name, const short idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
 {
 	return __add_preferred_console(name, idx, options, NULL, false);
 }
-- 
cgit v1.2.3


From 9a675ba55a96a45a9fb69e6a5c43f80c6682e541 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 16 Oct 2023 14:57:38 +0200
Subject: net, bpf: Add a warning if NAPI cb missed xdp_do_flush().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few drivers were missing a xdp_do_flush() invocation after
XDP_REDIRECT.

Add three helper functions each for one of the per-CPU lists. Return
true if the per-CPU list is non-empty and flush the list.

Add xdp_do_check_flushed() which invokes each helper functions and
creates a warning if one of the functions had a non-empty list.

Hide everything behind CONFIG_DEBUG_NET.

Suggested-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20231016125738.Yt79p1uF@linutronix.de
---
 include/linux/bpf.h    |  3 +++
 include/net/xdp_sock.h |  9 +++++++++
 kernel/bpf/cpumap.c    | 10 ++++++++++
 kernel/bpf/devmap.c    | 10 ++++++++++
 net/core/dev.c         |  2 ++
 net/core/dev.h         |  6 ++++++
 net/core/filter.c      | 16 ++++++++++++++++
 net/xdp/xsk.c          | 10 ++++++++++
 8 files changed, 66 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c51a507508..b4b40b45962b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2478,6 +2478,9 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
+
+bool dev_check_flush(void);
+bool cpu_map_check_flush(void);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69b472604b86..7dd0df2f6f8e 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -109,4 +109,13 @@ static inline void __xsk_map_flush(void)
 
 #endif /* CONFIG_XDP_SOCKETS */
 
+#if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET)
+bool xsk_map_check_flush(void);
+#else
+static inline bool xsk_map_check_flush(void)
+{
+	return false;
+}
+#endif
+
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e42a1bdb7f53..8a0bb80fe48a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -764,6 +764,16 @@ void __cpu_map_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool cpu_map_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
+		return false;
+	__cpu_map_flush();
+	return true;
+}
+#endif
+
 static int __init cpu_map_init(void)
 {
 	int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4d42f6ed6c11..a936c704d4e7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -418,6 +418,16 @@ void __dev_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool dev_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&dev_flush_list)))
+		return false;
+	__dev_flush();
+	return true;
+}
+#endif
+
 /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
  * by local_bh_disable() (from XDP calls inside NAPI). The
  * rcu_read_lock_bh_held() below makes lockdep accept both.
diff --git a/net/core/dev.c b/net/core/dev.c
index 97e7b9833db9..4420831180c6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6535,6 +6535,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
 		work = n->poll(n, weight);
 		trace_napi_poll(n, work, weight);
+
+		xdp_do_check_flushed(n);
 	}
 
 	if (unlikely(work > weight))
diff --git a/net/core/dev.h b/net/core/dev.h
index e075e198092c..f66125857af7 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -136,4 +136,10 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
 }
 
 int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
 #endif
diff --git a/net/core/filter.c b/net/core/filter.c
index cc2e4babc85f..21d75108c2e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,8 @@
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <linux/un.h>
 
+#include "dev.h"
+
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
 
@@ -4208,6 +4210,20 @@ void xdp_do_flush(void)
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush);
 
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
+{
+	bool ret;
+
+	ret = dev_check_flush();
+	ret |= cpu_map_check_flush();
+	ret |= xsk_map_check_flush();
+
+	WARN_ONCE(ret, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+		  napi->poll);
+}
+#endif
+
 void bpf_clear_redirect_map(struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f5e96e0d6e01..ba070fd37d24 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -391,6 +391,16 @@ void __xsk_map_flush(void)
 	}
 }
 
+#ifdef CONFIG_DEBUG_NET
+bool xsk_map_check_flush(void)
+{
+	if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
+		return false;
+	__xsk_map_flush();
+	return true;
+}
+#endif
+
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
 {
 	xskq_prod_submit_n(pool->cq, nb_entries);
-- 
cgit v1.2.3


From 29a7e00ffadddd8d68eff311de1bf12ae10687bb Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sat, 7 Oct 2023 13:59:44 +0000
Subject: bpf: Fix missed rcu read lock in bpf_task_under_cgroup()

When employed within a sleepable program not under RCU protection, the
use of 'bpf_task_under_cgroup()' may trigger a warning in the kernel log,
particularly when CONFIG_PROVE_RCU is enabled:

  [ 1259.662357] WARNING: suspicious RCU usage
  [ 1259.662358] 6.5.0+ #33 Not tainted
  [ 1259.662360] -----------------------------
  [ 1259.662361] include/linux/cgroup.h:423 suspicious rcu_dereference_check() usage!

Other info that might help to debug this:

  [ 1259.662366] rcu_scheduler_active = 2, debug_locks = 1
  [ 1259.662368] 1 lock held by trace/72954:
  [ 1259.662369]  #0: ffffffffb5e3eda0 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x0/0xb0

Stack backtrace:

  [ 1259.662385] CPU: 50 PID: 72954 Comm: trace Kdump: loaded Not tainted 6.5.0+ #33
  [ 1259.662391] Call Trace:
  [ 1259.662393]  <TASK>
  [ 1259.662395]  dump_stack_lvl+0x6e/0x90
  [ 1259.662401]  dump_stack+0x10/0x20
  [ 1259.662404]  lockdep_rcu_suspicious+0x163/0x1b0
  [ 1259.662412]  task_css_set.part.0+0x23/0x30
  [ 1259.662417]  bpf_task_under_cgroup+0xe7/0xf0
  [ 1259.662422]  bpf_prog_7fffba481a3bcf88_lsm_run+0x5c/0x93
  [ 1259.662431]  bpf_trampoline_6442505574+0x60/0x1000
  [ 1259.662439]  bpf_lsm_bpf+0x5/0x20
  [ 1259.662443]  ? security_bpf+0x32/0x50
  [ 1259.662452]  __sys_bpf+0xe6/0xdd0
  [ 1259.662463]  __x64_sys_bpf+0x1a/0x30
  [ 1259.662467]  do_syscall_64+0x38/0x90
  [ 1259.662472]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
  [ 1259.662479] RIP: 0033:0x7f487baf8e29
  [...]
  [ 1259.662504]  </TASK>

This issue can be reproduced by executing a straightforward program, as
demonstrated below:

SEC("lsm.s/bpf")
int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size)
{
        struct cgroup *cgrp = NULL;
        struct task_struct *task;
        int ret = 0;

        if (cmd != BPF_LINK_CREATE)
                return 0;

        // The cgroup2 should be mounted first
        cgrp = bpf_cgroup_from_id(1);
        if (!cgrp)
                goto out;
        task = bpf_get_current_task_btf();
        if (bpf_task_under_cgroup(task, cgrp))
                ret = -1;
        bpf_cgroup_release(cgrp);

out:
        return ret;
}

After running the program, if you subsequently execute another BPF program,
you will encounter the warning.

It's worth noting that task_under_cgroup_hierarchy() is also utilized by
bpf_current_task_under_cgroup(). However, bpf_current_task_under_cgroup()
doesn't exhibit this issue because it cannot be used in sleepable BPF
programs.

Fixes: b5ad4cdc46c7 ("bpf: Add bpf_task_under_cgroup() kfunc")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@google.com>
Cc: Feng Zhou <zhoufeng.zf@bytedance.com>
Cc: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/bpf/20231007135945.4306-1-laoar.shao@gmail.com
---
 kernel/bpf/helpers.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62a53ebfedf9..61f51dee8448 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2215,7 +2215,12 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
 __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
 				       struct cgroup *ancestor)
 {
-	return task_under_cgroup_hierarchy(task, ancestor);
+	long ret;
+
+	rcu_read_lock();
+	ret = task_under_cgroup_hierarchy(task, ancestor);
+	rcu_read_unlock();
+	return ret;
 }
 #endif /* CONFIG_CGROUPS */
 
-- 
cgit v1.2.3


From 38685e2a0476127db766f81b1c06019ddc4c9ffa Mon Sep 17 00:00:00 2001
From: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Date: Tue, 17 Oct 2023 17:09:53 +0800
Subject: cpu/hotplug: Don't offline the last non-isolated CPU

If a system has isolated CPUs via the "isolcpus=" command line parameter,
then an attempt to offline the last housekeeping CPU will result in a
WARN_ON() when rebuilding the scheduler domains and a subsequent panic due
to and unhandled empty CPU mas in partition_sched_domains_locked().

cpuset_hotplug_workfn()
  rebuild_sched_domains_locked()
    ndoms = generate_sched_domains(&doms, &attr);
      cpumask_and(doms[0], top_cpuset.effective_cpus, housekeeping_cpumask(HK_FLAG_DOMAIN));

Thus results in an empty CPU mask which triggers the warning and then the
subsequent crash:

WARNING: CPU: 4 PID: 80 at kernel/sched/topology.c:2366 build_sched_domains+0x120c/0x1408
Call trace:
 build_sched_domains+0x120c/0x1408
 partition_sched_domains_locked+0x234/0x880
 rebuild_sched_domains_locked+0x37c/0x798
 rebuild_sched_domains+0x30/0x58
 cpuset_hotplug_workfn+0x2a8/0x930

Unable to handle kernel paging request at virtual address fffe80027ab37080
 partition_sched_domains_locked+0x318/0x880
 rebuild_sched_domains_locked+0x37c/0x798

Aside of the resulting crash, it does not make any sense to offline the last
last housekeeping CPU.

Prevent this by masking out the non-housekeeping CPUs when selecting a
target CPU for initiating the CPU unplug operation via the work queue.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/202310171709530660462@zte.com.cn
---
 kernel/cpu.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..94430ea8b4a5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1515,11 +1515,14 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 	/*
 	 * Ensure that the control task does not run on the to be offlined
 	 * CPU to prevent a deadlock against cfs_b->period_timer.
+	 * Also keep at least one housekeeping cpu onlined to avoid generating
+	 * an empty sched_domain span.
 	 */
-	cpu = cpumask_any_but(cpu_online_mask, cpu);
-	if (cpu >= nr_cpu_ids)
-		return -EBUSY;
-	return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+	for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
+		if (cpu != work.cpu)
+			return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+	}
+	return -EBUSY;
 }
 
 static int cpu_down(unsigned int cpu, enum cpuhp_state target)
-- 
cgit v1.2.3


From d2929762cc3f85528b0ca12f6f63c2a714f24778 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 17 Oct 2023 16:59:47 +0200
Subject: sched/eevdf: Fix heap corruption more

Because someone is a flaming idiot... and forgot we have current as
se->on_rq but not actually in the tree itself, and walking rb_parent()
on an entry not in the tree is 'funky' and KASAN complains.

Fixes: 8dafa9d0eb1a ("sched/eevdf: Fix min_deadline heap integrity")
Reported-by: 0599jiangyc@gmail.com
Reported-by: Dmitry Safonov <0x7f454c46@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Dmitry Safonov <0x7f454c46@gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218020
Link: https://lkml.kernel.org/r/CAJwJo6ZGXO07%3DQvW4fgQfbsDzQPs9xj5sAQ1zp%3DmAyPMNbHYww%40mail.gmail.com
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 061a30a8925a..df348aa55d3c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3657,7 +3657,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		 */
 		deadline = div_s64(deadline * old_weight, weight);
 		se->deadline = se->vruntime + deadline;
-		min_deadline_cb_propagate(&se->run_node, NULL);
+		if (se != cfs_rq->curr)
+			min_deadline_cb_propagate(&se->run_node, NULL);
 	}
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 1b7ef2d94ff4cb0b1186a224a97349864820c606 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Wed, 18 Oct 2023 14:27:59 +0800
Subject: sched/fair: Remove duplicate #include

./kernel/sched/fair.c: linux/sched/cond_resched.h is included more than once.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231018062759.44375-1-jiapeng.chong@linux.alibaba.com

Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6907
---
 kernel/sched/fair.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 38d757c35004..9ae2208089e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,8 +51,6 @@
 
 #include <asm/switch_to.h>
 
-#include <linux/sched/cond_resched.h>
-
 #include "sched.h"
 #include "stats.h"
 #include "autogroup.h"
-- 
cgit v1.2.3


From 265f3ed077036f053981f5eea0b5b43e7c5b39ff Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Sun, 24 Sep 2023 17:07:02 +0200
Subject: workqueue: Provide one lock class key per work_on_cpu() callsite

All callers of work_on_cpu() share the same lock class key for all the
functions queued. As a result the workqueue related locking scenario for
a function A may be spuriously accounted as an inversion against the
locking scenario of function B such as in the following model:

	long A(void *arg)
	{
		mutex_lock(&mutex);
		mutex_unlock(&mutex);
	}

	long B(void *arg)
	{
	}

	void launchA(void)
	{
		work_on_cpu(0, A, NULL);
	}

	void launchB(void)
	{
		mutex_lock(&mutex);
		work_on_cpu(1, B, NULL);
		mutex_unlock(&mutex);
	}

launchA and launchB running concurrently have no chance to deadlock.
However the above can be reported by lockdep as a possible locking
inversion because the works containing A() and B() are treated as
belonging to the same locking class.

The following shows an existing example of such a spurious lockdep splat:

	 ======================================================
	 WARNING: possible circular locking dependency detected
	 6.6.0-rc1-00065-g934ebd6e5359 #35409 Not tainted
	 ------------------------------------------------------
	 kworker/0:1/9 is trying to acquire lock:
	 ffffffff9bc72f30 (cpu_hotplug_lock){++++}-{0:0}, at: _cpu_down+0x57/0x2b0

	 but task is already holding lock:
	 ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500

	 which lock already depends on the new lock.

	 the existing dependency chain (in reverse order) is:

	 -> #2 ((work_completion)(&wfc.work)){+.+.}-{0:0}:
			__flush_work+0x83/0x4e0
			work_on_cpu+0x97/0xc0
			rcu_nocb_cpu_offload+0x62/0xb0
			rcu_nocb_toggle+0xd0/0x1d0
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 -> #1 (rcu_state.barrier_mutex){+.+.}-{3:3}:
			__mutex_lock+0x81/0xc80
			rcu_nocb_cpu_deoffload+0x38/0xb0
			rcu_nocb_toggle+0x144/0x1d0
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 -> #0 (cpu_hotplug_lock){++++}-{0:0}:
			__lock_acquire+0x1538/0x2500
			lock_acquire+0xbf/0x2a0
			percpu_down_write+0x31/0x200
			_cpu_down+0x57/0x2b0
			__cpu_down_maps_locked+0x10/0x20
			work_for_cpu_fn+0x15/0x20
			process_scheduled_works+0x2a7/0x500
			worker_thread+0x173/0x330
			kthread+0xe6/0x120
			ret_from_fork+0x2f/0x40
			ret_from_fork_asm+0x1b/0x30

	 other info that might help us debug this:

	 Chain exists of:
	   cpu_hotplug_lock --> rcu_state.barrier_mutex --> (work_completion)(&wfc.work)

	  Possible unsafe locking scenario:

			CPU0                    CPU1
			----                    ----
	   lock((work_completion)(&wfc.work));
									lock(rcu_state.barrier_mutex);
									lock((work_completion)(&wfc.work));
	   lock(cpu_hotplug_lock);

	  *** DEADLOCK ***

	 2 locks held by kworker/0:1/9:
	  #0: ffff900481068b38 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x212/0x500
	  #1: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500

	 stack backtrace:
	 CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1-00065-g934ebd6e5359 #35409
	 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
	 Workqueue: events work_for_cpu_fn
	 Call Trace:
	 rcu-torture: rcu_torture_read_exit: Start of episode
	  <TASK>
	  dump_stack_lvl+0x4a/0x80
	  check_noncircular+0x132/0x150
	  __lock_acquire+0x1538/0x2500
	  lock_acquire+0xbf/0x2a0
	  ? _cpu_down+0x57/0x2b0
	  percpu_down_write+0x31/0x200
	  ? _cpu_down+0x57/0x2b0
	  _cpu_down+0x57/0x2b0
	  __cpu_down_maps_locked+0x10/0x20
	  work_for_cpu_fn+0x15/0x20
	  process_scheduled_works+0x2a7/0x500
	  worker_thread+0x173/0x330
	  ? __pfx_worker_thread+0x10/0x10
	  kthread+0xe6/0x120
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork+0x2f/0x40
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork_asm+0x1b/0x30
	  </TASK

Fix this with providing one lock class key per work_on_cpu() caller.

Reported-and-tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 46 +++++++++++++++++++++++++++++++++++++++-------
 kernel/workqueue.c        | 20 ++++++++++++--------
 2 files changed, 51 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1c1d06804d45..24b1e5070f4d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -274,18 +274,16 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
  * to generate better code.
  */
 #ifdef CONFIG_LOCKDEP
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
-		static struct lock_class_key __key;			\
-									\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
-		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
+		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
 		INIT_LIST_HEAD(&(_work)->entry);			\
 		(_work)->func = (_func);				\
 	} while (0)
 #else
-#define __INIT_WORK(_work, _func, _onstack)				\
+#define __INIT_WORK_KEY(_work, _func, _onstack, _key)			\
 	do {								\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
@@ -294,12 +292,22 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 	} while (0)
 #endif
 
+#define __INIT_WORK(_work, _func, _onstack)				\
+	do {								\
+		static __maybe_unused struct lock_class_key __key;	\
+									\
+		__INIT_WORK_KEY(_work, _func, _onstack, &__key);	\
+	} while (0)
+
 #define INIT_WORK(_work, _func)						\
 	__INIT_WORK((_work), (_func), 0)
 
 #define INIT_WORK_ONSTACK(_work, _func)					\
 	__INIT_WORK((_work), (_func), 1)
 
+#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)			\
+	__INIT_WORK_KEY((_work), (_func), 1, _key)
+
 #define __INIT_DELAYED_WORK(_work, _func, _tflags)			\
 	do {								\
 		INIT_WORK(&(_work)->work, (_func));			\
@@ -693,8 +701,32 @@ static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
 	return fn(arg);
 }
 #else
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key);
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu(_cpu, _fn, _arg)			\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_key(_cpu, _fn, _arg, &__key);	\
+})
+
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key);
+
+/*
+ * A new key is defined for each caller to make sure the work
+ * associated with the function doesn't share its locking class.
+ */
+#define work_on_cpu_safe(_cpu, _fn, _arg)		\
+({							\
+	static struct lock_class_key __key;		\
+							\
+	work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);	\
+})
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a3522b70218d..0f682da96e1c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5622,50 +5622,54 @@ static void work_for_cpu_fn(struct work_struct *work)
 }
 
 /**
- * work_on_cpu - run a function in thread context on a particular cpu
+ * work_on_cpu_key - run a function in thread context on a particular cpu
  * @cpu: the cpu to run on
  * @fn: the function to run
  * @arg: the function arg
+ * @key: The lock class key for lock debugging purposes
  *
  * It is up to the caller to ensure that the cpu doesn't go offline.
  * The caller must not hold any locks which would prevent @fn from completing.
  *
  * Return: The value @fn returns.
  */
-long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_key(int cpu, long (*fn)(void *),
+		     void *arg, struct lock_class_key *key)
 {
 	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+	INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
 	schedule_work_on(cpu, &wfc.work);
 	flush_work(&wfc.work);
 	destroy_work_on_stack(&wfc.work);
 	return wfc.ret;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu);
+EXPORT_SYMBOL_GPL(work_on_cpu_key);
 
 /**
- * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * work_on_cpu_safe_key - run a function in thread context on a particular cpu
  * @cpu: the cpu to run on
  * @fn:  the function to run
  * @arg: the function argument
+ * @key: The lock class key for lock debugging purposes
  *
  * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
  * any locks which would prevent @fn from completing.
  *
  * Return: The value @fn returns.
  */
-long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+			  void *arg, struct lock_class_key *key)
 {
 	long ret = -ENODEV;
 
 	cpus_read_lock();
 	if (cpu_online(cpu))
-		ret = work_on_cpu(cpu, fn, arg);
+		ret = work_on_cpu_key(cpu, fn, arg, key);
 	cpus_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
-- 
cgit v1.2.3


From 1bc628a756d93a7bed2f80914b15821bb9a4369c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 4 Oct 2023 14:53:06 -0400
Subject: bpf: convert to new timestamp accessors

Convert to using the new inode timestamp accessor functions.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20231004185347.80880-79-jlayton@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/bpf/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 99d0625b6c82..1aafb2ff2e95 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,8 +118,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,
 		return ERR_PTR(-ENOSPC);
 
 	inode->i_ino = get_next_ino();
-	inode->i_atime = inode_set_ctime_current(inode);
-	inode->i_mtime = inode->i_atime;
+	simple_inode_init_ts(inode);
 
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 
@@ -147,7 +146,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
 	d_instantiate(dentry, inode);
 	dget(dentry);
 
-	dir->i_mtime = inode_set_ctime_current(dir);
+	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 }
 
 static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-- 
cgit v1.2.3


From 4bbd9345565933823f38a419df65661f12adbe5e Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Tue, 17 Oct 2023 21:56:52 +0800
Subject: kprobes: kretprobe scalability improvement

kretprobe is using freelist to manage return-instances, but freelist,
as LIFO queue based on singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.

Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_prctl):

OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

         1T       2T       4T       8T      16T      24T
   24150045 29317964 15446741 12494489 18287272 17708768
        32T      48T      64T      72T      96T     128T
   16200682 13737658 11645677 11269858 10470118  9931051

This patch introduces objpool to replace freelist. objpool is a
high performance queue, which can bring near-linear scalability
to kretprobed routines. Tests of kretprobe throughput show the
biggest ratio as 159x of original freelist. Here's the result:

                  1T         2T         4T         8T        16T
native:     41186213   82336866  164250978  328662645  658810299
freelist:   24150045   29317964   15446741   12494489   18287272
objpool:    23926730   48010314   96125218  191782984  385091769
                 32T        48T        64T        96T       128T
native:   1330338351 1969957941 2512291791 2615754135 2671040914
freelist:   16200682   13737658   11645677   10470118    9931051
objpool:   764481096 1147149781 1456220214 1502109662 1579015050

Testings on 96-core ARM64 output similarly, but with the biggest
ratio up to 448x:

OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                  1T         2T         4T         8T        16T
native: .   30066096   63569843  126194076  257447289  505800181
freelist:   16152090   11064397   11124068    7215768    5663013
objpool:    13997541   28032100   55726624  110099926  221498787
                 24T        32T        48T        64T        96T
native:    763305277 1015925192 1521075123 2033009392 3021013752
freelist:    5015810    4602893    3766792    3382478    2945292
objpool:   328192025  439439564  668534502  887401381 1319972072

Link: https://lore.kernel.org/all/20231017135654.82270-4-wuqiang.matt@bytedance.com/

Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 11 ++----
 include/linux/rethook.h | 16 +++------
 kernel/kprobes.c        | 91 +++++++++++++++++++++----------------------------
 kernel/trace/fprobe.c   | 28 ++++++---------
 kernel/trace/rethook.c  | 90 ++++++++++++++++++++++--------------------------
 5 files changed, 96 insertions(+), 140 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 85a64cb95d75..365eb092e9c4 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -26,8 +26,7 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
-#include <linux/refcount.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/rethook.h>
 #include <asm/kprobes.h>
 
@@ -141,7 +140,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
  */
 struct kretprobe_holder {
 	struct kretprobe	*rp;
-	refcount_t		ref;
+	struct objpool_head	pool;
 };
 
 struct kretprobe {
@@ -154,7 +153,6 @@ struct kretprobe {
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	struct rethook *rh;
 #else
-	struct freelist_head freelist;
 	struct kretprobe_holder *rph;
 #endif
 };
@@ -165,10 +163,7 @@ struct kretprobe_instance {
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 	struct rethook_node node;
 #else
-	union {
-		struct freelist_node freelist;
-		struct rcu_head rcu;
-	};
+	struct rcu_head rcu;
 	struct llist_node llist;
 	struct kretprobe_holder *rph;
 	kprobe_opcode_t *ret_addr;
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index 26b6f3c81a76..ce69b2b7bc35 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -6,11 +6,10 @@
 #define _LINUX_RETHOOK_H
 
 #include <linux/compiler.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/kallsyms.h>
 #include <linux/llist.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
 
 struct rethook_node;
 
@@ -30,14 +29,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
 struct rethook {
 	void			*data;
 	rethook_handler_t	handler;
-	struct freelist_head	pool;
-	refcount_t		ref;
+	struct objpool_head	pool;
 	struct rcu_head		rcu;
 };
 
 /**
  * struct rethook_node - The rethook shadow-stack entry node.
- * @freelist: The freelist, linked to struct rethook::pool.
  * @rcu: The rcu_head for deferred freeing.
  * @llist: The llist, linked to a struct task_struct::rethooks.
  * @rethook: The pointer to the struct rethook.
@@ -48,20 +45,16 @@ struct rethook {
  * on each entry of the shadow stack.
  */
 struct rethook_node {
-	union {
-		struct freelist_node freelist;
-		struct rcu_head      rcu;
-	};
+	struct rcu_head		rcu;
 	struct llist_node	llist;
 	struct rethook		*rethook;
 	unsigned long		ret_addr;
 	unsigned long		frame;
 };
 
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler, int size, int num);
 void rethook_stop(struct rethook *rh);
 void rethook_free(struct rethook *rh);
-void rethook_add_node(struct rethook *rh, struct rethook_node *node);
 struct rethook_node *rethook_try_get(struct rethook *rh);
 void rethook_recycle(struct rethook_node *node);
 void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount);
@@ -98,4 +91,3 @@ void rethook_flush_task(struct task_struct *tk);
 #endif
 
 #endif
-
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0c6185aefaef..075a632e6c7c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1877,13 +1877,27 @@ static struct notifier_block kprobe_exceptions_nb = {
 #ifdef CONFIG_KRETPROBES
 
 #if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
+
+/* callbacks for objpool of kretprobe instances */
+static int kretprobe_init_inst(void *nod, void *context)
+{
+	struct kretprobe_instance *ri = nod;
+
+	ri->rph = context;
+	return 0;
+}
+static int kretprobe_fini_pool(struct objpool_head *head, void *context)
+{
+	kfree(context);
+	return 0;
+}
+
 static void free_rp_inst_rcu(struct rcu_head *head)
 {
 	struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
+	struct kretprobe_holder *rph = ri->rph;
 
-	if (refcount_dec_and_test(&ri->rph->ref))
-		kfree(ri->rph);
-	kfree(ri);
+	objpool_drop(ri, &rph->pool);
 }
 NOKPROBE_SYMBOL(free_rp_inst_rcu);
 
@@ -1892,7 +1906,7 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
 	struct kretprobe *rp = get_kretprobe(ri);
 
 	if (likely(rp))
-		freelist_add(&ri->freelist, &rp->freelist);
+		objpool_push(ri, &rp->rph->pool);
 	else
 		call_rcu(&ri->rcu, free_rp_inst_rcu);
 }
@@ -1929,23 +1943,12 @@ NOKPROBE_SYMBOL(kprobe_flush_task);
 
 static inline void free_rp_inst(struct kretprobe *rp)
 {
-	struct kretprobe_instance *ri;
-	struct freelist_node *node;
-	int count = 0;
-
-	node = rp->freelist.head;
-	while (node) {
-		ri = container_of(node, struct kretprobe_instance, freelist);
-		node = node->next;
-
-		kfree(ri);
-		count++;
-	}
+	struct kretprobe_holder *rph = rp->rph;
 
-	if (refcount_sub_and_test(count, &rp->rph->ref)) {
-		kfree(rp->rph);
-		rp->rph = NULL;
-	}
+	if (!rph)
+		return;
+	rp->rph = NULL;
+	objpool_fini(&rph->pool);
 }
 
 /* This assumes the 'tsk' is the current task or the is not running. */
@@ -2087,19 +2090,17 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
 static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	struct kretprobe_holder *rph = rp->rph;
 	struct kretprobe_instance *ri;
-	struct freelist_node *fn;
 
-	fn = freelist_try_get(&rp->freelist);
-	if (!fn) {
+	ri = objpool_pop(&rph->pool);
+	if (!ri) {
 		rp->nmissed++;
 		return 0;
 	}
 
-	ri = container_of(fn, struct kretprobe_instance, freelist);
-
 	if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-		freelist_add(&ri->freelist, &rp->freelist);
+		objpool_push(ri, &rph->pool);
 		return 0;
 	}
 
@@ -2193,7 +2194,6 @@ int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long o
 int register_kretprobe(struct kretprobe *rp)
 {
 	int ret;
-	struct kretprobe_instance *inst;
 	int i;
 	void *addr;
 
@@ -2227,19 +2227,12 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
 
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
-	rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
-	if (!rp->rh)
-		return -ENOMEM;
+	rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler,
+				sizeof(struct kretprobe_instance) +
+				rp->data_size, rp->maxactive);
+	if (IS_ERR(rp->rh))
+		return PTR_ERR(rp->rh);
 
-	for (i = 0; i < rp->maxactive; i++) {
-		inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
-		if (inst == NULL) {
-			rethook_free(rp->rh);
-			rp->rh = NULL;
-			return -ENOMEM;
-		}
-		rethook_add_node(rp->rh, &inst->node);
-	}
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
 	ret = register_kprobe(&rp->kp);
@@ -2248,24 +2241,18 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->rh = NULL;
 	}
 #else	/* !CONFIG_KRETPROBE_ON_RETHOOK */
-	rp->freelist.head = NULL;
 	rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
 	if (!rp->rph)
 		return -ENOMEM;
 
-	rp->rph->rp = rp;
-	for (i = 0; i < rp->maxactive; i++) {
-		inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL);
-		if (inst == NULL) {
-			refcount_set(&rp->rph->ref, i);
-			free_rp_inst(rp);
-			return -ENOMEM;
-		}
-		inst->rph = rp->rph;
-		freelist_add(&inst->freelist, &rp->freelist);
+	if (objpool_init(&rp->rph->pool, rp->maxactive, rp->data_size +
+			sizeof(struct kretprobe_instance), GFP_KERNEL,
+			rp->rph, kretprobe_init_inst, kretprobe_fini_pool)) {
+		kfree(rp->rph);
+		rp->rph = NULL;
+		return -ENOMEM;
 	}
-	refcount_set(&rp->rph->ref, i);
-
+	rp->rph->rp = rp;
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
 	ret = register_kprobe(&rp->kp);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 3b21f4063258..6890f4fd17fc 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -187,7 +187,7 @@ static void fprobe_init(struct fprobe *fp)
 
 static int fprobe_init_rethook(struct fprobe *fp, int num)
 {
-	int i, size;
+	int size;
 
 	if (num < 0)
 		return -EINVAL;
@@ -205,26 +205,18 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
 	if (size < 0)
 		return -E2BIG;
 
-	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
-	if (!fp->rethook)
-		return -ENOMEM;
-	for (i = 0; i < size; i++) {
-		struct fprobe_rethook_node *node;
-
-		node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
-		if (!node) {
-			rethook_free(fp->rethook);
-			fp->rethook = NULL;
-			return -ENOMEM;
-		}
-		rethook_add_node(fp->rethook, &node->node);
-	}
+	/* Initialize rethook */
+	fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler,
+				sizeof(struct fprobe_rethook_node), size);
+	if (IS_ERR(fp->rethook))
+		return PTR_ERR(fp->rethook);
+
 	return 0;
 }
 
 static void fprobe_fail_cleanup(struct fprobe *fp)
 {
-	if (fp->rethook) {
+	if (!IS_ERR_OR_NULL(fp->rethook)) {
 		/* Don't need to cleanup rethook->handler because this is not used. */
 		rethook_free(fp->rethook);
 		fp->rethook = NULL;
@@ -379,14 +371,14 @@ int unregister_fprobe(struct fprobe *fp)
 	if (!fprobe_is_registered(fp))
 		return -EINVAL;
 
-	if (fp->rethook)
+	if (!IS_ERR_OR_NULL(fp->rethook))
 		rethook_stop(fp->rethook);
 
 	ret = unregister_ftrace_function(&fp->ops);
 	if (ret < 0)
 		return ret;
 
-	if (fp->rethook)
+	if (!IS_ERR_OR_NULL(fp->rethook))
 		rethook_free(fp->rethook);
 
 	ftrace_free_filter(&fp->ops);
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 5eb9b598f4e9..13c8e6773892 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -9,6 +9,7 @@
 #include <linux/rethook.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
+#include <linux/smp.h>
 
 /* Return hook list (shadow stack by list) */
 
@@ -36,21 +37,7 @@ void rethook_flush_task(struct task_struct *tk)
 static void rethook_free_rcu(struct rcu_head *head)
 {
 	struct rethook *rh = container_of(head, struct rethook, rcu);
-	struct rethook_node *rhn;
-	struct freelist_node *node;
-	int count = 1;
-
-	node = rh->pool.head;
-	while (node) {
-		rhn = container_of(node, struct rethook_node, freelist);
-		node = node->next;
-		kfree(rhn);
-		count++;
-	}
-
-	/* The rh->ref is the number of pooled node + 1 */
-	if (refcount_sub_and_test(count, &rh->ref))
-		kfree(rh);
+	objpool_fini(&rh->pool);
 }
 
 /**
@@ -83,54 +70,62 @@ void rethook_free(struct rethook *rh)
 	call_rcu(&rh->rcu, rethook_free_rcu);
 }
 
+static int rethook_init_node(void *nod, void *context)
+{
+	struct rethook_node *node = nod;
+
+	node->rethook = context;
+	return 0;
+}
+
+static int rethook_fini_pool(struct objpool_head *head, void *context)
+{
+	kfree(context);
+	return 0;
+}
+
 /**
  * rethook_alloc() - Allocate struct rethook.
  * @data: a data to pass the @handler when hooking the return.
- * @handler: the return hook callback function.
+ * @handler: the return hook callback function, must NOT be NULL
+ * @size: node size: rethook node and additional data
+ * @num: number of rethook nodes to be preallocated
  *
  * Allocate and initialize a new rethook with @data and @handler.
- * Return NULL if memory allocation fails or @handler is NULL.
+ * Return pointer of new rethook, or error codes for failures.
+ *
  * Note that @handler == NULL means this rethook is going to be freed.
  */
-struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
+			      int size, int num)
 {
-	struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+	struct rethook *rh;
 
-	if (!rh || !handler) {
-		kfree(rh);
-		return NULL;
-	}
+	if (!handler || num <= 0 || size < sizeof(struct rethook_node))
+		return ERR_PTR(-EINVAL);
+
+	rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+	if (!rh)
+		return ERR_PTR(-ENOMEM);
 
 	rh->data = data;
 	rh->handler = handler;
-	rh->pool.head = NULL;
-	refcount_set(&rh->ref, 1);
 
+	/* initialize the objpool for rethook nodes */
+	if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
+			 rethook_init_node, rethook_fini_pool)) {
+		kfree(rh);
+		return ERR_PTR(-ENOMEM);
+	}
 	return rh;
 }
 
-/**
- * rethook_add_node() - Add a new node to the rethook.
- * @rh: the struct rethook.
- * @node: the struct rethook_node to be added.
- *
- * Add @node to @rh. User must allocate @node (as a part of user's
- * data structure.) The @node fields are initialized in this function.
- */
-void rethook_add_node(struct rethook *rh, struct rethook_node *node)
-{
-	node->rethook = rh;
-	freelist_add(&node->freelist, &rh->pool);
-	refcount_inc(&rh->ref);
-}
-
 static void free_rethook_node_rcu(struct rcu_head *head)
 {
 	struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+	struct rethook *rh = node->rethook;
 
-	if (refcount_dec_and_test(&node->rethook->ref))
-		kfree(node->rethook);
-	kfree(node);
+	objpool_drop(node, &rh->pool);
 }
 
 /**
@@ -145,7 +140,7 @@ void rethook_recycle(struct rethook_node *node)
 	lockdep_assert_preemption_disabled();
 
 	if (likely(READ_ONCE(node->rethook->handler)))
-		freelist_add(&node->freelist, &node->rethook->pool);
+		objpool_push(node, &node->rethook->pool);
 	else
 		call_rcu(&node->rcu, free_rethook_node_rcu);
 }
@@ -161,7 +156,6 @@ NOKPROBE_SYMBOL(rethook_recycle);
 struct rethook_node *rethook_try_get(struct rethook *rh)
 {
 	rethook_handler_t handler = READ_ONCE(rh->handler);
-	struct freelist_node *fn;
 
 	lockdep_assert_preemption_disabled();
 
@@ -178,11 +172,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
 	if (unlikely(!rcu_is_watching()))
 		return NULL;
 
-	fn = freelist_try_get(&rh->pool);
-	if (!fn)
-		return NULL;
-
-	return container_of(fn, struct rethook_node, freelist);
+	return (struct rethook_node *)objpool_pop(&rh->pool);
 }
 NOKPROBE_SYMBOL(rethook_try_get);
 
-- 
cgit v1.2.3


From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:51 +0100
Subject: mm: make __access_remote_vm() static

Patch series "various improvements to the GUP interface", v2.

A series of fixes to simplify and improve the GUP interface with an eye to
providing groundwork to future improvements:-

* __access_remote_vm() and access_remote_vm() are functionally identical,
  so make the former static such that in future we can potentially change
  the external-facing implementation details of this function.

* Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and
  simplify things by defining INTERNAL_GUP_FLAGS to check against.

* Adjust __get_user_pages_locked() to explicitly treat a failure to pin any
  pages as an error in all circumstances other than FOLL_NOWAIT being
  specified, bringing it in line with the nommu implementation of this
  function.

* (With many thanks to Arnd who suggested this in the first instance)
  Update get_user_page_vma_remote() to explicitly only return a page or an
  error, simplifying the interface and avoiding the questionable
  IS_ERR_OR_NULL() pattern.


This patch (of 4):

access_remote_vm() passes through parameters to __access_remote_vm()
directly, so remove the __access_remote_vm() function from mm.h and use
access_remote_vm() in the one caller that needs it (ptrace_access_vm()).

This allows future adjustments to the GUP-internal __access_remote_vm()
function while keeping the access_remote_vm() function stable.

Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 kernel/ptrace.c    | 2 +-
 mm/memory.c        | 4 ++--
 mm/nommu.c         | 4 ++--
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 52c40b3d0813..7b89f7bd420d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2415,8 +2415,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
-			      void *buf, int len, unsigned int gup_flags);
 
 long get_user_pages_remote(struct mm_struct *mm,
 			   unsigned long start, unsigned long nr_pages,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..d8b5e13a2229 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
 		return 0;
 	}
 
-	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
+	ret = access_remote_vm(mm, addr, buf, len, gup_flags);
 	mmput(mm);
 
 	return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 22784d3b886d..e47c36c0aef0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5791,8 +5791,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
 /*
  * Access another process' address space as given in mm.
  */
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
-		       int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+			      void *buf, int len, unsigned int gup_flags)
 {
 	void *old_buf = buf;
 	int write = gup_flags & FOLL_WRITE;
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..f9553579389b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1651,8 +1651,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 }
 EXPORT_SYMBOL(filemap_map_pages);
 
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
-		       int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+			      void *buf, int len, unsigned int gup_flags)
 {
 	struct vm_area_struct *vma;
 	int write = gup_flags & FOLL_WRITE;
-- 
cgit v1.2.3


From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Tue, 3 Oct 2023 00:14:54 +0100
Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL

get_user_pages_remote() will never return 0 except in the case of
FOLL_NOWAIT being specified, which we explicitly disallow.

This simplifies error handling for the caller and avoids the awkwardness
of dealing with both errors and failing to pin.  Failing to pin here is an
error.

Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/kernel/mte.c |  4 ++--
 include/linux/mm.h      | 12 +++++++++---
 kernel/events/uprobes.c |  4 ++--
 mm/memory.c             |  3 +--
 4 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 4edecaac8f91..8878b392df58 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -411,8 +411,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
 		struct page *page = get_user_page_vma_remote(mm, addr,
 							     gup_flags, &vma);
 
-		if (IS_ERR_OR_NULL(page)) {
-			err = page == NULL ? -EIO : PTR_ERR(page);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
 			break;
 		}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b89f7bd420d..fa608cba041f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2425,6 +2425,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
 			   unsigned int gup_flags, struct page **pages,
 			   int *locked);
 
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
 static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 						    unsigned long addr,
 						    int gup_flags,
@@ -2432,12 +2435,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
 {
 	struct page *page;
 	struct vm_area_struct *vma;
-	int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+	int got;
+
+	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+		return ERR_PTR(-EINVAL);
+
+	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
 
 	if (got < 0)
 		return ERR_PTR(got);
-	if (got == 0)
-		return NULL;
 
 	vma = vma_lookup(mm, addr);
 	if (WARN_ON_ONCE(!vma)) {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 3048589e2e85..435aac1d8c27 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -474,8 +474,8 @@ retry:
 		gup_flags |= FOLL_SPLIT_PMD;
 	/* Read the page with vaddr into memory */
 	old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
-	if (IS_ERR_OR_NULL(old_page))
-		return old_page ? PTR_ERR(old_page) : 0;
+	if (IS_ERR(old_page))
+		return PTR_ERR(old_page);
 
 	ret = verify_opcode(old_page, vaddr, &opcode);
 	if (ret <= 0)
diff --git a/mm/memory.c b/mm/memory.c
index e47c36c0aef0..1f88e4f6fbf2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5815,7 +5815,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		struct page *page = get_user_page_vma_remote(mm, addr,
 							     gup_flags, &vma);
 
-		if (IS_ERR_OR_NULL(page)) {
+		if (IS_ERR(page)) {
 			/* We might need to expand the stack to access it */
 			vma = vma_lookup(mm, addr);
 			if (!vma) {
@@ -5829,7 +5829,6 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
 				continue;
 			}
 
-
 			/*
 			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
 			 * we can access using slightly different code.
-- 
cgit v1.2.3


From 247dbcdbf790c52fc76cf8e327cd0a5778e41e66 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:07 +0100
Subject: bitops: add xor_unlock_is_negative_byte()

Replace clear_bit_and_unlock_is_negative_byte() with
xor_unlock_is_negative_byte().  We have a few places that like to lock a
folio, set a flag and unlock it again.  Allow for the possibility of
combining the latter two operations for efficiency.  We are guaranteed
that the caller holds the lock, so it is safe to unlock it with the xor.
The caller must guarantee that nobody else will set the flag without
holding the lock; it is not safe to do this with the PG_dirty flag, for
example.

Link: https://lkml.kernel.org/r/20231004165317.1061855-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/include/asm/bitops.h              | 17 +++++-----------
 arch/x86/include/asm/bitops.h                  | 11 +++++------
 include/asm-generic/bitops/instrumented-lock.h | 27 ++++++++++++++------------
 include/asm-generic/bitops/lock.h              | 21 +++++---------------
 kernel/kcsan/kcsan_test.c                      |  8 ++++----
 kernel/kcsan/selftest.c                        |  8 ++++----
 mm/filemap.c                                   |  5 +++++
 mm/kasan/kasan_test.c                          |  7 ++++---
 8 files changed, 47 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 7e0f0322912b..40cc3ded60cb 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -234,32 +234,25 @@ static inline int arch_test_and_change_bit(unsigned long nr,
 }
 
 #ifdef CONFIG_PPC64
-static inline unsigned long
-clear_bit_unlock_return_word(int nr, volatile unsigned long *addr)
+static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+		volatile unsigned long *p)
 {
 	unsigned long old, t;
-	unsigned long *p = (unsigned long *)addr + BIT_WORD(nr);
-	unsigned long mask = BIT_MASK(nr);
 
 	__asm__ __volatile__ (
 	PPC_RELEASE_BARRIER
 "1:"	PPC_LLARX "%0,0,%3,0\n"
-	"andc %1,%0,%2\n"
+	"xor %1,%0,%2\n"
 	PPC_STLCX "%1,0,%3\n"
 	"bne- 1b\n"
 	: "=&r" (old), "=&r" (t)
 	: "r" (mask), "r" (p)
 	: "cc", "memory");
 
-	return old;
+	return (old & BIT_MASK(7)) != 0;
 }
 
-/*
- * This is a special function for mm/filemap.c
- * Bit 7 corresponds to PG_waiters.
- */
-#define arch_clear_bit_unlock_is_negative_byte(nr, addr)		\
-	(clear_bit_unlock_return_word(nr, addr) & BIT_MASK(7))
+#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
 
 #endif /* CONFIG_PPC64 */
 
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 2edf68475fec..f03c0a50ec3a 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr)
 	asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
 }
 
-static __always_inline bool
-arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
+static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+		volatile unsigned long *addr)
 {
 	bool negative;
-	asm volatile(LOCK_PREFIX "andb %2,%1"
+	asm volatile(LOCK_PREFIX "xorb %2,%1"
 		CC_SET(s)
 		: CC_OUT(s) (negative), WBYTE_ADDR(addr)
-		: "ir" ((char) ~(1 << nr)) : "memory");
+		: "iq" ((char)mask) : "memory");
 	return negative;
 }
-#define arch_clear_bit_unlock_is_negative_byte                                 \
-	arch_clear_bit_unlock_is_negative_byte
+#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
 
 static __always_inline void
 arch___clear_bit_unlock(long nr, volatile unsigned long *addr)
diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h
index eb64bd4f11f3..e8ea3aeda9a9 100644
--- a/include/asm-generic/bitops/instrumented-lock.h
+++ b/include/asm-generic/bitops/instrumented-lock.h
@@ -58,27 +58,30 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
 	return arch_test_and_set_bit_lock(nr, addr);
 }
 
-#if defined(arch_clear_bit_unlock_is_negative_byte)
+#if defined(arch_xor_unlock_is_negative_byte)
 /**
- * clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom
- *                                     byte is negative, for unlock.
- * @nr: the bit to clear
- * @addr: the address to start counting from
+ * xor_unlock_is_negative_byte - XOR a single byte in memory and test if
+ * it is negative, for unlock.
+ * @mask: Change the bits which are set in this mask.
+ * @addr: The address of the word containing the byte to change.
  *
+ * Changes some of bits 0-6 in the word pointed to by @addr.
  * This operation is atomic and provides release barrier semantics.
+ * Used to optimise some folio operations which are commonly paired
+ * with an unlock or end of writeback.  Bit 7 is used as PG_waiters to
+ * indicate whether anybody is waiting for the unlock.
  *
- * This is a bit of a one-trick-pony for the filemap code, which clears
- * PG_locked and tests PG_waiters,
+ * Return: Whether the top bit of the byte is set.
  */
-static inline bool
-clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
+static inline bool xor_unlock_is_negative_byte(unsigned long mask,
+		volatile unsigned long *addr)
 {
 	kcsan_release();
-	instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
-	return arch_clear_bit_unlock_is_negative_byte(nr, addr);
+	instrument_atomic_write(addr, sizeof(long));
+	return arch_xor_unlock_is_negative_byte(mask, addr);
 }
 /* Let everybody know we have it. */
-#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte
+#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte
 #endif
 
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */
diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h
index 40913516e654..6a638e89d130 100644
--- a/include/asm-generic/bitops/lock.h
+++ b/include/asm-generic/bitops/lock.h
@@ -66,27 +66,16 @@ arch___clear_bit_unlock(unsigned int nr, volatile unsigned long *p)
 	raw_atomic_long_set_release((atomic_long_t *)p, old);
 }
 
-/**
- * arch_clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom
- *                                          byte is negative, for unlock.
- * @nr: the bit to clear
- * @addr: the address to start counting from
- *
- * This is a bit of a one-trick-pony for the filemap code, which clears
- * PG_locked and tests PG_waiters,
- */
-#ifndef arch_clear_bit_unlock_is_negative_byte
-static inline bool arch_clear_bit_unlock_is_negative_byte(unsigned int nr,
-							  volatile unsigned long *p)
+#ifndef arch_xor_unlock_is_negative_byte
+static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+		volatile unsigned long *p)
 {
 	long old;
-	unsigned long mask = BIT_MASK(nr);
 
-	p += BIT_WORD(nr);
-	old = raw_atomic_long_fetch_andnot_release(mask, (atomic_long_t *)p);
+	old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p);
 	return !!(old & BIT(7));
 }
-#define arch_clear_bit_unlock_is_negative_byte arch_clear_bit_unlock_is_negative_byte
+#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
 #endif
 
 #include <asm-generic/bitops/instrumented-lock.h>
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 0ddbdab5903d..1333d23ac4ef 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -700,10 +700,10 @@ static void test_barrier_nothreads(struct kunit *test)
 	KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
 	KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
 
-#ifdef clear_bit_unlock_is_negative_byte
-	KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
-	KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
-	KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
+#ifdef xor_unlock_is_negative_byte
+	KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+	KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
+	KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
 #endif
 	kcsan_nestable_atomic_end();
 }
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 8679322450f2..619be7417420 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -228,10 +228,10 @@ static bool __init test_barrier(void)
 	spin_lock(&test_spinlock);
 	KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
 
-#ifdef clear_bit_unlock_is_negative_byte
-	KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
-	KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
-	KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
+#ifdef xor_unlock_is_negative_byte
+	KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+	KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
+	KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
 #endif
 	kcsan_nestable_atomic_end();
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 1ce78c619294..c637863f4643 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1482,6 +1482,11 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
 }
 EXPORT_SYMBOL_GPL(folio_add_wait_queue);
 
+#ifdef xor_unlock_is_negative_byte
+#define clear_bit_unlock_is_negative_byte(nr, p)	\
+	xor_unlock_is_negative_byte(1 << nr, p)
+#endif
+
 #ifndef clear_bit_unlock_is_negative_byte
 
 /*
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 2030c7ff7de9..821c9ea0473a 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -1099,9 +1099,10 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
 	KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
 	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
 
-#if defined(clear_bit_unlock_is_negative_byte)
-	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
-				clear_bit_unlock_is_negative_byte(nr, addr));
+#if defined(xor_unlock_is_negative_byte)
+	if (nr < 7)
+		KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
+				xor_unlock_is_negative_byte(1 << nr, addr));
 #endif
 }
 
-- 
cgit v1.2.3


From f12fb73b74fd23ca33e3f95fb996f295eeae1da7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 4 Oct 2023 17:53:14 +0100
Subject: mm: delete checks for xor_unlock_is_negative_byte()

Architectures which don't define their own use the one in
asm-generic/bitops/lock.h.  Get rid of all the ifdefs around "maybe we
don't have it".

Link: https://lkml.kernel.org/r/20231004165317.1061855-15-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/include/asm/bitops.h                |  1 -
 arch/m68k/include/asm/bitops.h                 |  1 -
 arch/mips/include/asm/bitops.h                 |  1 -
 arch/riscv/include/asm/bitops.h                |  1 -
 include/asm-generic/bitops/instrumented-lock.h |  5 -----
 include/asm-generic/bitops/lock.h              |  1 -
 kernel/kcsan/kcsan_test.c                      |  3 ---
 kernel/kcsan/selftest.c                        |  3 ---
 mm/filemap.c                                   | 30 +-------------------------
 mm/kasan/kasan_test.c                          |  3 ---
 10 files changed, 1 insertion(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index b50ad6b83e85..3e33621922c3 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -305,7 +305,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
 
 	return (old & BIT(7)) != 0;
 }
-#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte
 
 /*
  * ffz = Find First Zero in word. Undefined if no zero exists,
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 80ee36095905..14c64a6f1217 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -339,7 +339,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
 	return result;
 #endif
 }
-#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte
 
 /*
  *	The true 68020 and more advanced processors support the "bfffo"
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index d98a05c478f4..89f73d1a4ea4 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -301,7 +301,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
 
 	return res;
 }
-#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte
 
 #undef __bit_op
 #undef __test_bit_op
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index 15e3044298a2..65f6eee4ab8d 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -202,7 +202,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
 		: "memory");
 	return (res & BIT(7)) != 0;
 }
-#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte
 
 #undef __test_and_op_bit
 #undef __op_bit
diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h
index e8ea3aeda9a9..542d3727ee4e 100644
--- a/include/asm-generic/bitops/instrumented-lock.h
+++ b/include/asm-generic/bitops/instrumented-lock.h
@@ -58,7 +58,6 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
 	return arch_test_and_set_bit_lock(nr, addr);
 }
 
-#if defined(arch_xor_unlock_is_negative_byte)
 /**
  * xor_unlock_is_negative_byte - XOR a single byte in memory and test if
  * it is negative, for unlock.
@@ -80,8 +79,4 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask,
 	instrument_atomic_write(addr, sizeof(long));
 	return arch_xor_unlock_is_negative_byte(mask, addr);
 }
-/* Let everybody know we have it. */
-#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte
-#endif
-
 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */
diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h
index 6a638e89d130..14d4ec8c5152 100644
--- a/include/asm-generic/bitops/lock.h
+++ b/include/asm-generic/bitops/lock.h
@@ -75,7 +75,6 @@ static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
 	old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p);
 	return !!(old & BIT(7));
 }
-#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte
 #endif
 
 #include <asm-generic/bitops/instrumented-lock.h>
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 1333d23ac4ef..015586217875 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -699,12 +699,9 @@ static void test_barrier_nothreads(struct kunit *test)
 	KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
 	KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
 	KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
-
-#ifdef xor_unlock_is_negative_byte
 	KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
 	KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
 	KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true);
-#endif
 	kcsan_nestable_atomic_end();
 }
 
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 619be7417420..84a1200271af 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -227,12 +227,9 @@ static bool __init test_barrier(void)
 	KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
 	spin_lock(&test_spinlock);
 	KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
-
-#ifdef xor_unlock_is_negative_byte
 	KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
 	KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
 	KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var));
-#endif
 	kcsan_nestable_atomic_end();
 
 	return ret;
diff --git a/mm/filemap.c b/mm/filemap.c
index c637863f4643..458377e9a184 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1482,34 +1482,6 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
 }
 EXPORT_SYMBOL_GPL(folio_add_wait_queue);
 
-#ifdef xor_unlock_is_negative_byte
-#define clear_bit_unlock_is_negative_byte(nr, p)	\
-	xor_unlock_is_negative_byte(1 << nr, p)
-#endif
-
-#ifndef clear_bit_unlock_is_negative_byte
-
-/*
- * PG_waiters is the high bit in the same byte as PG_lock.
- *
- * On x86 (and on many other architectures), we can clear PG_lock and
- * test the sign bit at the same time. But if the architecture does
- * not support that special operation, we just do this all by hand
- * instead.
- *
- * The read of PG_waiters has to be after (or concurrently with) PG_locked
- * being cleared, but a memory barrier should be unnecessary since it is
- * in the same byte as PG_locked.
- */
-static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
-{
-	clear_bit_unlock(nr, mem);
-	/* smp_mb__after_atomic(); */
-	return test_bit(PG_waiters, mem);
-}
-
-#endif
-
 /**
  * folio_unlock - Unlock a locked folio.
  * @folio: The folio.
@@ -1525,7 +1497,7 @@ void folio_unlock(struct folio *folio)
 	BUILD_BUG_ON(PG_waiters != 7);
 	BUILD_BUG_ON(PG_locked > 7);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+	if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0)))
 		folio_wake_bit(folio, PG_locked);
 }
 EXPORT_SYMBOL(folio_unlock);
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index 821c9ea0473a..8281eb42464b 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -1098,12 +1098,9 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
 	KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
 	KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
 	KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
-
-#if defined(xor_unlock_is_negative_byte)
 	if (nr < 7)
 		KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
 				xor_unlock_is_negative_byte(1 << nr, addr));
-#endif
 }
 
 static void kasan_bitops_generic(struct kunit *test)
-- 
cgit v1.2.3


From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001
From: Nhat Pham <nphamcs@gmail.com>
Date: Fri, 6 Oct 2023 11:46:28 -0700
Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller

Currently, hugetlb memory usage is not acounted for in the memory
controller, which could lead to memory overprotection for cgroups with
hugetlb-backed memory.  This has been observed in our production system.

For instance, here is one of our usecases: suppose there are two 32G
containers.  The machine is booted with hugetlb_cma=6G, and each container
may or may not use up to 3 gigantic page, depending on the workload within
it.  The rest is anon, cache, slab, etc.  We can set the hugetlb cgroup
limit of each cgroup to 3G to enforce hugetlb fairness.  But it is very
difficult to configure memory.max to keep overall consumption, including
anon, cache, slab etc.  fair.

What we have had to resort to is to constantly poll hugetlb usage and
readjust memory.max.  Similar procedure is done to other memory limits
(memory.low for e.g).  However, this is rather cumbersome and buggy.
Furthermore, when there is a delay in memory limits correction, (for e.g
when hugetlb usage changes within consecutive runs of the userspace
agent), the system could be in an over/underprotected state.

This patch rectifies this issue by charging the memcg when the hugetlb
folio is utilized, and uncharging when the folio is freed (analogous to
the hugetlb controller).  Note that we do not charge when the folio is
allocated to the hugetlb pool, because at this point it is not owned by
any memcg.

Some caveats to consider:
  * This feature is only available on cgroup v2.
  * There is no hugetlb pool management involved in the memory
    controller. As stated above, hugetlb folios are only charged towards
    the memory controller when it is used. Host overcommit management
    has to consider it when configuring hard limits.
  * Failure to charge towards the memcg results in SIGBUS. This could
    happen even if the hugetlb pool still has pages (but the cgroup
    limit is hit and reclaim attempt fails).
  * When this feature is enabled, hugetlb pages contribute to memory
    reclaim protection. low, min limits tuning must take into account
    hugetlb memory.
  * Hugetlb pages utilized while this option is not selected will not
    be tracked by the memory controller (even if cgroup v2 is remounted
    later on).

Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Frank van der Linden <fvdl@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun heo <tj@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 29 +++++++++++++++++++++++
 include/linux/cgroup-defs.h             |  5 ++++
 include/linux/memcontrol.h              |  9 +++++++
 kernel/cgroup/cgroup.c                  | 15 +++++++++++-
 mm/hugetlb.c                            | 35 +++++++++++++++++++++------
 mm/memcontrol.c                         | 42 ++++++++++++++++++++++++++++++++-
 mm/migrate.c                            |  3 +--
 7 files changed, 127 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 622a7f28db1f..606b2e0eac4b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options.
         relying on the original semantics (e.g. specifying bogusly
         high 'bypass' protection values at higher tree levels).
 
+  memory_hugetlb_accounting
+        Count HugeTLB memory usage towards the cgroup's overall
+        memory usage for the memory controller (for the purpose of
+        statistics reporting and memory protetion). This is a new
+        behavior that could regress existing setups, so it must be
+        explicitly opted in with this mount option.
+
+        A few caveats to keep in mind:
+
+        * There is no HugeTLB pool management involved in the memory
+          controller. The pre-allocated pool does not belong to anyone.
+          Specifically, when a new HugeTLB folio is allocated to
+          the pool, it is not accounted for from the perspective of the
+          memory controller. It is only charged to a cgroup when it is
+          actually used (for e.g at page fault time). Host memory
+          overcommit management has to consider this when configuring
+          hard limits. In general, HugeTLB pool management should be
+          done via other mechanisms (such as the HugeTLB controller).
+        * Failure to charge a HugeTLB folio to the memory controller
+          results in SIGBUS. This could happen even if the HugeTLB pool
+          still has pages available (but the cgroup limit is hit and
+          reclaim attempt fails).
+        * Charging HugeTLB memory towards the memory controller affects
+          memory protection and reclaim dynamics. Any userspace tuning
+          (of low, min limits for e.g) needs to take this into account.
+        * HugeTLB pages utilized while this option is not selected
+          will not be tracked by the memory controller (even if cgroup
+          v2 is remounted later on).
+
 
 Organizing Processes and Threads
 --------------------------------
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index f1b3151ac30b..8641f4320c98 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -115,6 +115,11 @@ enum {
 	 * Enable recursive subtree protection
 	 */
 	CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
+
+	/*
+	 * Enable hugetlb accounting for the memory controller.
+	 */
+	 CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
 };
 
 /* cftype->flags */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a3fce570122e..6674c12725d5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }
 
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+		long nr_pages);
+
 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
 	return 0;
 }
 
+static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
+		gfp_t gfp, long nr_pages)
+{
+	return 0;
+}
+
 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..f11488b18ceb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1902,6 +1902,7 @@ enum cgroup2_param {
 	Opt_favordynmods,
 	Opt_memory_localevents,
 	Opt_memory_recursiveprot,
+	Opt_memory_hugetlb_accounting,
 	nr__cgroup2_params
 };
 
@@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
 	fsparam_flag("favordynmods",		Opt_favordynmods),
 	fsparam_flag("memory_localevents",	Opt_memory_localevents),
 	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
+	fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
 	{}
 };
 
@@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
 	case Opt_memory_recursiveprot:
 		ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 		return 0;
+	case Opt_memory_hugetlb_accounting:
+		ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+		return 0;
 	}
 	return -EINVAL;
 }
@@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
 			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
 		else
 			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+		if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+		else
+			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
 	}
 }
 
@@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
 		seq_puts(seq, ",memory_localevents");
 	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
 		seq_puts(seq, ",memory_recursiveprot");
+	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+		seq_puts(seq, ",memory_hugetlb_accounting");
 	return 0;
 }
 
@@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
 			"nsdelegate\n"
 			"favordynmods\n"
 			"memory_localevents\n"
-			"memory_recursiveprot\n");
+			"memory_recursiveprot\n"
+			"memory_hugetlb_accounting\n");
 }
 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2b1c417b90a..da6f85b7db88 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio)
 				     pages_per_huge_page(h), folio);
 	hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 					  pages_per_huge_page(h), folio);
+	mem_cgroup_uncharge(folio);
 	if (restore_reserve)
 		h->resv_huge_pages++;
 
@@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct folio *folio;
-	long map_chg, map_commit;
+	long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
 	long gbl_chg;
-	int ret, idx;
+	int memcg_charge_ret, ret, idx;
 	struct hugetlb_cgroup *h_cg = NULL;
+	struct mem_cgroup *memcg;
 	bool deferred_reserve;
+	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+
+	memcg = get_mem_cgroup_from_current();
+	memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
+	if (memcg_charge_ret == -ENOMEM) {
+		mem_cgroup_put(memcg);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	idx = hstate_index(h);
 	/*
@@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	 * code of zero indicates a reservation exists (no change).
 	 */
 	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
-	if (map_chg < 0)
+	if (map_chg < 0) {
+		if (!memcg_charge_ret)
+			mem_cgroup_cancel_charge(memcg, nr_pages);
+		mem_cgroup_put(memcg);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	/*
 	 * Processes that did not create the mapping will have no
@@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	 */
 	if (map_chg || avoid_reserve) {
 		gbl_chg = hugepage_subpool_get_pages(spool, 1);
-		if (gbl_chg < 0) {
-			vma_end_reservation(h, vma, addr);
-			return ERR_PTR(-ENOSPC);
-		}
+		if (gbl_chg < 0)
+			goto out_end_reservation;
 
 		/*
 		 * Even though there was no reservation in the region/reserve
@@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 			hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
 					pages_per_huge_page(h), folio);
 	}
+
+	if (!memcg_charge_ret)
+		mem_cgroup_commit_charge(folio, memcg);
+	mem_cgroup_put(memcg);
+
 	return folio;
 
 out_uncharge_cgroup:
@@ -3147,7 +3164,11 @@ out_uncharge_cgroup_reservation:
 out_subpool_put:
 	if (map_chg || avoid_reserve)
 		hugepage_subpool_put_pages(spool, 1);
+out_end_reservation:
 	vma_end_reservation(h, vma, addr);
+	if (!memcg_charge_ret)
+		mem_cgroup_cancel_charge(memcg, nr_pages);
+	mem_cgroup_put(memcg);
 	return ERR_PTR(-ENOSPC);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 135d6637afe5..a86e7b445800 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7096,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
 	return ret;
 }
 
+/**
+ * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
+ * @memcg: memcg to charge.
+ * @gfp: reclaim mode.
+ * @nr_pages: number of pages to charge.
+ *
+ * This function is called when allocating a huge page folio to determine if
+ * the memcg has the capacity for it. It does not commit the charge yet,
+ * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ *
+ * Once we have obtained the hugetlb folio, we can call
+ * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
+ * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
+ * of try_charge().
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+			long nr_pages)
+{
+	/*
+	 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
+	 * but do not attempt to commit charge later (or cancel on error) either.
+	 */
+	if (mem_cgroup_disabled() || !memcg ||
+		!cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+		!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+		return -EOPNOTSUPP;
+
+	if (try_charge(memcg, gfp, nr_pages))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /**
  * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
  * @folio: folio to charge.
@@ -7365,7 +7400,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 		return;
 
 	memcg = folio_memcg(old);
-	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
+	/*
+	 * Note that it is normal to see !memcg for a hugetlb folio.
+	 * For e.g, itt could have been allocated when memory_hugetlb_accounting
+	 * was not selected.
+	 */
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
 	if (!memcg)
 		return;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 9d9eee5322d1..c602bf6dec97 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 
 	folio_copy_owner(newfolio, folio);
 
-	if (!folio_test_hugetlb(folio))
-		mem_cgroup_migrate(folio, newfolio);
+	mem_cgroup_migrate(folio, newfolio);
 }
 EXPORT_SYMBOL(folio_migrate_flags);
 
-- 
cgit v1.2.3


From 37acade0ce8938f00d6979bd02b8043b5b7089ae Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 10 Oct 2023 04:58:29 +0100
Subject: sched: remove wait bookmarks

There are no users of wait bookmarks left, so simplify the wait
code by removing them.

Link: https://lkml.kernel.org/r/20231010035829.544242-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Benjamin Segall <bsegall@google.com>
Cc: Bin Lai <sclaibin@gmail.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/wait.h |  9 +++-----
 kernel/sched/wait.c  | 60 +++++++++-------------------------------------------
 2 files changed, 13 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3473b663176f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
-#define WQ_FLAG_BOOKMARK	0x04
-#define WQ_FLAG_CUSTOM		0x08
-#define WQ_FLAG_DONE		0x10
-#define WQ_FLAG_PRIORITY	0x20
+#define WQ_FLAG_CUSTOM		0x04
+#define WQ_FLAG_DONE		0x08
+#define WQ_FLAG_PRIORITY	0x10
 
 /*
  * A single wait-queue entry structure:
@@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..51e38f5f4701 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -57,13 +57,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
-/*
- * Scan threshold to break wait queue walk.
- * This allows a waker to take a break from holding the
- * wait queue lock during the wait queue walk.
- */
-#define WAITQUEUE_WALK_BREAK_CNT 64
-
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
  * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
@@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key,
-			wait_queue_entry_t *bookmark)
+			int nr_exclusive, int wake_flags, void *key)
 {
 	wait_queue_entry_t *curr, *next;
-	int cnt = 0;
 
 	lockdep_assert_held(&wq_head->lock);
 
-	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
-		curr = list_next_entry(bookmark, entry);
-
-		list_del(&bookmark->entry);
-		bookmark->flags = 0;
-	} else
-		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+	curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
 
 	if (&curr->entry == &wq_head->head)
 		return nr_exclusive;
@@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 		unsigned flags = curr->flags;
 		int ret;
 
-		if (flags & WQ_FLAG_BOOKMARK)
-			continue;
-
 		ret = curr->func(curr, mode, wake_flags, key);
 		if (ret < 0)
 			break;
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
-
-		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
-				(&next->entry != &wq_head->head)) {
-			bookmark->flags = WQ_FLAG_BOOKMARK;
-			list_add_tail(&bookmark->entry, &next->entry);
-			break;
-		}
 	}
 
 	return nr_exclusive;
@@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
 			int nr_exclusive, int wake_flags, void *key)
 {
 	unsigned long flags;
-	wait_queue_entry_t bookmark;
-	int remaining = nr_exclusive;
+	int remaining;
 
-	bookmark.flags = 0;
-	bookmark.private = NULL;
-	bookmark.func = NULL;
-	INIT_LIST_HEAD(&bookmark.entry);
-
-	do {
-		spin_lock_irqsave(&wq_head->lock, flags);
-		remaining = __wake_up_common(wq_head, mode, remaining,
-						wake_flags, key, &bookmark);
-		spin_unlock_irqrestore(&wq_head->lock, flags);
-	} while (bookmark.flags & WQ_FLAG_BOOKMARK);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags,
+			key);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 
 	return nr_exclusive - remaining;
 }
@@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
+	__wake_up_common(wq_head, mode, 1, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
-{
-	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
-
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
@@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
 			       unsigned int mode, void *key)
 {
-        __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+        __wake_up_common(wq_head, mode, 1, WF_SYNC, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
 
-- 
cgit v1.2.3


From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Thu, 12 Oct 2023 18:04:28 +0100
Subject: mm: drop the assumption that VM_SHARED always implies writable

Patch series "permit write-sealed memfd read-only shared mappings", v4.

The man page for fcntl() describing memfd file seals states the following
about F_SEAL_WRITE:-

    Furthermore, trying to create new shared, writable memory-mappings via
    mmap(2) will also fail with EPERM.

With emphasis on 'writable'.  In turns out in fact that currently the
kernel simply disallows all new shared memory mappings for a memfd with
F_SEAL_WRITE applied, rendering this documentation inaccurate.

This matters because users are therefore unable to obtain a shared mapping
to a memfd after write sealing altogether, which limits their usefulness.
This was reported in the discussion thread [1] originating from a bug
report [2].

This is a product of both using the struct address_space->i_mmap_writable
atomic counter to determine whether writing may be permitted, and the
kernel adjusting this counter when any VM_SHARED mapping is performed and
more generally implicitly assuming VM_SHARED implies writable.

It seems sensible that we should only update this mapping if VM_MAYWRITE
is specified, i.e.  whether it is possible that this mapping could at any
point be written to.

If we do so then all we need to do to permit write seals to function as
documented is to clear VM_MAYWRITE when mapping read-only.  It turns out
this functionality already exists for F_SEAL_FUTURE_WRITE - we can
therefore simply adapt this logic to do the same for F_SEAL_WRITE.

We then hit a chicken and egg situation in mmap_region() where the check
for VM_MAYWRITE occurs before we are able to clear this flag.  To work
around this, perform this check after we invoke call_mmap(), with careful
consideration of error paths.

Thanks to Andy Lutomirski for the suggestion!

[1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/
[2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238


This patch (of 3):

There is a general assumption that VMAs with the VM_SHARED flag set are
writable.  If the VM_MAYWRITE flag is not set, then this is simply not the
case.

Update those checks which affect the struct address_space->i_mmap_writable
field to explicitly test for this by introducing
[vma_]is_shared_maywrite() helper functions.

This remains entirely conservative, as the lack of VM_MAYWRITE guarantees
that the VMA cannot be written to.

Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com
Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  4 ++--
 include/linux/mm.h | 11 +++++++++++
 kernel/fork.c      |  2 +-
 mm/filemap.c       |  2 +-
 mm/madvise.c       |  2 +-
 mm/mmap.c          | 12 ++++++------
 6 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd539c9fef8e..5265186da0e2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;
  *   It is also used to block modification of page cache contents through
  *   memory mappings.
  * @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
  * @nr_thps: Number of THPs in the pagecache (non-shmem only).
  * @i_mmap: Tree of private and shared mappings.
  * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)
 
 /*
  * Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
  * marks vma as VM_SHARED if it is shared, and the file was opened for
  * writing i.e. vma may be mprotected writable even if now readonly.
  *
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1bddb151cd5c..c3b5749ede9d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_ACCESS_FLAGS;
 }
 
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+		(VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+	return is_shared_maywrite(vma->vm_flags);
+}
+
 static inline
 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index e45a4457ba83..1e6c656e0857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 
 			get_file(file);
 			i_mmap_lock_write(mapping);
-			if (tmp->vm_flags & VM_SHARED)
+			if (vma_is_shared_maywrite(tmp))
 				mapping_allow_writable(mapping);
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
diff --git a/mm/filemap.c b/mm/filemap.c
index 9ef49255f1a5..9710f43a89ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
  */
 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+	if (vma_is_shared_maywrite(vma))
 		return -EINVAL;
 	return generic_file_mmap(file, vma);
 }
diff --git a/mm/madvise.c b/mm/madvise.c
index a6b48d4796ba..cf4d694280e9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 			return -EINVAL;
 	}
 
-	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+	if (!vma_is_shared_maywrite(vma))
 		return -EACCES;
 
 	offset = (loff_t)(start - vma->vm_start)
diff --git a/mm/mmap.c b/mm/mmap.c
index 3ea52451623b..0041e3631f6c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
 {
-	if (vma->vm_flags & VM_SHARED)
+	if (vma_is_shared_maywrite(vma))
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
@@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm,
 static void __vma_link_file(struct vm_area_struct *vma,
 			    struct address_space *mapping)
 {
-	if (vma->vm_flags & VM_SHARED)
+	if (vma_is_shared_maywrite(vma))
 		mapping_allow_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
@@ -2846,7 +2846,7 @@ cannot_expand:
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
-		if (vm_flags & VM_SHARED) {
+		if (is_shared_maywrite(vm_flags)) {
 			error = mapping_map_writable(file->f_mapping);
 			if (error)
 				goto free_vma;
@@ -2920,7 +2920,7 @@ cannot_expand:
 	mm->map_count++;
 	if (vma->vm_file) {
 		i_mmap_lock_write(vma->vm_file->f_mapping);
-		if (vma->vm_flags & VM_SHARED)
+		if (vma_is_shared_maywrite(vma))
 			mapping_allow_writable(vma->vm_file->f_mapping);
 
 		flush_dcache_mmap_lock(vma->vm_file->f_mapping);
@@ -2937,7 +2937,7 @@ cannot_expand:
 
 	/* Once vma denies write, undo our temporary denial count */
 unmap_writable:
-	if (file && vm_flags & VM_SHARED)
+	if (file && is_shared_maywrite(vm_flags))
 		mapping_unmap_writable(file->f_mapping);
 	file = vma->vm_file;
 	ksm_add_vma(vma);
@@ -2985,7 +2985,7 @@ unmap_and_free_vma:
 		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
 			     vma->vm_end, vma->vm_end, true);
 	}
-	if (file && (vm_flags & VM_SHARED))
+	if (file && is_shared_maywrite(vm_flags))
 		mapping_unmap_writable(file->f_mapping);
 free_vma:
 	vm_area_free(vma);
-- 
cgit v1.2.3


From a287116af12b33d8a03a281fce08af5acaac6ade Mon Sep 17 00:00:00 2001
From: Li kunyu <kunyu@nfschina.com>
Date: Tue, 26 Sep 2023 10:24:10 +0800
Subject: kernel/signal: remove unnecessary NULL values from ucounts

ucounts is assigned first, so it does not need to initialize the
assignment.

Link: https://lkml.kernel.org/r/20230926022410.4280-1-kunyu@nfschina.com
Signed-off-by: Li kunyu <kunyu@nfschina.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ccfc3ded5672..edaf39382d21 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -415,7 +415,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
 		 int override_rlimit, const unsigned int sigqueue_flags)
 {
 	struct sigqueue *q = NULL;
-	struct ucounts *ucounts = NULL;
+	struct ucounts *ucounts;
 	long sigpending;
 
 	/*
-- 
cgit v1.2.3


From fbd126f5a658b92c7f6af986a6d89cf5e5693268 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:52:20 -0700
Subject: gcov: annotate struct gcov_iterator with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute.  Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct gcov_iterator.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Link: https://lkml.kernel.org/r/20230922175220.work.327-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/gcov/fs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 5c3086cad8f9..01520689b57c 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -99,7 +99,7 @@ struct gcov_iterator {
 	struct gcov_info *info;
 	size_t size;
 	loff_t pos;
-	char buffer[];
+	char buffer[] __counted_by(size);
 };
 
 /**
-- 
cgit v1.2.3


From 68279f9c9f592e75d30a9ba5154a15e0a0b42ae8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 11 Oct 2023 19:55:00 +0300
Subject: treewide: mark stuff as __ro_after_init

__read_mostly predates __ro_after_init. Many variables which are marked
__read_mostly should have been __ro_after_init from day 1.

Also, mark some stuff as "const" and "__init" while I'm at it.

[akpm@linux-foundation.org: revert sysctl_nr_open_min, sysctl_nr_open_max changes due to arm warning]
[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/4f6bb9c0-abba-4ee4-a7aa-89265e886817@p183
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 block/bdev.c                       |  6 +++---
 fs/anon_inodes.c                   |  4 ++--
 fs/buffer.c                        |  4 ++--
 fs/char_dev.c                      |  2 +-
 fs/dcache.c                        |  8 ++++----
 fs/direct-io.c                     |  2 +-
 fs/eventpoll.c                     |  6 +++---
 fs/fcntl.c                         |  2 +-
 fs/file_table.c                    |  2 +-
 fs/inode.c                         |  8 ++++----
 fs/kernfs/mount.c                  |  5 +++--
 fs/locks.c                         |  4 ++--
 fs/namespace.c                     | 16 ++++++++--------
 fs/notify/dnotify/dnotify.c        |  6 +++---
 fs/notify/fanotify/fanotify_user.c |  8 ++++----
 fs/notify/inotify/inotify_user.c   |  2 +-
 fs/pipe.c                          |  2 +-
 fs/userfaultfd.c                   |  2 +-
 kernel/audit_tree.c                |  4 ++--
 kernel/sched/core.c                |  2 +-
 kernel/user_namespace.c            |  2 +-
 kernel/workqueue.c                 | 16 ++++++++--------
 lib/debugobjects.c                 |  2 +-
 mm/khugepaged.c                    |  2 +-
 mm/shmem.c                         |  8 ++++----
 security/integrity/iint.c          |  2 +-
 26 files changed, 64 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/block/bdev.c b/block/bdev.c
index f3b13aa1b7d4..aea9143d8908 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -292,7 +292,7 @@ EXPORT_SYMBOL(thaw_bdev);
  */
 
 static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
-static struct kmem_cache * bdev_cachep __read_mostly;
+static struct kmem_cache *bdev_cachep __ro_after_init;
 
 static struct inode *bdev_alloc_inode(struct super_block *sb)
 {
@@ -361,13 +361,13 @@ static struct file_system_type bd_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __ro_after_init;
 EXPORT_SYMBOL_GPL(blockdev_superblock);
 
 void __init bdev_cache_init(void)
 {
 	int err;
-	static struct vfsmount *bd_mnt;
+	static struct vfsmount *bd_mnt __ro_after_init;
 
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24192a7667ed..d26222b7eefe 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,8 +24,8 @@
 
 #include <linux/uaccess.h>
 
-static struct vfsmount *anon_inode_mnt __read_mostly;
-static struct inode *anon_inode_inode;
+static struct vfsmount *anon_inode_mnt __ro_after_init;
+static struct inode *anon_inode_inode __ro_after_init;
 
 /*
  * anon_inodefs_dname() is called from d_path().
diff --git a/fs/buffer.c b/fs/buffer.c
index 12e9a71c693d..a19fef583116 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2988,13 +2988,13 @@ EXPORT_SYMBOL(try_to_free_buffers);
 /*
  * Buffer-head allocation
  */
-static struct kmem_cache *bh_cachep __read_mostly;
+static struct kmem_cache *bh_cachep __ro_after_init;
 
 /*
  * Once the number of bh's in the machine exceeds this level, we start
  * stripping them in writeback.
  */
-static unsigned long max_buffer_heads;
+static unsigned long max_buffer_heads __ro_after_init;
 
 int buffer_heads_over_limit;
 
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..3d52f3d3ae77 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -25,7 +25,7 @@
 
 #include "internal.h"
 
-static struct kobj_map *cdev_map;
+static struct kobj_map *cdev_map __ro_after_init;
 
 static DEFINE_MUTEX(chrdevs_lock);
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 25ac74d30bff..0650ccdaa335 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
-static struct kmem_cache *dentry_cache __read_mostly;
+static struct kmem_cache *dentry_cache __ro_after_init;
 
 const struct qstr empty_name = QSTR_INIT("", 0);
 EXPORT_SYMBOL(empty_name);
@@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name);
  * information, yet avoid using a prime hash-size or similar.
  */
 
-static unsigned int d_hash_shift __read_mostly;
+static unsigned int d_hash_shift __ro_after_init;
 
-static struct hlist_bl_head *dentry_hashtable __read_mostly;
+static struct hlist_bl_head *dentry_hashtable __ro_after_init;
 
 static inline struct hlist_bl_head *d_hash(unsigned int hash)
 {
@@ -3324,7 +3324,7 @@ static void __init dcache_init(void)
 }
 
 /* SLAB cache for __getname() consumers */
-struct kmem_cache *names_cachep __read_mostly;
+struct kmem_cache *names_cachep __ro_after_init;
 EXPORT_SYMBOL(names_cachep);
 
 void __init vfs_caches_init_early(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7bc494ee56b9..20533266ade6 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -151,7 +151,7 @@ struct dio {
 	};
 } ____cacheline_aligned_in_smp;
 
-static struct kmem_cache *dio_cache __read_mostly;
+static struct kmem_cache *dio_cache __ro_after_init;
 
 /*
  * How many pages are in the queue?
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1d9a71a0c4c1..2877cc01cff1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -256,10 +256,10 @@ static u64 loop_check_gen = 0;
 static struct eventpoll *inserting_into;
 
 /* Slab cache used to allocate "struct epitem" */
-static struct kmem_cache *epi_cache __read_mostly;
+static struct kmem_cache *epi_cache __ro_after_init;
 
 /* Slab cache used to allocate "struct eppoll_entry" */
-static struct kmem_cache *pwq_cache __read_mostly;
+static struct kmem_cache *pwq_cache __ro_after_init;
 
 /*
  * List of files with newly added links, where we may need to limit the number
@@ -271,7 +271,7 @@ struct epitems_head {
 };
 static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
 
-static struct kmem_cache *ephead_cache __read_mostly;
+static struct kmem_cache *ephead_cache __ro_after_init;
 
 static inline void free_ephead(struct epitems_head *head)
 {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index e871009f6c88..c80a6acad742 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -844,7 +844,7 @@ int send_sigurg(struct fown_struct *fown)
 }
 
 static DEFINE_SPINLOCK(fasync_lock);
-static struct kmem_cache *fasync_cache __read_mostly;
+static struct kmem_cache *fasync_cache __ro_after_init;
 
 static void fasync_free_rcu(struct rcu_head *head)
 {
diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..687d33865035 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -40,7 +40,7 @@ static struct files_stat_struct files_stat = {
 };
 
 /* SLAB cache for file structures */
-static struct kmem_cache *filp_cachep __read_mostly;
+static struct kmem_cache *filp_cachep __ro_after_init;
 
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
diff --git a/fs/inode.c b/fs/inode.c
index 84bc3c76e5cc..2b1d473a3631 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -54,9 +54,9 @@
  *   inode_hash_lock
  */
 
-static unsigned int i_hash_mask __read_mostly;
-static unsigned int i_hash_shift __read_mostly;
-static struct hlist_head *inode_hashtable __read_mostly;
+static unsigned int i_hash_mask __ro_after_init;
+static unsigned int i_hash_shift __ro_after_init;
+static struct hlist_head *inode_hashtable __ro_after_init;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 /*
@@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops);
 static DEFINE_PER_CPU(unsigned long, nr_inodes);
 static DEFINE_PER_CPU(unsigned long, nr_unused);
 
-static struct kmem_cache *inode_cachep __read_mostly;
+static struct kmem_cache *inode_cachep __ro_after_init;
 
 static long get_nr_inodes(void)
 {
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c4bf26142eec..43aea0ad95c8 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -21,8 +21,9 @@
 
 #include "kernfs-internal.h"
 
-struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
-struct kernfs_global_locks *kernfs_locks;
+struct kmem_cache *kernfs_node_cache __ro_after_init;
+struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+struct kernfs_global_locks *kernfs_locks __ro_after_init;
 
 static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 {
diff --git a/fs/locks.c b/fs/locks.c
index 76ad05f8070a..dbd2fb1f7494 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
  */
 static DEFINE_SPINLOCK(blocked_lock_lock);
 
-static struct kmem_cache *flctx_cache __read_mostly;
-static struct kmem_cache *filelock_cache __read_mostly;
+static struct kmem_cache *flctx_cache __ro_after_init;
+static struct kmem_cache *filelock_cache __ro_after_init;
 
 static struct file_lock_context *
 locks_get_lock_context(struct inode *inode, int type)
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..69df848fbc17 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -39,10 +39,10 @@
 /* Maximum number of mounts in a mount namespace */
 static unsigned int sysctl_mount_max __read_mostly = 100000;
 
-static unsigned int m_hash_mask __read_mostly;
-static unsigned int m_hash_shift __read_mostly;
-static unsigned int mp_hash_mask __read_mostly;
-static unsigned int mp_hash_shift __read_mostly;
+static unsigned int m_hash_mask __ro_after_init;
+static unsigned int m_hash_shift __ro_after_init;
+static unsigned int mp_hash_mask __ro_after_init;
+static unsigned int mp_hash_shift __ro_after_init;
 
 static __initdata unsigned long mhash_entries;
 static int __init set_mhash_entries(char *str)
@@ -68,9 +68,9 @@ static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 
-static struct hlist_head *mount_hashtable __read_mostly;
-static struct hlist_head *mountpoint_hashtable __read_mostly;
-static struct kmem_cache *mnt_cache __read_mostly;
+static struct hlist_head *mount_hashtable __ro_after_init;
+static struct hlist_head *mountpoint_hashtable __ro_after_init;
+static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
@@ -86,7 +86,7 @@ struct mount_kattr {
 };
 
 /* /sys/fs */
-struct kobject *fs_kobj;
+struct kobject *fs_kobj __ro_after_init;
 EXPORT_SYMBOL_GPL(fs_kobj);
 
 /*
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..7914d223289a 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void)
 #define dnotify_sysctl_init() do { } while (0)
 #endif
 
-static struct kmem_cache *dnotify_struct_cache __read_mostly;
-static struct kmem_cache *dnotify_mark_cache __read_mostly;
-static struct fsnotify_group *dnotify_group __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+static struct fsnotify_group *dnotify_group __ro_after_init;
 
 /*
  * dnotify will attach one of these to each inode (i_fsnotify_marks) which
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f69c451018e3..614b435c4a8c 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void)
 
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
-struct kmem_cache *fanotify_mark_cache __read_mostly;
-struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
-struct kmem_cache *fanotify_path_event_cachep __read_mostly;
-struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+struct kmem_cache *fanotify_mark_cache __ro_after_init;
+struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
 
 #define FANOTIFY_EVENT_ALIGN 4
 #define FANOTIFY_FID_INFO_HDR_LEN \
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1c4bfdab008d..a3809ae92170 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -49,7 +49,7 @@
 /* configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_queued_events __read_mostly;
 
-struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __ro_after_init;
 
 #ifdef CONFIG_SYSCTL
 
diff --git a/fs/pipe.c b/fs/pipe.c
index 139190165a1c..6b279abf0129 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -854,7 +854,7 @@ void free_pipe_info(struct pipe_inode_info *pipe)
 	kfree(pipe);
 }
 
-static struct vfsmount *pipe_mnt __read_mostly;
+static struct vfsmount *pipe_mnt __ro_after_init;
 
 /*
  * pipefs_dname() is called from d_path().
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 56eaae9dac1a..ed09d70027a0 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = {
 };
 #endif
 
-static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
 
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e867c17d3f84..b21b9652c1a8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -87,8 +87,8 @@ static struct task_struct *prune_thread;
  * that makes a difference.  Some.
  */
 
-static struct fsnotify_group *audit_tree_group;
-static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+static struct fsnotify_group *audit_tree_group __ro_after_init;
+static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
 
 static struct audit_tree *alloc_tree(const char *s)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 802551e0009b..7d4cf741e087 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9903,7 +9903,7 @@ struct task_group root_task_group;
 LIST_HEAD(task_groups);
 
 /* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
+static struct kmem_cache *task_group_cache __ro_after_init;
 #endif
 
 void __init sched_init(void)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..bf2cb8c11571 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,7 +22,7 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 
-static struct kmem_cache *user_ns_cachep __read_mostly;
+static struct kmem_cache *user_ns_cachep __ro_after_init;
 static DEFINE_MUTEX(userns_state_mutex);
 
 static bool new_idmap_permitted(const struct file *file,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b9f053a5a5f0..96b89f0edbe3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -418,21 +418,21 @@ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
  * process context while holding a pool lock. Bounce to a dedicated kthread
  * worker to avoid A-A deadlocks.
  */
-static struct kthread_worker *pwq_release_worker;
+static struct kthread_worker *pwq_release_worker __ro_after_init;
 
-struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_wq __ro_after_init;
 EXPORT_SYMBOL(system_wq);
-struct workqueue_struct *system_highpri_wq __read_mostly;
+struct workqueue_struct *system_highpri_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
-struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_long_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_long_wq);
-struct workqueue_struct *system_unbound_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
-struct workqueue_struct *system_freezable_wq __read_mostly;
+struct workqueue_struct *system_freezable_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
-struct workqueue_struct *system_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_power_efficient_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_power_efficient_wq);
-struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
 
 static int worker_thread(void *__worker);
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index a517256a270b..2a8e9d63fbe3 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -89,7 +89,7 @@ static int			debug_objects_pool_size __read_mostly
 static int			debug_objects_pool_min_level __read_mostly
 				= ODEBUG_POOL_MIN_LEVEL;
 static const struct debug_obj_descr *descr_test  __read_mostly;
-static struct kmem_cache	*obj_cache __read_mostly;
+static struct kmem_cache	*obj_cache __ro_after_init;
 
 /*
  * Track numbers of kmem_cache_alloc()/free() calls done.
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..cb3f1d738810 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -91,7 +91,7 @@ static unsigned int khugepaged_max_ptes_shared __read_mostly;
 #define MM_SLOTS_HASH_BITS 10
 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
 
-static struct kmem_cache *mm_slot_cache __read_mostly;
+static struct kmem_cache *mm_slot_cache __ro_after_init;
 
 struct collapse_control {
 	bool is_khugepaged;
diff --git a/mm/shmem.c b/mm/shmem.c
index 69595d341882..389212972e72 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -42,7 +42,7 @@
 #include <linux/iversion.h>
 #include "swap.h"
 
-static struct vfsmount *shm_mnt;
+static struct vfsmount *shm_mnt __ro_after_init;
 
 #ifdef CONFIG_SHMEM
 /*
@@ -4394,7 +4394,7 @@ static const struct fs_context_operations shmem_fs_context_ops = {
 #endif
 };
 
-static struct kmem_cache *shmem_inode_cachep;
+static struct kmem_cache *shmem_inode_cachep __ro_after_init;
 
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
@@ -4426,14 +4426,14 @@ static void shmem_init_inode(void *foo)
 	inode_init_once(&info->vfs_inode);
 }
 
-static void shmem_init_inodecache(void)
+static void __init shmem_init_inodecache(void)
 {
 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
 				sizeof(struct shmem_inode_info),
 				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
 }
 
-static void shmem_destroy_inodecache(void)
+static void __init shmem_destroy_inodecache(void)
 {
 	kmem_cache_destroy(shmem_inode_cachep);
 }
diff --git a/security/integrity/iint.c b/security/integrity/iint.c
index a462df827de2..5b1c2de6cc54 100644
--- a/security/integrity/iint.c
+++ b/security/integrity/iint.c
@@ -23,7 +23,7 @@
 
 static struct rb_root integrity_iint_tree = RB_ROOT;
 static DEFINE_RWLOCK(integrity_iint_lock);
-static struct kmem_cache *iint_cache __read_mostly;
+static struct kmem_cache *iint_cache __ro_after_init;
 
 struct dentry *integrity_dir;
 
-- 
cgit v1.2.3


From 32671e3799ca2e4590773fd0e63aaa4229e50c06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 18 Oct 2023 13:56:54 +0200
Subject: perf: Disallow mis-matched inherited group reads

Because group consistency is non-atomic between parent (filedesc) and children
(inherited) events, it is possible for PERF_FORMAT_GROUP read() to try and sum
non-matching counter groups -- with non-sensical results.

Add group_generation to distinguish the case where a parent group removes and
adds an event and thus has the same number, but a different configuration of
events as inherited groups.

This became a problem when commit fa8c269353d5 ("perf/core: Invert
perf_read_group() loops") flipped the order of child_list and sibling_list.
Previously it would iterate the group (sibling_list) first, and for each
sibling traverse the child_list. In this order, only the group composition of
the parent is relevant. By flipping the order the group composition of the
child (inherited) events becomes an issue and the mis-match in group
composition becomes evident.

That said; even prior to this commit, while reading of a group that is not
equally inherited was not broken, it still made no sense.

(Ab)use ECHILD as error return to indicate issues with child process group
composition.

Fixes: fa8c269353d5 ("perf/core: Invert perf_read_group() loops")
Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231018115654.GK33217@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e85cd1c0eaf3..7b5406e3288d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -704,6 +704,7 @@ struct perf_event {
 	/* The cumulative AND of all event_caps for events in this group. */
 	int				group_caps;
 
+	unsigned int			group_generation;
 	struct perf_event		*group_leader;
 	/*
 	 * event->pmu will always point to pmu in which this event belongs.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4c72a41f11af..d0663b9324e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1954,6 +1954,7 @@ static void perf_group_attach(struct perf_event *event)
 
 	list_add_tail(&event->sibling_list, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
+	group_leader->group_generation++;
 
 	perf_event__header_size(group_leader);
 
@@ -2144,6 +2145,7 @@ static void perf_group_detach(struct perf_event *event)
 	if (leader != event) {
 		list_del_init(&event->sibling_list);
 		event->group_leader->nr_siblings--;
+		event->group_leader->group_generation++;
 		goto out;
 	}
 
@@ -5440,7 +5442,7 @@ static int __perf_read_group_add(struct perf_event *leader,
 					u64 read_format, u64 *values)
 {
 	struct perf_event_context *ctx = leader->ctx;
-	struct perf_event *sub;
+	struct perf_event *sub, *parent;
 	unsigned long flags;
 	int n = 1; /* skip @nr */
 	int ret;
@@ -5450,6 +5452,33 @@ static int __perf_read_group_add(struct perf_event *leader,
 		return ret;
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
+	/*
+	 * Verify the grouping between the parent and child (inherited)
+	 * events is still in tact.
+	 *
+	 * Specifically:
+	 *  - leader->ctx->lock pins leader->sibling_list
+	 *  - parent->child_mutex pins parent->child_list
+	 *  - parent->ctx->mutex pins parent->sibling_list
+	 *
+	 * Because parent->ctx != leader->ctx (and child_list nests inside
+	 * ctx->mutex), group destruction is not atomic between children, also
+	 * see perf_event_release_kernel(). Additionally, parent can grow the
+	 * group.
+	 *
+	 * Therefore it is possible to have parent and child groups in a
+	 * different configuration and summing over such a beast makes no sense
+	 * what so ever.
+	 *
+	 * Reject this.
+	 */
+	parent = leader->parent;
+	if (parent &&
+	    (parent->group_generation != leader->group_generation ||
+	     parent->nr_siblings != leader->nr_siblings)) {
+		ret = -ECHILD;
+		goto unlock;
+	}
 
 	/*
 	 * Since we co-schedule groups, {enabled,running} times of siblings
@@ -5483,8 +5512,9 @@ static int __perf_read_group_add(struct perf_event *leader,
 			values[n++] = atomic64_read(&sub->lost_samples);
 	}
 
+unlock:
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	return 0;
+	return ret;
 }
 
 static int perf_read_group(struct perf_event *event,
@@ -5503,10 +5533,6 @@ static int perf_read_group(struct perf_event *event,
 
 	values[0] = 1 + leader->nr_siblings;
 
-	/*
-	 * By locking the child_mutex of the leader we effectively
-	 * lock the child list of all siblings.. XXX explain how.
-	 */
 	mutex_lock(&leader->child_mutex);
 
 	ret = __perf_read_group_add(leader, read_format, values);
@@ -13346,6 +13372,7 @@ static int inherit_group(struct perf_event *parent_event,
 		    !perf_get_aux_event(child_ctr, leader))
 			return -EINVAL;
 	}
+	leader->group_generation = parent_event->group_generation;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0ede61d8589cc2d93aa78230d74ac58b5b8d0244 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 29 Sep 2023 08:45:59 +0200
Subject: file: convert to SLAB_TYPESAFE_BY_RCU

In recent discussions around some performance improvements in the file
handling area we discussed switching the file cache to rely on
SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based
freeing for files completely. This is a pretty sensitive change overall
but it might actually be worth doing.

The main downside is the subtlety. The other one is that we should
really wait for Jann's patch to land that enables KASAN to handle
SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this
exists.

With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times
which requires a few changes. So it isn't sufficient anymore to just
acquire a reference to the file in question under rcu using
atomic_long_inc_not_zero() since the file might have already been
recycled and someone else might have bumped the reference.

In other words, callers might see reference count bumps from newer
users. For this reason it is necessary to verify that the pointer is the
same before and after the reference count increment. This pattern can be
seen in get_file_rcu() and __files_get_rcu().

In addition, it isn't possible to access or check fields in struct file
without first aqcuiring a reference on it. Not doing that was always
very dodgy and it was only usable for non-pointer data in struct file.
With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a
reference under rcu or they must hold the files_lock of the fdtable.
Failing to do either one of this is a bug.

Thanks to Jann for pointing out that we need to ensure memory ordering
between reallocations and pointer check by ensuring that all subsequent
loads have a dependency on the second load in get_file_rcu() and
providing a fixup that was folded into this patch.

Cc: Jann Horn <jannh@google.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/files.rst          |  53 +++++-------
 arch/powerpc/platforms/cell/spufs/coredump.c |  11 ++-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c     |   4 +-
 fs/file.c                                    | 125 +++++++++++++++++++++++----
 fs/file_table.c                              |  40 +++++----
 fs/gfs2/glock.c                              |  11 ++-
 fs/notify/dnotify/dnotify.c                  |   6 +-
 fs/proc/fd.c                                 |  11 ++-
 include/linux/fdtable.h                      |  17 +---
 include/linux/fs.h                           |   4 +-
 kernel/bpf/task_iter.c                       |   4 +-
 kernel/fork.c                                |   4 +-
 kernel/kcmp.c                                |   4 +-
 13 files changed, 191 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index bcf84459917f..9e38e4c221ca 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@ the fdtable structure -
    be held.
 
 4. To look up the file structure given an fd, a reader
-   must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
+   must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These
    take care of barrier requirements due to lock-free lookup.
 
    An example::
@@ -70,43 +70,22 @@ the fdtable structure -
 	struct file *file;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(fd);
-	if (file) {
-		...
-	}
-	....
+	file = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
-
-5. Handling of the file structures is special. Since the look-up
-   of the fd (fget()/fget_light()) are lock-free, it is possible
-   that look-up may race with the last put() operation on the
-   file structure. This is avoided using atomic_long_inc_not_zero()
-   on ->f_count::
-
-	rcu_read_lock();
-	file = files_lookup_fd_rcu(files, fd);
 	if (file) {
-		if (atomic_long_inc_not_zero(&file->f_count))
-			*fput_needed = 1;
-		else
-		/* Didn't get the reference, someone's freed */
-			file = NULL;
+		...
+                fput(file);
 	}
-	rcu_read_unlock();
 	....
-	return file;
-
-   atomic_long_inc_not_zero() detects if refcounts is already zero or
-   goes to zero during increment. If it does, we fail
-   fget()/fget_light().
 
-6. Since both fdtable and file structures can be looked up
+5. Since both fdtable and file structures can be looked up
    lock-free, they must be installed using rcu_assign_pointer()
    API. If they are looked up lock-free, rcu_dereference()
    must be used. However it is advisable to use files_fdtable()
-   and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
+   and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these
+   issues.
 
-7. While updating, the fdtable pointer must be looked up while
+6. While updating, the fdtable pointer must be looked up while
    holding files->file_lock. If ->file_lock is dropped, then
    another thread expand the files thereby creating a new
    fdtable and making the earlier fdtable pointer stale.
@@ -126,3 +105,19 @@ the fdtable structure -
    Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
    the fdtable pointer (fdt) must be loaded after locate_fd().
 
+On newer kernels rcu based file lookup has been switched to rely on
+SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore
+to just acquire a reference to the file in question under rcu using
+atomic_long_inc_not_zero() since the file might have already been
+recycled and someone else might have bumped the reference. In other
+words, callers might see reference count bumps from newer users. For
+this is reason it is necessary to verify that the pointer is the same
+before and after the reference count increment. This pattern can be seen
+in get_file_rcu() and __files_get_rcu().
+
+In addition, it isn't possible to access or check fields in struct file
+without first aqcuiring a reference on it under rcu lookup. Not doing
+that was always very dodgy and it was only usable for non-pointer data
+in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers
+either first acquire a reference or they must hold the files_lock of the
+fdtable.
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 1a587618015c..18daafbe2e65 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd)
  */
 static struct spu_context *coredump_next_context(int *fd)
 {
-	struct spu_context *ctx;
+	struct spu_context *ctx = NULL;
 	struct file *file;
 	int n = iterate_fd(current->files, *fd, match_context, NULL);
 	if (!n)
@@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd)
 	*fd = n - 1;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(*fd);
-	ctx = SPUFS_I(file_inode(file))->i_ctx;
-	get_spu_context(ctx);
+	file = lookup_fdget_rcu(*fd);
 	rcu_read_unlock();
+	if (file) {
+		ctx = SPUFS_I(file_inode(file))->i_ctx;
+		get_spu_context(ctx);
+		fput(file);
+	}
 
 	return ctx;
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index aa4d842d4c5a..97bf10861cad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -916,9 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
 	struct file *file;
 
 	rcu_read_lock();
-	file = READ_ONCE(i915->gem.mmap_singleton);
-	if (file && !get_file_rcu(file))
-		file = NULL;
+	file = get_file_rcu(&i915->gem.mmap_singleton);
 	rcu_read_unlock();
 	if (file)
 		return file;
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..4eb026631673 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -853,8 +853,79 @@ void do_close_on_exec(struct files_struct *files)
 	spin_unlock(&files->file_lock);
 }
 
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+	struct file __rcu *file;
+	struct file __rcu *file_reloaded;
+	struct file __rcu *file_reloaded_cmp;
+
+	file = rcu_dereference_raw(*f);
+	if (!file)
+		return NULL;
+
+	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		return ERR_PTR(-EAGAIN);
+
+	file_reloaded = rcu_dereference_raw(*f);
+
+	/*
+	 * Ensure that all accesses have a dependency on the load from
+	 * rcu_dereference_raw() above so we get correct ordering
+	 * between reuse/allocation and the pointer check below.
+	 */
+	file_reloaded_cmp = file_reloaded;
+	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+	/*
+	 * atomic_long_inc_not_zero() above provided a full memory
+	 * barrier when we acquired a reference.
+	 *
+	 * This is paired with the write barrier from assigning to the
+	 * __rcu protected file pointer so that if that pointer still
+	 * matches the current file, we know we have successfully
+	 * acquired a reference to the right file.
+	 *
+	 * If the pointers don't match the file has been reallocated by
+	 * SLAB_TYPESAFE_BY_RCU.
+	 */
+	if (file == file_reloaded_cmp)
+		return file_reloaded;
+
+	fput(file);
+	return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try go get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+	for (;;) {
+		struct file __rcu *file;
+
+		file = __get_file_rcu(f);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
+			continue;
+
+		return file;
+	}
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
 static inline struct file *__fget_files_rcu(struct files_struct *files,
-	unsigned int fd, fmode_t mask)
+       unsigned int fd, fmode_t mask)
 {
 	for (;;) {
 		struct file *file;
@@ -865,12 +936,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 			return NULL;
 
 		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
-		file = rcu_dereference_raw(*fdentry);
-		if (unlikely(!file))
-			return NULL;
-
-		if (unlikely(file->f_mode & mask))
-			return NULL;
 
 		/*
 		 * Ok, we have a file pointer. However, because we do
@@ -879,10 +944,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * Such a race can take two forms:
 		 *
-		 *  (a) the file ref already went down to zero,
-		 *      and get_file_rcu() fails. Just try again:
+		 *  (a) the file ref already went down to zero and the
+		 *      file hasn't been reused yet or the file count
+		 *      isn't zero but the file has already been reused.
 		 */
-		if (unlikely(!get_file_rcu(file)))
+		file = __get_file_rcu(fdentry);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
 			continue;
 
 		/*
@@ -893,12 +963,20 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * If so, we need to put our ref and try again.
 		 */
-		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
-		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
+		if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
 			fput(file);
 			continue;
 		}
 
+		/*
+		 * This isn't the file we're looking for or we're not
+		 * allowed to get a reference to it.
+		 */
+		if (unlikely(file->f_mode & mask)) {
+			fput(file);
+			return NULL;
+		}
+
 		/*
 		 * Ok, we have a ref to the file, and checked that it
 		 * still exists.
@@ -948,7 +1026,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
 	return file;
 }
 
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+	return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -957,13 +1042,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
 	task_lock(task);
 	files = task->files;
 	if (files)
-		file = files_lookup_fd_rcu(files, fd);
+		file = __fget_files_rcu(files, fd, 0);
 	task_unlock(task);
 
 	return file;
 }
 
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -974,7 +1059,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	files = task->files;
 	if (files) {
 		for (; fd < files_fdtable(files)->max_fds; fd++) {
-			file = files_lookup_fd_rcu(files, fd);
+			file = __fget_files_rcu(files, fd, 0);
 			if (file)
 				break;
 		}
@@ -983,7 +1068,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	*ret_fd = fd;
 	return file;
 }
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1357,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
+		struct file *f;
 		int retval = oldfd;
 
 		rcu_read_lock();
-		if (!files_lookup_fd_rcu(files, oldfd))
+		f = __fget_files_rcu(files, oldfd, 0);
+		if (!f)
 			retval = -EBADF;
 		rcu_read_unlock();
+		if (f)
+			fput(f);
 		return retval;
 	}
 	return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index e68e97d4f00a..a79a80031343 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -65,33 +65,33 @@ static void file_free_rcu(struct rcu_head *head)
 {
 	struct file *f = container_of(head, struct file, f_rcuhead);
 
-	put_cred(f->f_cred);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		kfree(backing_file(f));
-	else
-		kmem_cache_free(filp_cachep, f);
+	kfree(backing_file(f));
 }
 
 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		path_put(backing_file_real_path(f));
 	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 		percpu_counter_dec(&nr_files);
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	put_cred(f->f_cred);
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		path_put(backing_file_real_path(f));
+		call_rcu(&f->f_rcuhead, file_free_rcu);
+	} else {
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 void release_empty_file(struct file *f)
 {
 	WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
-	/* Uhm, we better find out who grabs references to an unopened file. */
-	WARN_ON_ONCE(atomic_long_cmpxchg(&f->f_count, 1, 0) != 1);
-	security_file_free(f);
-	put_cred(f->f_cred);
-	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
-		percpu_counter_dec(&nr_files);
-	kmem_cache_free(filp_cachep, f);
+	if (atomic_long_dec_and_test(&f->f_count)) {
+		security_file_free(f);
+		put_cred(f->f_cred);
+		if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+			percpu_counter_dec(&nr_files);
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 /*
@@ -176,7 +176,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 		return error;
 	}
 
-	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
 	mutex_init(&f->f_pos_lock);
@@ -184,6 +183,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	f->f_mode = OPEN_FMODE(flags);
 	/* f->f_version: 0 */
 
+	/*
+	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+	 * fget-rcu pattern users need to be able to handle spurious
+	 * refcount bumps we should reinitialize the reused file first.
+	 */
+	atomic_long_set(&f->f_count, 1);
 	return 0;
 }
 
@@ -483,7 +488,8 @@ EXPORT_SYMBOL(__fput_sync);
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+				SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 }
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9cbf8d98489a..b4bc873aab7d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2717,16 +2717,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
 	for(;; i->fd++) {
 		struct inode *inode;
 
-		i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+		i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
 		if (!i->file) {
 			i->fd = 0;
 			break;
 		}
+
 		inode = file_inode(i->file);
-		if (inode->i_sb != i->sb)
-			continue;
-		if (get_file_rcu(i->file))
+		if (inode->i_sb == i->sb)
 			break;
+
+		rcu_read_unlock();
+		fput(i->file);
+		rcu_read_lock();
 	}
 	rcu_read_unlock();
 	return i->file;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
-	struct file *f;
+	struct file *f = NULL;
 	int destroy = 0, error = 0;
 	__u32 mask;
 
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	}
 
 	rcu_read_lock();
-	f = lookup_fd_rcu(fd);
+	f = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
 
 	/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
 		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
+	if (f)
+		fput(f);
 	return error;
 }
 
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, fd);
-	if (file)
-		*mode = file->f_mode;
+	file = task_lookup_fdget_rcu(task, fd);
 	rcu_read_unlock();
+	if (file) {
+		*mode = file->f_mode;
+		fput(file);
+	}
 	return !!file;
 }
 
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		char name[10 + 1];
 		unsigned int len;
 
-		f = task_lookup_next_fd_rcu(p, &fd);
+		f = task_lookup_next_fdget_rcu(p, &fd);
 		ctx->pos = fd + 2LL;
 		if (!f)
 			break;
 		data.mode = f->f_mode;
 		rcu_read_unlock();
+		fput(f);
 		data.fd = fd;
 
 		len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..bc4c3287a65e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
-{
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
-			   "suspicious rcu_dereference_check() usage");
-	return files_lookup_fd_raw(files, fd);
-}
-
-static inline struct file *lookup_fd_rcu(unsigned int fd)
-{
-	return files_lookup_fd_rcu(current->files, fd);
-}
-
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+struct file *lookup_fdget_rcu(unsigned int fd);
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
 
 struct task_struct;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58dea591a341..ceafc40cc25f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1042,7 +1042,9 @@ static inline struct file *get_file(struct file *f)
 	atomic_long_inc(&f->f_count);
 	return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..82ad23b1d257 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -308,11 +308,9 @@ again:
 	rcu_read_lock();
 	for (;; curr_fd++) {
 		struct file *f;
-		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
+		f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
 		if (!f)
 			break;
-		if (!get_file_rcu(f))
-			continue;
 
 		/* set info->fd */
 		info->fd = curr_fd;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..640123767726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 	struct file *exe_file;
 
 	rcu_read_lock();
-	exe_file = rcu_dereference(mm->exe_file);
-	if (exe_file && !get_file_rcu(exe_file))
-		exe_file = NULL;
+	exe_file = get_file_rcu(&mm->exe_file);
 	rcu_read_unlock();
 	return exe_file;
 }
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, idx);
+	file = task_lookup_fdget_rcu(task, idx);
 	rcu_read_unlock();
+	if (file)
+		fput(file);
 
 	return file;
 }
-- 
cgit v1.2.3


From 08582d678fcf11fc86188f0b92239d3d49667d8e Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:11 +0300
Subject: fs: create helper file_user_path() for user displayed mapped file
 path

Overlayfs uses backing files with "fake" overlayfs f_path and "real"
underlying f_inode, in order to use underlying inode aops for mapped
files and to display the overlayfs path in /proc/<pid>/maps.

In preparation for storing the overlayfs "fake" path instead of the
underlying "real" path in struct backing_file, define a noop helper
file_user_path() that returns f_path for now.

Use the new helper in procfs and kernel logs whenever a path of a
mapped file is displayed to users.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/arc/kernel/troubleshoot.c |  6 ++++--
 fs/proc/base.c                 |  2 +-
 fs/proc/nommu.c                |  2 +-
 fs/proc/task_mmu.c             |  4 ++--
 fs/proc/task_nommu.c           |  2 +-
 include/linux/fs.h             | 14 ++++++++++++++
 kernel/trace/trace_output.c    |  2 +-
 7 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index d5b3ed2c58f5..c380d8c30704 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address)
 	 */
 	if (vma) {
 		char buf[ARC_PATH_MAX];
-		char *nm = "?";
+		char *nm = "anon";
 
 		if (vma->vm_file) {
-			nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1);
+			/* XXX: can we use %pD below and get rid of buf? */
+			nm = d_path(file_user_path(vma->vm_file), buf,
+				    ARC_PATH_MAX-1);
 			if (IS_ERR(nm))
 				nm = "?";
 		}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..20695c928ee6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	rc = -ENOENT;
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma->vm_file->f_path;
+		*path = *file_user_path(vma->vm_file);
 		path_get(path);
 		rc = 0;
 	}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..1593940ca01e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		if (anon_name)
 			seq_printf(m, "[anon_shmem:%s]", anon_name->name);
 		else
-			seq_file_path(m, file, "\n");
+			seq_path(m, file_user_path(file), "\n");
 		goto done;
 	}
 
@@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_file_path(m, file, "\n\t= ");
+		seq_path(m, file_user_path(file), "\n\t= ");
 	} else if (vma_is_initial_heap(vma)) {
 		seq_puts(m, " heap");
 	} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a8ac0dd8041e..2d5862c13399 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	} else if (mm && vma_is_initial_stack(vma)) {
 		seq_pad(m, ' ');
 		seq_puts(m, "[stack]");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ceafc40cc25f..057a3bbd4d27 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2513,6 +2513,20 @@ static inline const struct path *file_real_path(struct file *f)
 	return &f->f_path;
 }
 
+/*
+ * file_user_path - get the path to display for memory mapped file
+ *
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem.  When the mapped file path is displayed to user (e.g. via
+ * /proc/<pid>/maps), this helper should be used to get the path to display
+ * to the user, which is the path of the fd that user has requested to map.
+ */
+static inline const struct path *file_user_path(struct file *f)
+{
+	return &f->f_path;
+}
+
 static inline struct file *file_clone_open(struct file *file)
 {
 	return dentry_open(&file->f_path, file->f_flags, file->f_cred);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index db575094c498..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 			vmstart = vma->vm_start;
 		}
 		if (file) {
-			ret = trace_seq_path(s, &file->f_path);
+			ret = trace_seq_path(s, file_user_path(file));
 			if (ret)
 				trace_seq_printf(s, "[+0x%lx]",
 						 ip - vmstart);
-- 
cgit v1.2.3


From 957e48087dfa38c976407f82e7525277d17a27ae Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 14 Aug 2023 09:04:50 -0400
Subject: locking: export contention tracepoints for bcachefs six locks

The bcachefs implementation of six locks is intended to land in
generic locking code in the long term, but has been pulled into the
bcachefs subsystem for internal use for the time being. This code
lift breaks the bcachefs module build as six locks depend a couple
of the generic locking tracepoints. Export these tracepoint symbols
for bcachefs.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 kernel/locking/mutex.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d973fe6041bf..2deeeca3e71b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1126,6 +1126,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible);
 #endif /* !CONFIG_DEBUG_LOCK_ALLOC */
 #endif /* !CONFIG_PREEMPT_RT */
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin);
+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end);
+
 /**
  * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
  * @cnt: the atomic which we are to dec
-- 
cgit v1.2.3


From a615f67e1a426f35366b8398c11f31c148e7df48 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:39 -0700
Subject: bpf: Add sockptr support for getsockopt

The whole network stack uses sockptr, and while it doesn't move to
something more modern, let's use sockptr in getsockptr BPF hooks, so, it
could be used by other callers.

The main motivation for this change is to use it in the io_uring
{g,s}etsockopt(), which will use a userspace pointer for *optval, but, a
kernel value for optlen.

Link: https://lore.kernel.org/all/ZSArfLaaGcfd8LH8@gmail.com/

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-2-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h |  5 +++--
 kernel/bpf/cgroup.c        | 20 +++++++++++---------
 net/socket.c               |  5 +++--
 3 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8506690dbb9c..f5b4fb6ed8c6 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -139,9 +139,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
 				       int *optname, char __user *optval,
 				       int *optlen, char **kernel_optval);
+
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
-				       int optname, char __user *optval,
-				       int __user *optlen, int max_optlen,
+				       int optname, sockptr_t optval,
+				       sockptr_t optlen, int max_optlen,
 				       int retval);
 
 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..ebc8c58f7e46 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1875,8 +1875,8 @@ out:
 }
 
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
-				       int optname, char __user *optval,
-				       int __user *optlen, int max_optlen,
+				       int optname, sockptr_t optval,
+				       sockptr_t optlen, int max_optlen,
 				       int retval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1903,8 +1903,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		 * one that kernel returned as well to let
 		 * BPF programs inspect the value.
 		 */
-
-		if (get_user(ctx.optlen, optlen)) {
+		if (copy_from_sockptr(&ctx.optlen, optlen,
+				      sizeof(ctx.optlen))) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1915,8 +1915,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 		}
 		orig_optlen = ctx.optlen;
 
-		if (copy_from_user(ctx.optval, optval,
-				   min(ctx.optlen, max_optlen)) != 0) {
+		if (copy_from_sockptr(ctx.optval, optval,
+				      min(ctx.optlen, max_optlen))) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1930,7 +1930,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	if (ret < 0)
 		goto out;
 
-	if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
+	if (!sockptr_is_null(optval) &&
+	    (ctx.optlen > max_optlen || ctx.optlen < 0)) {
 		if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
 			pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
 				     ctx.optlen, max_optlen);
@@ -1942,11 +1943,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	if (ctx.optlen != 0) {
-		if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+		if (!sockptr_is_null(optval) &&
+		    copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
 			ret = -EFAULT;
 			goto out;
 		}
-		if (put_user(ctx.optlen, optlen)) {
+		if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
 			ret = -EFAULT;
 			goto out;
 		}
diff --git a/net/socket.c b/net/socket.c
index c8b08b32f097..82cd6890a4f0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2356,8 +2356,9 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
 
 	if (!in_compat_syscall())
 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
-						     optval, optlen, max_optlen,
-						     err);
+						     USER_SOCKPTR(optval),
+						     USER_SOCKPTR(optlen),
+						     max_optlen, err);
 out_put:
 	fput_light(sock->file, fput_needed);
 	return err;
-- 
cgit v1.2.3


From 3f31e0d14d44ad491a81b7c1f83f32fbc300a867 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 16 Oct 2023 06:47:40 -0700
Subject: bpf: Add sockptr support for setsockopt

The whole network stack uses sockptr, and while it doesn't move to
something more modern, let's use sockptr in setsockptr BPF hooks, so, it
could be used by other callers.

The main motivation for this change is to use it in the io_uring
{g,s}etsockopt(), which will use a userspace pointer for *optval, but, a
kernel value for optlen.

Link: https://lore.kernel.org/all/ZSArfLaaGcfd8LH8@gmail.com/

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20231016134750.1381153-3-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bpf-cgroup.h | 2 +-
 kernel/bpf/cgroup.c        | 5 +++--
 net/socket.c               | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index f5b4fb6ed8c6..cecfe8c99f28 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -137,7 +137,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   enum cgroup_bpf_attach_type atype);
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
-				       int *optname, char __user *optval,
+				       int *optname, sockptr_t optval,
 				       int *optlen, char **kernel_optval);
 
 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ebc8c58f7e46..f0dedd4f7f2e 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1785,7 +1785,7 @@ static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
-				       int *optname, char __user *optval,
+				       int *optname, sockptr_t optval,
 				       int *optlen, char **kernel_optval)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
@@ -1808,7 +1808,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 
 	ctx.optlen = *optlen;
 
-	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+	if (copy_from_sockptr(ctx.optval, optval,
+			      min(*optlen, max_optlen))) {
 		ret = -EFAULT;
 		goto out;
 	}
diff --git a/net/socket.c b/net/socket.c
index 82cd6890a4f0..9a43f6e850c7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2288,7 +2288,7 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
 
 	if (!in_compat_syscall())
 		err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
-						     user_optval, &optlen,
+						     optval, &optlen,
 						     &kernel_optval);
 	if (err < 0)
 		goto out_put;
-- 
cgit v1.2.3


From 6da88306811b40a207c94c9da9faf07bdb20776e Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:39 +0800
Subject: cgroup: Prepare for using css_task_iter_*() in BPF

This patch makes some preparations for using css_task_iter_*() in BPF
Program.

1. Flags CSS_TASK_ITER_* are #define-s and it's not easy for bpf prog to
use them. Convert them to enum so bpf prog can take them from vmlinux.h.

2. In the next patch we will add css_task_iter_*() in common kfuncs which
is not safe. Since css_task_iter_*() does spin_unlock_irq() which might
screw up irq flags depending on the context where bpf prog is running.
So we should use irqsave/irqrestore here and the switching is harmless.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-2-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/cgroup.h | 12 +++++-------
 kernel/cgroup/cgroup.c | 18 ++++++++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b307013b9c6c..0ef0af66080e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -40,13 +40,11 @@ struct kernel_clone_args;
 #define CGROUP_WEIGHT_DFL		100
 #define CGROUP_WEIGHT_MAX		10000
 
-/* walk only threadgroup leaders */
-#define CSS_TASK_ITER_PROCS		(1U << 0)
-/* walk all threaded css_sets in the domain */
-#define CSS_TASK_ITER_THREADED		(1U << 1)
-
-/* internal flags */
-#define CSS_TASK_ITER_SKIPPED		(1U << 16)
+enum {
+	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
+	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
+	CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
+};
 
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fb7f562289d..b6d64f3b8888 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4917,9 +4917,11 @@ repeat:
 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 			 struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	memset(it, 0, sizeof(*it));
 
-	spin_lock_irq(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, irqflags);
 
 	it->ss = css->ss;
 	it->flags = flags;
@@ -4933,7 +4935,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 
 	css_task_iter_advance(it);
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, irqflags);
 }
 
 /**
@@ -4946,12 +4948,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	if (it->cur_task) {
 		put_task_struct(it->cur_task);
 		it->cur_task = NULL;
 	}
 
-	spin_lock_irq(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, irqflags);
 
 	/* @it may be half-advanced by skips, finish advancing */
 	if (it->flags & CSS_TASK_ITER_SKIPPED)
@@ -4964,7 +4968,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		css_task_iter_advance(it);
 	}
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, irqflags);
 
 	return it->cur_task;
 }
@@ -4977,11 +4981,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  */
 void css_task_iter_end(struct css_task_iter *it)
 {
+	unsigned long irqflags;
+
 	if (it->cur_cset) {
-		spin_lock_irq(&css_set_lock);
+		spin_lock_irqsave(&css_set_lock, irqflags);
 		list_del(&it->iters_node);
 		put_css_set_locked(it->cur_cset);
-		spin_unlock_irq(&css_set_lock);
+		spin_unlock_irqrestore(&css_set_lock, irqflags);
 	}
 
 	if (it->cur_dcset)
-- 
cgit v1.2.3


From 9c66dc94b62aef23300f05f63404afb8990920b4 Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:40 +0800
Subject: bpf: Introduce css_task open-coded iterator kfuncs

This patch adds kfuncs bpf_iter_css_task_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_css_task in open-coded
iterator style. These kfuncs actually wrapps css_task_iter_{start,next,
end}. BPF programs can use these kfuncs through bpf_for_each macro for
iteration of all tasks under a css.

css_task_iter_*() would try to get the global spin-lock *css_set_lock*, so
the bpf side has to be careful in where it allows to use this iter.
Currently we only allow it in bpf_lsm and bpf iter-s.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-3-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                           |  3 ++
 kernel/bpf/task_iter.c                         | 58 ++++++++++++++++++++++++++
 kernel/bpf/verifier.c                          | 23 ++++++++++
 tools/testing/selftests/bpf/bpf_experimental.h |  8 ++++
 4 files changed, 92 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 61f51dee8448..c01441db9fd5 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2560,6 +2560,9 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index fef17628341f..e4126698cecf 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -894,6 +894,64 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
 
 __diag_pop();
 
+struct bpf_iter_css_task {
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_task_kern {
+	struct css_task_iter *css_it;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+		struct cgroup_subsys_state *css, unsigned int flags)
+{
+	struct bpf_iter_css_task_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_css_task_kern) != sizeof(struct bpf_iter_css_task));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_task_kern) !=
+					__alignof__(struct bpf_iter_css_task));
+	kit->css_it = NULL;
+	switch (flags) {
+	case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
+	case CSS_TASK_ITER_PROCS:
+	case 0:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
+	if (!kit->css_it)
+		return -ENOMEM;
+	css_task_iter_start(css, flags, kit->css_it);
+	return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it)
+{
+	struct bpf_iter_css_task_kern *kit = (void *)it;
+
+	if (!kit->css_it)
+		return NULL;
+	return css_task_iter_next(kit->css_it);
+}
+
+__bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
+{
+	struct bpf_iter_css_task_kern *kit = (void *)it;
+
+	if (!kit->css_it)
+		return;
+	css_task_iter_end(kit->css_it);
+	bpf_mem_free(&bpf_global_ma, kit->css_it);
+}
+
+__diag_pop();
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bb58987e4844..974713185269 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10472,6 +10472,7 @@ enum special_kfunc_type {
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
+	KF_bpf_iter_css_task_new,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10495,6 +10496,7 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_iter_css_task_new)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10520,6 +10522,7 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_iter_css_task_new)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -11050,6 +11053,20 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
 						  &meta->arg_rbtree_root.field);
 }
 
+static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
+{
+	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+
+	switch (prog_type) {
+	case BPF_PROG_TYPE_LSM:
+		return true;
+	case BPF_TRACE_ITER:
+		return env->prog->aux->sleepable;
+	default:
+		return false;
+	}
+}
+
 static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
 			    int insn_idx)
 {
@@ -11300,6 +11317,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		}
 		case KF_ARG_PTR_TO_ITER:
+			if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
+				if (!check_css_task_iter_allowlist(env)) {
+					verbose(env, "css_task_iter is only allowed in bpf_lsm and bpf iter-s\n");
+					return -EINVAL;
+				}
+			}
 			ret = process_iter_arg(env, regno, insn_idx, meta);
 			if (ret < 0)
 				return ret;
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 2c8cb3f61529..6792ed2b45d7 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -458,4 +458,12 @@ extern void bpf_throw(u64 cookie) __ksym;
 		__bpf_assert_op(LHS, <=, END, value, false);		\
 	})
 
+struct bpf_iter_css_task;
+struct cgroup_subsys_state;
+extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
+		struct cgroup_subsys_state *css, unsigned int flags) __weak __ksym;
+extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
+extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
+
+
 #endif
-- 
cgit v1.2.3


From c68a78ffe2cb4207f64fd0f4262818c728c67be0 Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:41 +0800
Subject: bpf: Introduce task open coded iterator kfuncs

This patch adds kfuncs bpf_iter_task_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_task in open-coded iterator
style. BPF programs can use these kfuncs or through bpf_for_each macro to
iterate all processes in the system.

The API design keep consistent with SEC("iter/task"). bpf_iter_task_new()
accepts a specific task and iterating type which allows:

1. iterating all process in the system (BPF_TASK_ITER_ALL_PROCS)

2. iterating all threads in the system (BPF_TASK_ITER_ALL_THREADS)

3. iterating all threads of a specific task (BPF_TASK_ITER_PROC_THREADS)

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Link: https://lore.kernel.org/r/20231018061746.111364-4-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                           |  3 +
 kernel/bpf/task_iter.c                         | 90 ++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_experimental.h |  5 ++
 3 files changed, 98 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c01441db9fd5..c25941531265 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2563,6 +2563,9 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index e4126698cecf..faa1712c1df5 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -952,6 +952,96 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
 
 __diag_pop();
 
+struct bpf_iter_task {
+	__u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_task_kern {
+	struct task_struct *task;
+	struct task_struct *pos;
+	unsigned int flags;
+} __attribute__((aligned(8)));
+
+enum {
+	/* all process in the system */
+	BPF_TASK_ITER_ALL_PROCS,
+	/* all threads in the system */
+	BPF_TASK_ITER_ALL_THREADS,
+	/* all threads of a specific process */
+	BPF_TASK_ITER_PROC_THREADS
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
+		struct task_struct *task, unsigned int flags)
+{
+	struct bpf_iter_task_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_task_kern) > sizeof(struct bpf_iter_task));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
+					__alignof__(struct bpf_iter_task));
+
+	kit->task = kit->pos = NULL;
+	switch (flags) {
+	case BPF_TASK_ITER_ALL_THREADS:
+	case BPF_TASK_ITER_ALL_PROCS:
+	case BPF_TASK_ITER_PROC_THREADS:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (flags == BPF_TASK_ITER_PROC_THREADS)
+		kit->task = task;
+	else
+		kit->task = &init_task;
+	kit->pos = kit->task;
+	kit->flags = flags;
+	return 0;
+}
+
+__bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
+{
+	struct bpf_iter_task_kern *kit = (void *)it;
+	struct task_struct *pos;
+	unsigned int flags;
+
+	flags = kit->flags;
+	pos = kit->pos;
+
+	if (!pos)
+		return pos;
+
+	if (flags == BPF_TASK_ITER_ALL_PROCS)
+		goto get_next_task;
+
+	kit->pos = next_thread(kit->pos);
+	if (kit->pos == kit->task) {
+		if (flags == BPF_TASK_ITER_PROC_THREADS) {
+			kit->pos = NULL;
+			return pos;
+		}
+	} else
+		return pos;
+
+get_next_task:
+	kit->pos = next_task(kit->pos);
+	kit->task = kit->pos;
+	if (kit->pos == &init_task)
+		kit->pos = NULL;
+
+	return pos;
+}
+
+__bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
+{
+}
+
+__diag_pop();
+
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
 static void do_mmap_read_unlock(struct irq_work *entry)
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 6792ed2b45d7..2f6c747aa874 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -465,5 +465,10 @@ extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
 extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
 extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
 
+struct bpf_iter_task;
+extern int bpf_iter_task_new(struct bpf_iter_task *it,
+		struct task_struct *task, unsigned int flags) __weak __ksym;
+extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
+extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
 
 #endif
-- 
cgit v1.2.3


From 7251d0905e7518bcb990c8e9a3615b1bb23c78f2 Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:42 +0800
Subject: bpf: Introduce css open-coded iterator kfuncs

This Patch adds kfuncs bpf_iter_css_{new,next,destroy} which allow
creation and manipulation of struct bpf_iter_css in open-coded iterator
style. These kfuncs actually wrapps css_next_descendant_{pre, post}.
css_iter can be used to:

1) iterating a sepcific cgroup tree with pre/post/up order

2) iterating cgroup_subsystem in BPF Prog, like
for_each_mem_cgroup_tree/cpuset_for_each_descendant_pre in kernel.

The API design is consistent with cgroup_iter. bpf_iter_css_new accepts
parameters defining iteration order and starting css. Here we also reuse
BPF_CGROUP_ITER_DESCENDANTS_PRE, BPF_CGROUP_ITER_DESCENDANTS_POST,
BPF_CGROUP_ITER_ANCESTORS_UP enums.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-5-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup_iter.c                       | 65 ++++++++++++++++++++++++++
 kernel/bpf/helpers.c                           |  3 ++
 tools/testing/selftests/bpf/bpf_experimental.h |  6 +++
 3 files changed, 74 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 810378f04fbc..209e5135f9fb 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -294,3 +294,68 @@ static int __init bpf_cgroup_iter_init(void)
 }
 
 late_initcall(bpf_cgroup_iter_init);
+
+struct bpf_iter_css {
+	__u64 __opaque[3];
+} __attribute__((aligned(8)));
+
+struct bpf_iter_css_kern {
+	struct cgroup_subsys_state *start;
+	struct cgroup_subsys_state *pos;
+	unsigned int flags;
+} __attribute__((aligned(8)));
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		"Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
+		struct cgroup_subsys_state *start, unsigned int flags)
+{
+	struct bpf_iter_css_kern *kit = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css));
+
+	kit->start = NULL;
+	switch (flags) {
+	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+	case BPF_CGROUP_ITER_DESCENDANTS_POST:
+	case BPF_CGROUP_ITER_ANCESTORS_UP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	kit->start = start;
+	kit->pos = NULL;
+	kit->flags = flags;
+	return 0;
+}
+
+__bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it)
+{
+	struct bpf_iter_css_kern *kit = (void *)it;
+
+	if (!kit->start)
+		return NULL;
+
+	switch (kit->flags) {
+	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+		kit->pos = css_next_descendant_pre(kit->pos, kit->start);
+		break;
+	case BPF_CGROUP_ITER_DESCENDANTS_POST:
+		kit->pos = css_next_descendant_post(kit->pos, kit->start);
+		break;
+	case BPF_CGROUP_ITER_ANCESTORS_UP:
+		kit->pos = kit->pos ? kit->pos->parent : kit->start;
+	}
+
+	return kit->pos;
+}
+
+__bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
+{
+}
+
+__diag_pop();
\ No newline at end of file
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c25941531265..b1d285ed4796 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2566,6 +2566,9 @@ BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 2f6c747aa874..1386baf9ae4a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -471,4 +471,10 @@ extern int bpf_iter_task_new(struct bpf_iter_task *it,
 extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
 extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
 
+struct bpf_iter_css;
+extern int bpf_iter_css_new(struct bpf_iter_css *it,
+				struct cgroup_subsys_state *start, unsigned int flags) __weak __ksym;
+extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
+extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
+
 #endif
-- 
cgit v1.2.3


From dfab99df147b0d364f0c199f832ff2aedfb2265a Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:43 +0800
Subject: bpf: teach the verifier to enforce css_iter and task_iter in RCU CS

css_iter and task_iter should be used in rcu section. Specifically, in
sleepable progs explicit bpf_rcu_read_lock() is needed before use these
iters. In normal bpf progs that have implicit rcu_read_lock(), it's OK to
use them directly.

This patch adds a new a KF flag KF_RCU_PROTECTED for bpf_iter_task_new and
bpf_iter_css_new. It means the kfunc should be used in RCU CS. We check
whether we are in rcu cs before we want to invoke this kfunc. If the rcu
protection is guaranteed, we would let st->type = PTR_TO_STACK | MEM_RCU.
Once user do rcu_unlock during the iteration, state MEM_RCU of regs would
be cleared. is_iter_reg_valid_init() will reject if reg->type is UNTRUSTED.

It is worth noting that currently, bpf_rcu_read_unlock does not
clear the state of the STACK_ITER reg, since bpf_for_each_spilled_reg
only considers STACK_SPILL. This patch also let bpf_for_each_spilled_reg
search STACK_ITER.

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-6-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 19 ++++++++++-------
 include/linux/btf.h          |  1 +
 kernel/bpf/helpers.c         |  4 ++--
 kernel/bpf/verifier.c        | 50 ++++++++++++++++++++++++++++++++++----------
 4 files changed, 53 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 94ec766432f5..e67cd45a85be 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -386,19 +386,18 @@ struct bpf_verifier_state {
 	u32 jmp_history_cnt;
 };
 
-#define bpf_get_spilled_reg(slot, frame)				\
+#define bpf_get_spilled_reg(slot, frame, mask)				\
 	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
-	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	  ((1 << frame->stack[slot].slot_type[0]) & (mask))) \
 	 ? &frame->stack[slot].spilled_ptr : NULL)
 
 /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
-#define bpf_for_each_spilled_reg(iter, frame, reg)			\
-	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame);		\
+#define bpf_for_each_spilled_reg(iter, frame, reg, mask)			\
+	for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask);		\
 	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
-	     iter++, reg = bpf_get_spilled_reg(iter, frame))
+	     iter++, reg = bpf_get_spilled_reg(iter, frame, mask))
 
-/* Invoke __expr over regsiters in __vst, setting __state and __reg */
-#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr)   \
+#define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr)   \
 	({                                                               \
 		struct bpf_verifier_state *___vstate = __vst;            \
 		int ___i, ___j;                                          \
@@ -410,7 +409,7 @@ struct bpf_verifier_state {
 				__reg = &___regs[___j];                  \
 				(void)(__expr);                          \
 			}                                                \
-			bpf_for_each_spilled_reg(___j, __state, __reg) { \
+			bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \
 				if (!__reg)                              \
 					continue;                        \
 				(void)(__expr);                          \
@@ -418,6 +417,10 @@ struct bpf_verifier_state {
 		}                                                        \
 	})
 
+/* Invoke __expr over regsiters in __vst, setting __state and __reg */
+#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \
+	bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr)
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 928113a80a95..c2231c64d60b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -74,6 +74,7 @@
 #define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
 #define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
 #define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
+#define KF_RCU_PROTECTED (1 << 11) /* kfunc should be protected by rcu cs when they are invoked */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b1d285ed4796..da058aead20c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2563,10 +2563,10 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 974713185269..fcdf2382153a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1173,7 +1173,12 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 
 static void __mark_reg_known_zero(struct bpf_reg_state *reg);
 
+static bool in_rcu_cs(struct bpf_verifier_env *env);
+
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta);
+
 static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_kfunc_call_arg_meta *meta,
 				 struct bpf_reg_state *reg, int insn_idx,
 				 struct btf *btf, u32 btf_id, int nr_slots)
 {
@@ -1194,6 +1199,12 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
 
 		__mark_reg_known_zero(st);
 		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		if (is_kfunc_rcu_protected(meta)) {
+			if (in_rcu_cs(env))
+				st->type |= MEM_RCU;
+			else
+				st->type |= PTR_UNTRUSTED;
+		}
 		st->live |= REG_LIVE_WRITTEN;
 		st->ref_obj_id = i == 0 ? id : 0;
 		st->iter.btf = btf;
@@ -1268,7 +1279,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
 	return true;
 }
 
-static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 				   struct btf *btf, u32 btf_id, int nr_slots)
 {
 	struct bpf_func_state *state = func(env, reg);
@@ -1276,26 +1287,28 @@ static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_
 
 	spi = iter_get_spi(env, reg, nr_slots);
 	if (spi < 0)
-		return false;
+		return -EINVAL;
 
 	for (i = 0; i < nr_slots; i++) {
 		struct bpf_stack_state *slot = &state->stack[spi - i];
 		struct bpf_reg_state *st = &slot->spilled_ptr;
 
+		if (st->type & PTR_UNTRUSTED)
+			return -EPROTO;
 		/* only main (first) slot has ref_obj_id set */
 		if (i == 0 && !st->ref_obj_id)
-			return false;
+			return -EINVAL;
 		if (i != 0 && st->ref_obj_id)
-			return false;
+			return -EINVAL;
 		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
-			return false;
+			return -EINVAL;
 
 		for (j = 0; j < BPF_REG_SIZE; j++)
 			if (slot->slot_type[j] != STACK_ITER)
-				return false;
+				return -EINVAL;
 	}
 
-	return true;
+	return 0;
 }
 
 /* Check if given stack slot is "special":
@@ -7640,15 +7653,24 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 				return err;
 		}
 
-		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		err = mark_stack_slots_iter(env, meta, reg, insn_idx, meta->btf, btf_id, nr_slots);
 		if (err)
 			return err;
 	} else {
 		/* iter_next() or iter_destroy() expect initialized iter state*/
-		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+		err = is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots);
+		switch (err) {
+		case 0:
+			break;
+		case -EINVAL:
 			verbose(env, "expected an initialized iter_%s as arg #%d\n",
 				iter_type_str(meta->btf, btf_id), regno);
-			return -EINVAL;
+			return err;
+		case -EPROTO:
+			verbose(env, "expected an RCU CS when using %s\n", meta->func_name);
+			return err;
+		default:
+			return err;
 		}
 
 		spi = iter_get_spi(env, reg, nr_slots);
@@ -10231,6 +10253,11 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RCU;
 }
 
+static bool is_kfunc_rcu_protected(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_RCU_PROTECTED;
+}
+
 static bool __kfunc_param_match_suffix(const struct btf *btf,
 				       const struct btf_param *arg,
 				       const char *suffix)
@@ -11582,6 +11609,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (env->cur_state->active_rcu_lock) {
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
+		u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER);
 
 		if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) {
 			verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n");
@@ -11592,7 +11620,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
 			return -EINVAL;
 		} else if (rcu_unlock) {
-			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+			bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, ({
 				if (reg->type & MEM_RCU) {
 					reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
 					reg->type |= PTR_UNTRUSTED;
-- 
cgit v1.2.3


From cb3ecf7915a1d7ce5304402f4d8616d9fa5193f7 Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Wed, 18 Oct 2023 14:17:44 +0800
Subject: bpf: Let bpf_iter_task_new accept null task ptr

When using task_iter to iterate all threads of a specific task, we enforce
that the user must pass a valid task pointer to ensure safety. However,
when iterating all threads/process in the system, BPF verifier still
require a valid ptr instead of "nullable" pointer, even though it's
pointless, which is a kind of surprising from usability standpoint. It
would be nice if we could let that kfunc accept a explicit null pointer
when we are using BPF_TASK_ITER_ALL_{PROCS, THREADS} and a valid pointer
when using BPF_TASK_ITER_THREAD.

Given a trival kfunc:
	__bpf_kfunc void FN(struct TYPE_A *obj);

BPF Prog would reject a nullptr for obj. The error info is:
"arg#x pointer type xx xx must point to scalar, or struct with scalar"
reported by get_kfunc_ptr_arg_type(). The reg->type is SCALAR_VALUE and
the btf type of ref_t is not scalar or scalar_struct which leads to the
rejection of get_kfunc_ptr_arg_type.

This patch add "__nullable" annotation:
	__bpf_kfunc void FN(struct TYPE_A *obj__nullable);
Here __nullable indicates obj can be optional, user can pass a explicit
nullptr or a normal TYPE_A pointer. In get_kfunc_ptr_arg_type(), we will
detect whether the current arg is optional and register is null, If so,
return a new kfunc_ptr_arg_type KF_ARG_PTR_TO_NULL and skip to the next
arg in check_kfunc_args().

Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231018061746.111364-7-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c |  7 +++++--
 kernel/bpf/verifier.c  | 13 ++++++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index faa1712c1df5..59e747938bdb 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -976,7 +976,7 @@ __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in vmlinux BTF");
 
 __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
-		struct task_struct *task, unsigned int flags)
+		struct task_struct *task__nullable, unsigned int flags)
 {
 	struct bpf_iter_task_kern *kit = (void *)it;
 
@@ -988,14 +988,17 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
 	switch (flags) {
 	case BPF_TASK_ITER_ALL_THREADS:
 	case BPF_TASK_ITER_ALL_PROCS:
+		break;
 	case BPF_TASK_ITER_PROC_THREADS:
+		if (!task__nullable)
+			return -EINVAL;
 		break;
 	default:
 		return -EINVAL;
 	}
 
 	if (flags == BPF_TASK_ITER_PROC_THREADS)
-		kit->task = task;
+		kit->task = task__nullable;
 	else
 		kit->task = &init_task;
 	kit->pos = kit->task;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fcdf2382153a..e9bc5d4a25a1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10332,6 +10332,11 @@ static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf
 	return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
 }
 
+static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param *arg)
+{
+	return __kfunc_param_match_suffix(btf, arg, "__nullable");
+}
+
 static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
 					  const struct btf_param *arg,
 					  const char *name)
@@ -10474,6 +10479,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_CALLBACK,
 	KF_ARG_PTR_TO_RB_ROOT,
 	KF_ARG_PTR_TO_RB_NODE,
+	KF_ARG_PTR_TO_NULL,
 };
 
 enum special_kfunc_type {
@@ -10630,6 +10636,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_CALLBACK;
 
+	if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+		return KF_ARG_PTR_TO_NULL;
 
 	if (argno + 1 < nargs &&
 	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
@@ -11180,7 +11188,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		}
 
 		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
-		    (register_is_null(reg) || type_may_be_null(reg->type))) {
+		    (register_is_null(reg) || type_may_be_null(reg->type)) &&
+			!is_kfunc_arg_nullable(meta->btf, &args[i])) {
 			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
 			return -EACCES;
 		}
@@ -11205,6 +11214,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return kf_arg_type;
 
 		switch (kf_arg_type) {
+		case KF_ARG_PTR_TO_NULL:
+			continue;
 		case KF_ARG_PTR_TO_ALLOC_BTF_ID:
 		case KF_ARG_PTR_TO_BTF_ID:
 			if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
-- 
cgit v1.2.3


From 16ab7cb5825fc3425c16ad2c6e53d827f382d7c6 Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Tue, 10 Oct 2023 22:22:38 +0100
Subject: crypto: pkcs7 - remove sha1 support

Removes support for sha1 signed kernel modules, importing sha1 signed
x.509 certificates.

rsa-pkcs1pad keeps sha1 padding support, which seems to be used by
virtio driver.

sha1 remains available as there are many drivers and subsystems using
it. Note only hmac(sha1) with secret keys remains cryptographically
secure.

In the kernel there are filesystems, IMA, tpm/pcr that appear to be
using sha1. Maybe they can all start to be slowly upgraded to
something else i.e. blake3, ParallelHash, SHAKE256 as needed.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/mscode_parser.c    |  3 --
 crypto/asymmetric_keys/pkcs7_parser.c     |  4 --
 crypto/asymmetric_keys/public_key.c       |  3 +-
 crypto/asymmetric_keys/signature.c        |  2 +-
 crypto/asymmetric_keys/x509_cert_parser.c |  8 ----
 crypto/testmgr.h                          | 80 -------------------------------
 include/linux/oid_registry.h              |  4 --
 kernel/module/Kconfig                     |  5 --
 8 files changed, 2 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c
index 690405ebe77b..6416bded0e07 100644
--- a/crypto/asymmetric_keys/mscode_parser.c
+++ b/crypto/asymmetric_keys/mscode_parser.c
@@ -75,9 +75,6 @@ int mscode_note_digest_algo(void *context, size_t hdrlen,
 
 	oid = look_up_OID(value, vlen);
 	switch (oid) {
-	case OID_sha1:
-		ctx->digest_algo = "sha1";
-		break;
 	case OID_sha256:
 		ctx->digest_algo = "sha256";
 		break;
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index cf4caab9620f..ab647cb4d766 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -227,9 +227,6 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen,
 	struct pkcs7_parse_context *ctx = context;
 
 	switch (ctx->last_oid) {
-	case OID_sha1:
-		ctx->sinfo->sig->hash_algo = "sha1";
-		break;
 	case OID_sha256:
 		ctx->sinfo->sig->hash_algo = "sha256";
 		break;
@@ -272,7 +269,6 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen,
 		ctx->sinfo->sig->pkey_algo = "rsa";
 		ctx->sinfo->sig->encoding = "pkcs1";
 		break;
-	case OID_id_ecdsa_with_sha1:
 	case OID_id_ecdsa_with_sha224:
 	case OID_id_ecdsa_with_sha256:
 	case OID_id_ecdsa_with_sha384:
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index abeecb8329b3..5bf0452c17af 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -116,8 +116,7 @@ software_key_determine_akcipher(const struct public_key *pkey,
 		 */
 		if (!hash_algo)
 			return -EINVAL;
-		if (strcmp(hash_algo, "sha1") != 0 &&
-		    strcmp(hash_algo, "sha224") != 0 &&
+		if (strcmp(hash_algo, "sha224") != 0 &&
 		    strcmp(hash_algo, "sha256") != 0 &&
 		    strcmp(hash_algo, "sha384") != 0 &&
 		    strcmp(hash_algo, "sha512") != 0)
diff --git a/crypto/asymmetric_keys/signature.c b/crypto/asymmetric_keys/signature.c
index 2deff81f8af5..398983be77e8 100644
--- a/crypto/asymmetric_keys/signature.c
+++ b/crypto/asymmetric_keys/signature.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(decrypt_blob);
  * Sign the specified data blob using the private key specified by params->key.
  * The signature is wrapped in an encoding if params->encoding is specified
  * (eg. "pkcs1").  If the encoding needs to know the digest type, this can be
- * passed through params->hash_algo (eg. "sha1").
+ * passed through params->hash_algo (eg. "sha512").
  *
  * Returns the length of the data placed in the signature buffer or an error.
  */
diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 2c30928621b7..68ef1ffbbef6 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -198,10 +198,6 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag,
 	default:
 		return -ENOPKG; /* Unsupported combination */
 
-	case OID_sha1WithRSAEncryption:
-		ctx->cert->sig->hash_algo = "sha1";
-		goto rsa_pkcs1;
-
 	case OID_sha256WithRSAEncryption:
 		ctx->cert->sig->hash_algo = "sha256";
 		goto rsa_pkcs1;
@@ -218,10 +214,6 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag,
 		ctx->cert->sig->hash_algo = "sha224";
 		goto rsa_pkcs1;
 
-	case OID_id_ecdsa_with_sha1:
-		ctx->cert->sig->hash_algo = "sha1";
-		goto ecdsa;
-
 	case OID_id_ecdsa_with_sha224:
 		ctx->cert->sig->hash_algo = "sha224";
 		goto ecdsa;
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 0cd6e0600255..d7e98397549b 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -653,30 +653,6 @@ static const struct akcipher_testvec rsa_tv_template[] = {
 static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = {
 	{
 	.key =
-	"\x04\xf7\x46\xf8\x2f\x15\xf6\x22\x8e\xd7\x57\x4f\xcc\xe7\xbb\xc1"
-	"\xd4\x09\x73\xcf\xea\xd0\x15\x07\x3d\xa5\x8a\x8a\x95\x43\xe4\x68"
-	"\xea\xc6\x25\xc1\xc1\x01\x25\x4c\x7e\xc3\x3c\xa6\x04\x0a\xe7\x08"
-	"\x98",
-	.key_len = 49,
-	.params =
-	"\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
-	"\xce\x3d\x03\x01\x01",
-	.param_len = 21,
-	.m =
-	"\xcd\xb9\xd2\x1c\xb7\x6f\xcd\x44\xb3\xfd\x63\xea\xa3\x66\x7f\xae"
-	"\x63\x85\xe7\x82",
-	.m_size = 20,
-	.algo = OID_id_ecdsa_with_sha1,
-	.c =
-	"\x30\x35\x02\x19\x00\xba\xe5\x93\x83\x6e\xb6\x3b\x63\xa0\x27\x91"
-	"\xc6\xf6\x7f\xc3\x09\xad\x59\xad\x88\x27\xd6\x92\x6b\x02\x18\x10"
-	"\x68\x01\x9d\xba\xce\x83\x08\xef\x95\x52\x7b\xa0\x0f\xe4\x18\x86"
-	"\x80\x6f\xa5\x79\x77\xda\xd0",
-	.c_size = 55,
-	.public_key_vec = true,
-	.siggen_sigver_test = true,
-	}, {
-	.key =
 	"\x04\xb6\x4b\xb1\xd1\xac\xba\x24\x8f\x65\xb2\x60\x00\x90\xbf\xbd"
 	"\x78\x05\x73\xe9\x79\x1d\x6f\x7c\x0b\xd2\xc3\x93\xa7\x28\xe1\x75"
 	"\xf7\xd5\x95\x1d\x28\x10\xc0\x75\x50\x5c\x1a\x4f\x3f\x8f\xa5\xee"
@@ -780,32 +756,6 @@ static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = {
 static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = {
 	{
 	.key =
-	"\x04\xb9\x7b\xbb\xd7\x17\x64\xd2\x7e\xfc\x81\x5d\x87\x06\x83\x41"
-	"\x22\xd6\x9a\xaa\x87\x17\xec\x4f\x63\x55\x2f\x94\xba\xdd\x83\xe9"
-	"\x34\x4b\xf3\xe9\x91\x13\x50\xb6\xcb\xca\x62\x08\xe7\x3b\x09\xdc"
-	"\xc3\x63\x4b\x2d\xb9\x73\x53\xe4\x45\xe6\x7c\xad\xe7\x6b\xb0\xe8"
-	"\xaf",
-	.key_len = 65,
-	.params =
-	"\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
-	"\xce\x3d\x03\x01\x07",
-	.param_len = 21,
-	.m =
-	"\xc2\x2b\x5f\x91\x78\x34\x26\x09\x42\x8d\x6f\x51\xb2\xc5\xaf\x4c"
-	"\x0b\xde\x6a\x42",
-	.m_size = 20,
-	.algo = OID_id_ecdsa_with_sha1,
-	.c =
-	"\x30\x46\x02\x21\x00\xf9\x25\xce\x9f\x3a\xa6\x35\x81\xcf\xd4\xe7"
-	"\xb7\xf0\x82\x56\x41\xf7\xd4\xad\x8d\x94\x5a\x69\x89\xee\xca\x6a"
-	"\x52\x0e\x48\x4d\xcc\x02\x21\x00\xd7\xe4\xef\x52\x66\xd3\x5b\x9d"
-	"\x8a\xfa\x54\x93\x29\xa7\x70\x86\xf1\x03\x03\xf3\x3b\xe2\x73\xf7"
-	"\xfb\x9d\x8b\xde\xd4\x8d\x6f\xad",
-	.c_size = 72,
-	.public_key_vec = true,
-	.siggen_sigver_test = true,
-	}, {
-	.key =
 	"\x04\x8b\x6d\xc0\x33\x8e\x2d\x8b\x67\xf5\xeb\xc4\x7f\xa0\xf5\xd9"
 	"\x7b\x03\xa5\x78\x9a\xb5\xea\x14\xe4\x23\xd0\xaf\xd7\x0e\x2e\xa0"
 	"\xc9\x8b\xdb\x95\xf8\xb3\xaf\xac\x00\x2c\x2c\x1f\x7a\xfd\x95\x88"
@@ -916,36 +866,6 @@ static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = {
 
 static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
 	{
-	.key = /* secp384r1(sha1) */
-	"\x04\x89\x25\xf3\x97\x88\xcb\xb0\x78\xc5\x72\x9a\x14\x6e\x7a\xb1"
-	"\x5a\xa5\x24\xf1\x95\x06\x9e\x28\xfb\xc4\xb9\xbe\x5a\x0d\xd9\x9f"
-	"\xf3\xd1\x4d\x2d\x07\x99\xbd\xda\xa7\x66\xec\xbb\xea\xba\x79\x42"
-	"\xc9\x34\x89\x6a\xe7\x0b\xc3\xf2\xfe\x32\x30\xbe\xba\xf9\xdf\x7e"
-	"\x4b\x6a\x07\x8e\x26\x66\x3f\x1d\xec\xa2\x57\x91\x51\xdd\x17\x0e"
-	"\x0b\x25\xd6\x80\x5c\x3b\xe6\x1a\x98\x48\x91\x45\x7a\x73\xb0\xc3"
-	"\xf1",
-	.key_len = 97,
-	.params =
-	"\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
-	"\x00\x22",
-	.param_len = 18,
-	.m =
-	"\x12\x55\x28\xf0\x77\xd5\xb6\x21\x71\x32\x48\xcd\x28\xa8\x25\x22"
-	"\x3a\x69\xc1\x93",
-	.m_size = 20,
-	.algo = OID_id_ecdsa_with_sha1,
-	.c =
-	"\x30\x66\x02\x31\x00\xf5\x0f\x24\x4c\x07\x93\x6f\x21\x57\x55\x07"
-	"\x20\x43\x30\xde\xa0\x8d\x26\x8e\xae\x63\x3f\xbc\x20\x3a\xc6\xf1"
-	"\x32\x3c\xce\x70\x2b\x78\xf1\x4c\x26\xe6\x5b\x86\xcf\xec\x7c\x7e"
-	"\xd0\x87\xd7\xd7\x6e\x02\x31\x00\xcd\xbb\x7e\x81\x5d\x8f\x63\xc0"
-	"\x5f\x63\xb1\xbe\x5e\x4c\x0e\xa1\xdf\x28\x8c\x1b\xfa\xf9\x95\x88"
-	"\x74\xa0\x0f\xbf\xaf\xc3\x36\x76\x4a\xa1\x59\xf1\x1c\xa4\x58\x26"
-	"\x79\x12\x2a\xb7\xc5\x15\x92\xc5",
-	.c_size = 104,
-	.public_key_vec = true,
-	.siggen_sigver_test = true,
-	}, {
 	.key = /* secp384r1(sha224) */
 	"\x04\x69\x6c\xcf\x62\xee\xd0\x0d\xe5\xb5\x2f\x70\x54\xcf\x26\xa0"
 	"\xd9\x98\x8d\x92\x2a\xab\x9b\x11\xcb\x48\x18\xa1\xa9\x0d\xd5\x18"
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 4d04fa5d1eec..8b79e55cfcec 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -17,12 +17,10 @@
  *	  build_OID_registry.pl to generate the data for look_up_OID().
  */
 enum OID {
-	OID_id_dsa_with_sha1,		/* 1.2.840.10030.4.3 */
 	OID_id_dsa,			/* 1.2.840.10040.4.1 */
 	OID_id_ecPublicKey,		/* 1.2.840.10045.2.1 */
 	OID_id_prime192v1,		/* 1.2.840.10045.3.1.1 */
 	OID_id_prime256v1,		/* 1.2.840.10045.3.1.7 */
-	OID_id_ecdsa_with_sha1,		/* 1.2.840.10045.4.1 */
 	OID_id_ecdsa_with_sha224,	/* 1.2.840.10045.4.3.1 */
 	OID_id_ecdsa_with_sha256,	/* 1.2.840.10045.4.3.2 */
 	OID_id_ecdsa_with_sha384,	/* 1.2.840.10045.4.3.3 */
@@ -30,7 +28,6 @@ enum OID {
 
 	/* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */
 	OID_rsaEncryption,		/* 1.2.840.113549.1.1.1 */
-	OID_sha1WithRSAEncryption,	/* 1.2.840.113549.1.1.5 */
 	OID_sha256WithRSAEncryption,	/* 1.2.840.113549.1.1.11 */
 	OID_sha384WithRSAEncryption,	/* 1.2.840.113549.1.1.12 */
 	OID_sha512WithRSAEncryption,	/* 1.2.840.113549.1.1.13 */
@@ -67,7 +64,6 @@ enum OID {
 	OID_PKU2U,			/* 1.3.5.1.5.2.7 */
 	OID_Scram,			/* 1.3.6.1.5.5.14 */
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
-	OID_sha1,			/* 1.3.14.3.2.26 */
 	OID_id_ansip384r1,		/* 1.3.132.0.34 */
 	OID_sha256,			/* 2.16.840.1.101.3.4.2.1 */
 	OID_sha384,			/* 2.16.840.1.101.3.4.2.2 */
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 33a2e991f608..19a53d5e7736 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,10 +236,6 @@ choice
 	  possible to load a signed module containing the algorithm to check
 	  the signature on that module.
 
-config MODULE_SIG_SHA1
-	bool "Sign modules with SHA-1"
-	select CRYPTO_SHA1
-
 config MODULE_SIG_SHA224
 	bool "Sign modules with SHA-224"
 	select CRYPTO_SHA256
@@ -261,7 +257,6 @@ endchoice
 config MODULE_SIG_HASH
 	string
 	depends on MODULE_SIG || IMA_APPRAISE_MODSIG
-	default "sha1" if MODULE_SIG_SHA1
 	default "sha224" if MODULE_SIG_SHA224
 	default "sha256" if MODULE_SIG_SHA256
 	default "sha384" if MODULE_SIG_SHA384
-- 
cgit v1.2.3


From fc3225fd6f1e6ac07a8463e7751ecfa228880c71 Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Tue, 10 Oct 2023 22:26:33 +0100
Subject: module: Do not offer sha224 for built-in module signing

sha224 does not provide enough security against collision attacks
relative to the default keys used for signing (RSA 4k & P-384). Also
sha224 never became popular, as sha256 got widely adopter ahead of
sha224 being introduced.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/module/Kconfig | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 19a53d5e7736..9d7d45525fc4 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -236,10 +236,6 @@ choice
 	  possible to load a signed module containing the algorithm to check
 	  the signature on that module.
 
-config MODULE_SIG_SHA224
-	bool "Sign modules with SHA-224"
-	select CRYPTO_SHA256
-
 config MODULE_SIG_SHA256
 	bool "Sign modules with SHA-256"
 	select CRYPTO_SHA256
@@ -257,7 +253,6 @@ endchoice
 config MODULE_SIG_HASH
 	string
 	depends on MODULE_SIG || IMA_APPRAISE_MODSIG
-	default "sha224" if MODULE_SIG_SHA224
 	default "sha256" if MODULE_SIG_SHA256
 	default "sha384" if MODULE_SIG_SHA384
 	default "sha512" if MODULE_SIG_SHA512
-- 
cgit v1.2.3


From fb064e5ae1657595c090ebbc5b15787a3ef603e9 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Fri, 20 Oct 2023 01:40:27 +0000
Subject: sched/nohz: Update comments about NEWILB_KICK

How ILB is triggered without IPIs is cryptic. Out of mercy for future
code readers, document it in code comments.

The comments are derived from a discussion with Vincent in a past
review.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231020014031.919742-2-joel@joelfernandes.org
---
 kernel/sched/fair.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9ae2208089e4..8c486ffcb779 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12005,8 +12005,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 }
 
 /*
- * Check if we need to run the ILB for updating blocked load before entering
- * idle state.
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not yet be stopped on
+ * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
+ * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
+ * called from this function on (this) CPU that's not yet in the mask. That's
+ * OK because the goal of nohz_run_idle_balance() is to run ILB only for
+ * updating the blocked load of already idle CPUs without waking up one of
+ * those idle CPUs and outside the preempt disable / irq off phase of the local
+ * cpu about to enter idle, because it can take a long time.
  */
 void nohz_run_idle_balance(int cpu)
 {
-- 
cgit v1.2.3


From b022f0c7e404887a7c5229788fc99eff9f9a80d5 Mon Sep 17 00:00:00 2001
From: Francis Laniel <flaniel@linux.microsoft.com>
Date: Fri, 20 Oct 2023 13:42:49 +0300
Subject: tracing/kprobes: Return EADDRNOTAVAIL when func matches several
 symbols

When a kprobe is attached to a function that's name is not unique (is
static and shares the name with other functions in the kernel), the
kprobe is attached to the first function it finds. This is a bug as the
function that it is attaching to is not necessarily the one that the
user wants to attach to.

Instead of blindly picking a function to attach to what is ambiguous,
error with EADDRNOTAVAIL to let the user know that this function is not
unique, and that the user must use another unique function with an
address offset to get to the function they want to attach to.

Link: https://lore.kernel.org/all/20231020104250.9537-2-flaniel@linux.microsoft.com/

Cc: stable@vger.kernel.org
Fixes: 413d37d1eb69 ("tracing: Add kprobe-based event tracer")
Suggested-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Francis Laniel <flaniel@linux.microsoft.com>
Link: https://lore.kernel.org/lkml/20230819101105.b0c104ae4494a7d1f2eea742@kernel.org/
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_kprobe.c | 63 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_probe.h  |  1 +
 2 files changed, 64 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3d7a180a8427..a8fef6ab0872 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -705,6 +705,25 @@ static struct notifier_block trace_kprobe_module_nb = {
 	.priority = 1	/* Invoked after kprobe module callback */
 };
 
+static int count_symbols(void *data, unsigned long unused)
+{
+	unsigned int *count = data;
+
+	(*count)++;
+
+	return 0;
+}
+
+static unsigned int number_of_same_symbols(char *func_name)
+{
+	unsigned int count;
+
+	count = 0;
+	kallsyms_on_each_match_symbol(count_symbols, func_name, &count);
+
+	return count;
+}
+
 static int __trace_kprobe_create(int argc, const char *argv[])
 {
 	/*
@@ -836,6 +855,31 @@ static int __trace_kprobe_create(int argc, const char *argv[])
 		}
 	}
 
+	if (symbol && !strchr(symbol, ':')) {
+		unsigned int count;
+
+		count = number_of_same_symbols(symbol);
+		if (count > 1) {
+			/*
+			 * Users should use ADDR to remove the ambiguity of
+			 * using KSYM only.
+			 */
+			trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+			ret = -EADDRNOTAVAIL;
+
+			goto error;
+		} else if (count == 0) {
+			/*
+			 * We can return ENOENT earlier than when register the
+			 * kprobe.
+			 */
+			trace_probe_log_err(0, BAD_PROBE_ADDR);
+			ret = -ENOENT;
+
+			goto error;
+		}
+	}
+
 	trace_probe_log_set_index(0);
 	if (event) {
 		ret = traceprobe_parse_event_name(&event, &group, gbuf,
@@ -1695,6 +1739,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk)
 }
 
 #ifdef CONFIG_PERF_EVENTS
+
 /* create a trace_kprobe, but don't add it to global lists */
 struct trace_event_call *
 create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
@@ -1705,6 +1750,24 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
 	int ret;
 	char *event;
 
+	if (func) {
+		unsigned int count;
+
+		count = number_of_same_symbols(func);
+		if (count > 1)
+			/*
+			 * Users should use addr to remove the ambiguity of
+			 * using func only.
+			 */
+			return ERR_PTR(-EADDRNOTAVAIL);
+		else if (count == 0)
+			/*
+			 * We can return ENOENT earlier than when register the
+			 * kprobe.
+			 */
+			return ERR_PTR(-ENOENT);
+	}
+
 	/*
 	 * local trace_kprobes are not added to dyn_event, so they are never
 	 * searched in find_trace_kprobe(). Therefore, there is no concern of
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 02b432ae7513..850d9ecb6765 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -450,6 +450,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(BAD_MAXACT,		"Invalid maxactive number"),		\
 	C(MAXACT_TOO_BIG,	"Maxactive is too big"),		\
 	C(BAD_PROBE_ADDR,	"Invalid probed address or symbol"),	\
+	C(NON_UNIQ_SYMBOL,	"The symbol is not unique"),		\
 	C(BAD_RETPROBE,		"Retprobe address must be an function entry"), \
 	C(NO_TRACEPOINT,	"Tracepoint is not found"),		\
 	C(BAD_ADDR_SUFFIX,	"Invalid probed address suffix"), \
-- 
cgit v1.2.3


From 4c456c9ad334a940e354da1002184bc19f4493ef Mon Sep 17 00:00:00 2001
From: Yiwei Lin <s921975628@gmail.com>
Date: Fri, 20 Oct 2023 13:56:17 +0800
Subject: sched/fair: Remove unused 'curr' argument from pick_next_entity()

The 'curr' argument of pick_next_entity() has become unused after
the EEVDF changes.

[ mingo: Updated the changelog. ]

Signed-off-by: Yiwei Lin <s921975628@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com
---
 kernel/sched/fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c486ffcb779..4b70b0d14698 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5256,7 +5256,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	/*
 	 * Enabling NEXT_BUDDY will affect latency but not fairness.
@@ -8160,7 +8160,7 @@ again:
 				goto again;
 		}
 
-		se = pick_next_entity(cfs_rq, curr);
+		se = pick_next_entity(cfs_rq);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
@@ -8223,7 +8223,7 @@ again:
 			}
 		}
 
-		se = pick_next_entity(cfs_rq, curr);
+		se = pick_next_entity(cfs_rq);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
@@ -8262,7 +8262,7 @@ simple:
 		put_prev_task(rq, prev);
 
 	do {
-		se = pick_next_entity(cfs_rq, NULL);
+		se = pick_next_entity(cfs_rq);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
-- 
cgit v1.2.3


From 5264a2f4bb3baf712e19f1f053caaa8d7d3afa2e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Fri, 20 Oct 2023 16:52:45 +0300
Subject: tracing: Fix a NULL vs IS_ERR() bug in event_subsystem_dir()

The eventfs_create_dir() function returns error pointers, it never returns
NULL.  Update the check to reflect that.

Link: https://lore.kernel.org/linux-trace-kernel/ff641474-84e2-46a7-9d7a-62b251a1050c@moroto.mountain

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Fixes: 5790b1fb3d67 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index db46d2116500..f9e3e24d8796 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2354,7 +2354,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 		nr_entries = ARRAY_SIZE(system_entries);
 
 	ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir);
-	if (!ei) {
+	if (IS_ERR(ei)) {
 		pr_warn("Failed to create system directory %s\n", name);
 		__put_system(system);
 		goto out_free;
-- 
cgit v1.2.3


From d0ed46b60396cfa7e0056f55e1ce0b43c7db57b6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 20 Oct 2023 04:35:45 +0100
Subject: tracing: Move readpos from seq_buf to trace_seq

To make seq_buf more lightweight as a string buf, move the readpos member
from seq_buf to its container, trace_seq.  That puts the responsibility
of maintaining the readpos entirely in the tracing code.  If some future
users want to package up the readpos with a seq_buf, we can define a
new struct then.

Link: https://lore.kernel.org/linux-trace-kernel/20231020033545.2587554-2-willy@infradead.org

Cc: Kees Cook <keescook@chromium.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h   |  5 +----
 include/linux/trace_seq.h |  2 ++
 kernel/trace/trace.c      | 10 +++++-----
 kernel/trace/trace_seq.c  |  6 +++++-
 lib/seq_buf.c             | 22 ++++++++++------------
 5 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 515d7fcb9634..a0fb013cebdf 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -14,19 +14,16 @@
  * @buffer:	pointer to the buffer
  * @size:	size of the buffer
  * @len:	the amount of data inside the buffer
- * @readpos:	The next position to read in the buffer.
  */
 struct seq_buf {
 	char			*buffer;
 	size_t			size;
 	size_t			len;
-	loff_t			readpos;
 };
 
 static inline void seq_buf_clear(struct seq_buf *s)
 {
 	s->len = 0;
-	s->readpos = 0;
 }
 
 static inline void
@@ -143,7 +140,7 @@ extern __printf(2, 0)
 int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args);
 extern int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s);
 extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf,
-			   int cnt);
+			   size_t start, int cnt);
 extern int seq_buf_puts(struct seq_buf *s, const char *str);
 extern int seq_buf_putc(struct seq_buf *s, unsigned char c);
 extern int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len);
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 6be92bf559fe..3691e0e76a1a 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -14,6 +14,7 @@
 struct trace_seq {
 	char			buffer[PAGE_SIZE];
 	struct seq_buf		seq;
+	size_t			readpos;
 	int			full;
 };
 
@@ -22,6 +23,7 @@ trace_seq_init(struct trace_seq *s)
 {
 	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
 	s->full = 0;
+	s->readpos = 0;
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4383be8fa1b0..d629065c2383 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1731,15 +1731,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 {
 	int len;
 
-	if (trace_seq_used(s) <= s->seq.readpos)
+	if (trace_seq_used(s) <= s->readpos)
 		return -EBUSY;
 
-	len = trace_seq_used(s) - s->seq.readpos;
+	len = trace_seq_used(s) - s->readpos;
 	if (cnt > len)
 		cnt = len;
-	memcpy(buf, s->buffer + s->seq.readpos, cnt);
+	memcpy(buf, s->buffer + s->readpos, cnt);
 
-	s->seq.readpos += cnt;
+	s->readpos += cnt;
 	return cnt;
 }
 
@@ -7008,7 +7008,7 @@ waitagain:
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
-	if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
+	if (iter->seq.readpos >= trace_seq_used(&iter->seq))
 		trace_seq_init(&iter->seq);
 
 	/*
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index bac06ee3b98b..7be97229ddf8 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -370,8 +370,12 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
  */
 int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
 {
+	int ret;
 	__trace_seq_init(s);
-	return seq_buf_to_user(&s->seq, ubuf, cnt);
+	ret = seq_buf_to_user(&s->seq, ubuf, s->readpos, cnt);
+	if (ret > 0)
+		s->readpos += ret;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(trace_seq_to_user);
 
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 45c450f423fa..b7477aefff53 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -324,23 +324,24 @@ int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc)
  * seq_buf_to_user - copy the sequence buffer to user space
  * @s: seq_buf descriptor
  * @ubuf: The userspace memory location to copy to
+ * @start: The first byte in the buffer to copy
  * @cnt: The amount to copy
  *
  * Copies the sequence buffer into the userspace memory pointed to
- * by @ubuf. It starts from the last read position (@s->readpos)
- * and writes up to @cnt characters or till it reaches the end of
- * the content in the buffer (@s->len), which ever comes first.
+ * by @ubuf. It starts from @start and writes up to @cnt characters
+ * or until it reaches the end of the content in the buffer (@s->len),
+ * whichever comes first.
  *
  * On success, it returns a positive number of the number of bytes
  * it copied.
  *
  * On failure it returns -EBUSY if all of the content in the
  * sequence has been already read, which includes nothing in the
- * sequence (@s->len == @s->readpos).
+ * sequence (@s->len == @start).
  *
  * Returns -EFAULT if the copy to userspace fails.
  */
-int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt)
+int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, size_t start, int cnt)
 {
 	int len;
 	int ret;
@@ -350,20 +351,17 @@ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, int cnt)
 
 	len = seq_buf_used(s);
 
-	if (len <= s->readpos)
+	if (len <= start)
 		return -EBUSY;
 
-	len -= s->readpos;
+	len -= start;
 	if (cnt > len)
 		cnt = len;
-	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	ret = copy_to_user(ubuf, s->buffer + start, cnt);
 	if (ret == cnt)
 		return -EFAULT;
 
-	cnt -= ret;
-
-	s->readpos += cnt;
-	return cnt;
+	return cnt - ret;
 }
 
 /**
-- 
cgit v1.2.3


From baa8fdecd87bb8751237b45e3bcb5a179e5a12ca Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:58 +0800
Subject: bpf: Re-enable unit_size checking for global per-cpu allocator

With pcpu_alloc_size() in place, check whether or not the size of
the dynamic per-cpu area is matched with unit_size.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 39ea316c55e7..776bdf5ffd80 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -491,21 +491,17 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
 	struct llist_node *first;
 	unsigned int obj_size;
 
-	/* For per-cpu allocator, the size of free objects in free list doesn't
-	 * match with unit_size and now there is no way to get the size of
-	 * per-cpu pointer saved in free object, so just skip the checking.
-	 */
-	if (c->percpu_size)
-		return 0;
-
 	first = c->free_llist.first;
 	if (!first)
 		return 0;
 
-	obj_size = ksize(first);
+	if (c->percpu_size)
+		obj_size = pcpu_alloc_size(((void **)first)[1]);
+	else
+		obj_size = ksize(first);
 	if (obj_size != c->unit_size) {
-		WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
-			  idx, obj_size, c->unit_size);
+		WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
+			  idx, c->percpu_size, obj_size, c->unit_size);
 		return -EINVAL;
 	}
 	return 0;
@@ -973,6 +969,12 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
+/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
+ * actual size of dynamic per-cpu area will always be matched and there is
+ * no need to adjust size_index for per-cpu allocation. However for the
+ * simplicity of the implementation, use an unified size_index for both
+ * kmalloc and per-cpu allocation.
+ */
 static __init int bpf_mem_cache_adjust_size(void)
 {
 	unsigned int size;
-- 
cgit v1.2.3


From 3f2189e4f77b7a3e979d143dc4ff586488c7e8a5 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:31:59 +0800
Subject: bpf: Use pcpu_alloc_size() in bpf_mem_free{_rcu}()

For bpf_global_percpu_ma, the pointer passed to bpf_mem_free_rcu() is
allocated by kmalloc() and its size is fixed (16-bytes on x86-64). So
no matter which cache allocates the dynamic per-cpu area, on x86-64
cache[2] will always be used to free the per-cpu area.

Fix the unbalance by checking whether the bpf memory allocator is
per-cpu or not and use pcpu_alloc_size() instead of ksize() to
find the correct cache for per-cpu free.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  1 +
 kernel/bpf/memalloc.c         | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index d644bbb298af..bb1223b21308 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	bool percpu;
 	struct work_struct work;
 };
 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 776bdf5ffd80..5308e386380a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -525,6 +525,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	/* room for llist_node and per-cpu pointer */
 	if (percpu)
 		percpu_size = LLIST_NODE_SZ + sizeof(void *);
+	ma->percpu = percpu;
 
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
@@ -874,6 +875,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
+static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
+{
+	size_t size;
+
+	if (percpu)
+		size = pcpu_alloc_size(*((void **)ptr));
+	else
+		size = ksize(ptr - LLIST_NODE_SZ);
+	return bpf_mem_cache_idx(size);
+}
+
 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 {
 	int idx;
@@ -881,7 +893,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_free_idx(ptr, ma->percpu);
 	if (idx < 0)
 		return;
 
@@ -895,7 +907,7 @@ void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_free_idx(ptr, ma->percpu);
 	if (idx < 0)
 		return;
 
-- 
cgit v1.2.3


From e581a3461de3f129cfe888a67d9f31086328271f Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:00 +0800
Subject: bpf: Move the declaration of __bpf_obj_drop_impl() to bpf.h

both syscall.c and helpers.c have the declaration of
__bpf_obj_drop_impl(), so just move it to a common header file.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 1 +
 kernel/bpf/helpers.c | 2 --
 kernel/bpf/syscall.c | 2 --
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4b40b45962b..ebd412179771 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,6 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index da058aead20c..c814bb44d2d1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1811,8 +1811,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 341f8cb4405c..69998f84f7c8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -626,8 +626,6 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
-- 
cgit v1.2.3


From e383a45902337356d9ccad797094a27c6b2150f9 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 20 Oct 2023 21:32:01 +0800
Subject: bpf: Use bpf_global_percpu_ma for per-cpu kptr in
 __bpf_obj_drop_impl()

The following warning was reported when running "./test_progs -t
test_bpf_ma/percpu_free_through_map_free":

  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 68 at kernel/bpf/memalloc.c:342
  CPU: 1 PID: 68 Comm: kworker/u16:2 Not tainted 6.6.0-rc2+ #222
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  Workqueue: events_unbound bpf_map_free_deferred
  RIP: 0010:bpf_mem_refill+0x21c/0x2a0
  ......
  Call Trace:
   <IRQ>
   ? bpf_mem_refill+0x21c/0x2a0
   irq_work_single+0x27/0x70
   irq_work_run_list+0x2a/0x40
   irq_work_run+0x18/0x40
   __sysvec_irq_work+0x1c/0xc0
   sysvec_irq_work+0x73/0x90
   </IRQ>
   <TASK>
   asm_sysvec_irq_work+0x1b/0x20
  RIP: 0010:unit_free+0x50/0x80
   ......
   bpf_mem_free+0x46/0x60
   __bpf_obj_drop_impl+0x40/0x90
   bpf_obj_free_fields+0x17d/0x1a0
   array_map_free+0x6b/0x170
   bpf_map_free_deferred+0x54/0xa0
   process_scheduled_works+0xba/0x370
   worker_thread+0x16d/0x2e0
   kthread+0x105/0x140
   ret_from_fork+0x39/0x60
   ret_from_fork_asm+0x1b/0x30
   </TASK>
  ---[ end trace 0000000000000000 ]---

The reason is simple: __bpf_obj_drop_impl() does not know the freeing
field is a per-cpu pointer and it uses bpf_global_ma to free the
pointer. Because bpf_global_ma is not a per-cpu allocator, so ksize() is
used to select the corresponding cache. The bpf_mem_cache with 16-bytes
unit_size will always be selected to do the unmatched free and it will
trigger the warning in free_bulk() eventually.

Because per-cpu kptr doesn't support list or rb-tree now, so fix the
problem by only checking whether or not the type of kptr is per-cpu in
bpf_obj_free_fields(), and using bpf_global_percpu_ma to these kptrs.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231020133202.4043247-7-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 +-
 kernel/bpf/helpers.c | 22 ++++++++++++++--------
 kernel/bpf/syscall.c |  4 ++--
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ebd412179771..b4825d3cdb29 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2058,7 +2058,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
 struct bpf_map *bpf_map_get(u32 ufd);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c814bb44d2d1..e46ac288a108 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1842,7 +1842,7 @@ unlock:
 		 * bpf_list_head which needs to be freed.
 		 */
 		migrate_disable();
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 		migrate_enable();
 	}
 }
@@ -1881,7 +1881,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 
 
 		migrate_disable();
-		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
 		migrate_enable();
 	}
 }
@@ -1913,8 +1913,10 @@ __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 }
 
 /* Must be called under migrate_disable(), as required by bpf_mem_free */
-void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
 {
+	struct bpf_mem_alloc *ma;
+
 	if (rec && rec->refcount_off >= 0 &&
 	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
 		/* Object is refcounted and refcount_dec didn't result in 0
@@ -1926,10 +1928,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 	if (rec)
 		bpf_obj_free_fields(rec, p);
 
+	if (percpu)
+		ma = &bpf_global_percpu_ma;
+	else
+		ma = &bpf_global_ma;
 	if (rec && rec->refcount_off >= 0)
-		bpf_mem_free_rcu(&bpf_global_ma, p);
+		bpf_mem_free_rcu(ma, p);
 	else
-		bpf_mem_free(&bpf_global_ma, p);
+		bpf_mem_free(ma, p);
 }
 
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1937,7 +1943,7 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
 
-	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
+	__bpf_obj_drop_impl(p, meta ? meta->record : NULL, false);
 }
 
 __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
@@ -1981,7 +1987,7 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
 	 */
 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec);
+		__bpf_obj_drop_impl((void *)n - off, rec, false);
 		return -EINVAL;
 	}
 
@@ -2080,7 +2086,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
 	 */
 	if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
 		/* Only called from BPF prog, no need to migrate_disable */
-		__bpf_obj_drop_impl((void *)n - off, rec);
+		__bpf_obj_drop_impl((void *)n - off, rec, false);
 		return -EINVAL;
 	}
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 69998f84f7c8..a9b2cb500bf7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -660,8 +660,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 									   field->kptr.btf_id);
 				migrate_disable();
 				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
-								 pointee_struct_meta->record :
-								 NULL);
+								 pointee_struct_meta->record : NULL,
+								 fields[i].type == BPF_KPTR_PERCPU);
 				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
-- 
cgit v1.2.3


From da323d4640704001f2287f729124e1cd9d5684d0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 5 Oct 2023 09:06:55 +0200
Subject: dma-direct: add dependencies to CONFIG_DMA_GLOBAL_POOL

CONFIG_DMA_GLOBAL_POOL can't be combined with other DMA coherent
allocators.  Add dependencies to Kconfig to document this, and make
kconfig complain about unmet dependencies if someone tries.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Greg Ungerer <gerg@linux-m68k.org>
Tested-by: Greg Ungerer <gerg@linux-m68k.org>
---
 kernel/dma/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index f488997b0717..4524db877eba 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -135,6 +135,8 @@ config DMA_COHERENT_POOL
 
 config DMA_GLOBAL_POOL
 	select DMA_DECLARE_COHERENT
+	depends on !ARCH_HAS_DMA_SET_UNCACHED
+	depends on !DMA_DIRECT_REMAP
 	bool
 
 config DMA_DIRECT_REMAP
-- 
cgit v1.2.3


From 2c8ed1b960fb97c82ede5afc974329bfdb457f5f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 5 Oct 2023 09:05:36 +0200
Subject: dma-direct: add a CONFIG_ARCH_HAS_DMA_ALLOC symbol

Instead of using arch_dma_alloc if none of the generic coherent
allocators are used, require the architectures to explicitly opt into
providing it.  This will used to deal with the case of m68knommu and
coldfire where we can't do any coherent allocations whatsoever, and
also makes it clear that arch_dma_alloc is a last resort.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Greg Ungerer <gerg@linux-m68k.org>
Tested-by: Greg Ungerer <gerg@linux-m68k.org>
---
 arch/arm/Kconfig    |  1 +
 arch/m68k/Kconfig   |  1 +
 arch/parisc/Kconfig |  1 +
 kernel/dma/Kconfig  |  9 +++++++++
 kernel/dma/direct.c | 12 ++----------
 5 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9557808e8937..f8567e95f98b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -8,6 +8,7 @@ config ARM
 	select ARCH_HAS_CPU_FINALIZE_INIT if MMU
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
+	select ARCH_HAS_DMA_ALLOC if MMU
 	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 3e318bf9504c..4f3e7dec2171 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -6,6 +6,7 @@ config M68K
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_CPU_FINALIZE_INIT if MMU
 	select ARCH_HAS_CURRENT_STACK_POINTER
+	select ARCH_HAS_DMA_ALLOC if !MMU || COLDFIRE
 	select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE
 	select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index a15ab147af2e..46e8e4ea7a57 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -8,6 +8,7 @@ config PARISC
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_SYSCALL_TRACEPOINTS
 	select ARCH_WANT_FRAME_POINTERS
+	select ARCH_HAS_DMA_ALLOC if PA11
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 4524db877eba..d62f5957f36b 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -144,6 +144,15 @@ config DMA_DIRECT_REMAP
 	select DMA_COHERENT_POOL
 	select DMA_NONCOHERENT_MMAP
 
+#
+# Fallback to arch code for DMA allocations.  This should eventually go away.
+#
+config ARCH_HAS_DMA_ALLOC
+	depends on !ARCH_HAS_DMA_SET_UNCACHED
+	depends on !DMA_DIRECT_REMAP
+	depends on !DMA_GLOBAL_POOL
+	bool
+
 config DMA_CMA
 	bool "DMA Contiguous Memory Allocator"
 	depends on HAVE_DMA_CONTIGUOUS && CMA
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9596ae1aa0da..c078090cd38e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -220,13 +220,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 		return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
 
 	if (!dev_is_dma_coherent(dev)) {
-		/*
-		 * Fallback to the arch handler if it exists.  This should
-		 * eventually go away.
-		 */
-		if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-		    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-		    !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+		if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
 		    !is_swiotlb_for_alloc(dev))
 			return arch_dma_alloc(dev, size, dma_handle, gfp,
 					      attrs);
@@ -330,9 +324,7 @@ void dma_direct_free(struct device *dev, size_t size,
 		return;
 	}
 
-	if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-	    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-	    !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+	if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) &&
 	    !dev_is_dma_coherent(dev) &&
 	    !is_swiotlb_for_alloc(dev)) {
 		arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
-- 
cgit v1.2.3


From b1da46d70e5427a9ba5eae28bcca29a20e68a442 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 6 Oct 2023 15:13:34 +0200
Subject: dma-direct: simplify the use atomic pool logic in dma_direct_alloc

The logic in dma_direct_alloc when to use the atomic pool vs remapping
grew a bit unreadable.  Consolidate it into a single check, and clean
up the set_uncached vs remap logic a bit as well.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Greg Ungerer <gerg@linux-m68k.org>
Tested-by: Greg Ungerer <gerg@linux-m68k.org>
---
 kernel/dma/direct.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c078090cd38e..9657ef7c055e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -234,27 +234,22 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 					dma_handle);
 
 		/*
-		 * Otherwise remap if the architecture is asking for it.  But
-		 * given that remapping memory is a blocking operation we'll
-		 * instead have to dip into the atomic pools.
+		 * Otherwise we require the architecture to either be able to
+		 * mark arbitrary parts of the kernel direct mapping uncached,
+		 * or remapped it uncached.
 		 */
+		set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED);
 		remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
-		if (remap) {
-			if (dma_direct_use_pool(dev, gfp))
-				return dma_direct_alloc_from_pool(dev, size,
-						dma_handle, gfp);
-		} else {
-			if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
-				return NULL;
-			set_uncached = true;
-		}
+		if (!set_uncached && !remap)
+			return NULL;
 	}
 
 	/*
-	 * Decrypting memory may block, so allocate the memory from the atomic
-	 * pools if we can't block.
+	 * Remapping or decrypting memory may block, allocate the memory from
+	 * the atomic pools instead if we aren't allowed block.
 	 */
-	if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
+	if ((remap || force_dma_unencrypted(dev)) &&
+	    dma_direct_use_pool(dev, gfp))
 		return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
 	/* we always manually zero the memory once we are done */
-- 
cgit v1.2.3


From 63f067e33c996a6e4d9a34d138116a43105182f1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 6 Oct 2023 15:17:54 +0200
Subject: dma-direct: warn when coherent allocations aren't supported

Log a warning once when dma_alloc_coherent fails because the platform
does not support coherent allocations at all.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Greg Ungerer <gerg@linux-m68k.org>
Tested-by: Greg Ungerer <gerg@linux-m68k.org>
---
 kernel/dma/direct.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 9657ef7c055e..ed3056eb20b8 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -240,8 +240,10 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 		 */
 		set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED);
 		remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
-		if (!set_uncached && !remap)
+		if (!set_uncached && !remap) {
+			pr_warn_once("coherent DMA allocations not supported on this platform.\n");
 			return NULL;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From 1132a1dc053ef4391bb09fdb2242a628615291bb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 20 Oct 2023 08:42:15 -0700
Subject: swiotlb: rewrite comment explaining why the source is preserved on
 DMA_FROM_DEVICE

Rewrite the comment explaining why swiotlb copies the original buffer to
the TLB buffer before initiating DMA *from* the device, i.e. before the
device DMAs into the TLB buffer.  The existing comment's argument that
preserving the original data can prevent a kernel memory leak is bogus.

If the driver that triggered the mapping _knows_ that the device will
overwrite the entire mapping, or the driver will consume only the written
parts, then copying from the original memory is completely pointless.

If neither of the above holds true, then copying from the original adds
value only if preserving the data is necessary for functional
correctness, or the driver explicitly initialized the original memory.
If the driver didn't initialize the memory, then copying the original
buffer to the TLB buffer simply changes what kernel data is leaked to
user space.

Writing the entire TLB buffer _does_ prevent leaking stale TLB buffer
data from a previous bounce, but that can be achieved by simply zeroing
the TLB buffer when grabbing a slot.

The real reason swiotlb ended up initializing the TLB buffer with the
original buffer is that it's necessary to make swiotlb operate as
transparently as possible, i.e. to behave as closely as possible to
hardware, and to avoid corrupting the original buffer, e.g. if the driver
knows the device will do partial writes and is relying on the unwritten
data to be preserved.

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/all/ZN5elYQ5szQndN8n@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 01637677736f..fd5dacd0628d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1296,11 +1296,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
 	tlb_addr = slot_addr(pool->start, index) + offset;
 	/*
-	 * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
-	 * to the tlb buffer, if we knew for sure the device will
-	 * overwrite the entire current content. But we don't. Thus
-	 * unconditional bounce may prevent leaking swiotlb content (i.e.
-	 * kernel memory) to user-space.
+	 * When the device is writing memory, i.e. dir == DMA_FROM_DEVICE, copy
+	 * the original buffer to the TLB buffer before initiating DMA in order
+	 * to preserve the original's data if the device does a partial write,
+	 * i.e. if the device doesn't overwrite the entire buffer.  Preserving
+	 * the original data, even if it's garbage, is necessary to match
+	 * hardware behavior.  Use of swiotlb is supposed to be transparent,
+	 * i.e. swiotlb must not corrupt memory by clobbering unwritten bytes.
 	 */
 	swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
 	return tlb_addr;
-- 
cgit v1.2.3


From 36d91e851598a9ea523ad4681dd11fa661d59695 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 19 Oct 2023 11:25:38 -0400
Subject: dma-debug: Fix a typo in a debugging eye-catcher

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 06366acd27b0..3de494375b7b 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -139,7 +139,7 @@ static const char *const maperr2str[] = {
 
 static const char *type2name[] = {
 	[dma_debug_single] = "single",
-	[dma_debug_sg] = "scather-gather",
+	[dma_debug_sg] = "scatter-gather",
 	[dma_debug_coherent] = "coherent",
 	[dma_debug_resource] = "resource",
 };
-- 
cgit v1.2.3


From 545db7e21e64766e6b7cb987fbfd3e79419726ce Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 21 Oct 2023 16:42:41 +0200
Subject: tracing/histograms: Simplify last_cmd_set()

Turn a kzalloc()+strcpy()+strncat() into an equivalent and less verbose
kasprintf().

Link: https://lore.kernel.org/linux-trace-kernel/30b6fb04dadc10a03cc1ad08f5d8a93ef623a167.1697899346.git.christophe.jaillet@wanadoo.fr

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Mukesh ojha <quic_mojha@quicinc.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index d06938ae0717..1abc07fba1b9 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -774,23 +774,16 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
 {
 	const char *system = NULL, *name = NULL;
 	struct trace_event_call *call;
-	int len;
 
 	if (!str)
 		return;
 
-	/* sizeof() contains the nul byte */
-	len = sizeof(HIST_PREFIX) + strlen(str);
 	kfree(last_cmd);
-	last_cmd = kzalloc(len, GFP_KERNEL);
+
+	last_cmd = kasprintf(GFP_KERNEL, HIST_PREFIX "%s", str);
 	if (!last_cmd)
 		return;
 
-	strcpy(last_cmd, HIST_PREFIX);
-	/* Again, sizeof() contains the nul byte */
-	len -= sizeof(HIST_PREFIX);
-	strncat(last_cmd, str, len);
-
 	if (file) {
 		call = file->event_call;
 		system = call->class->system;
-- 
cgit v1.2.3


From b63dadd6f97522513fe9497b5fde84a154e39a0b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 23 Oct 2023 20:50:15 +0200
Subject: bpf, tcx: Get rid of tcx_link_const

Small clean up to get rid of the extra tcx_link_const() and only retain
the tcx_link().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20231023185015.21152-1-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/tcx.h | 7 +------
 kernel/bpf/tcx.c  | 4 ++--
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/net/tcx.h b/include/net/tcx.h
index 264f147953ba..04be9377785d 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -38,16 +38,11 @@ static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
 	return container_of(bundle, struct tcx_entry, bundle);
 }
 
-static inline struct tcx_link *tcx_link(struct bpf_link *link)
+static inline struct tcx_link *tcx_link(const struct bpf_link *link)
 {
 	return container_of(link, struct tcx_link, link);
 }
 
-static inline const struct tcx_link *tcx_link_const(const struct bpf_link *link)
-{
-	return tcx_link((struct bpf_link *)link);
-}
-
 void tcx_inc(void);
 void tcx_dec(void);
 
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 1338a13a8b64..2e4885e7781f 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -250,7 +250,7 @@ static void tcx_link_dealloc(struct bpf_link *link)
 
 static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
 {
-	const struct tcx_link *tcx = tcx_link_const(link);
+	const struct tcx_link *tcx = tcx_link(link);
 	u32 ifindex = 0;
 
 	rtnl_lock();
@@ -267,7 +267,7 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
 static int tcx_link_fill_info(const struct bpf_link *link,
 			      struct bpf_link_info *info)
 {
-	const struct tcx_link *tcx = tcx_link_const(link);
+	const struct tcx_link *tcx = tcx_link(link);
 	u32 ifindex = 0;
 
 	rtnl_lock();
-- 
cgit v1.2.3


From 4758560fa268cecfa1144f015aa9f2525d164b7e Mon Sep 17 00:00:00 2001
From: "wuqiang.matt" <wuqiang.matt@bytedance.com>
Date: Mon, 23 Oct 2023 19:22:45 +0800
Subject: kprobes: unused header files removed

As kernel test robot reported, lib/test_objpool.c (trace:probes/for-next)
has linux/version.h included, but version.h is not used at all. Then more
unused headers are found in test_objpool.c and rethook.c, and all of them
should be removed.

Link: https://lore.kernel.org/all/20231023112245.6112-1-wuqiang.matt@bytedance.com/

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310191512.vvypKU5Z-lkp@intel.com/
Signed-off-by: wuqiang.matt <wuqiang.matt@bytedance.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/rethook.c | 2 --
 lib/test_objpool.c     | 6 ------
 2 files changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 13c8e6773892..6fd7d4ecbbc6 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -8,8 +8,6 @@
 #include <linux/preempt.h>
 #include <linux/rethook.h>
 #include <linux/slab.h>
-#include <linux/sort.h>
-#include <linux/smp.h>
 
 /* Return hook list (shadow stack by list) */
 
diff --git a/lib/test_objpool.c b/lib/test_objpool.c
index 98b5b37b6eea..a94078402138 100644
--- a/lib/test_objpool.c
+++ b/lib/test_objpool.c
@@ -6,21 +6,15 @@
  * Copyright: wuqiang.matt@bytedance.com
  */
 
-#include <linux/version.h>
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
-#include <linux/cpu.h>
-#include <linux/cpuset.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
 #include <linux/hrtimer.h>
-#include <linux/interrupt.h>
 #include <linux/objpool.h>
 
 #define OT_NR_MAX_BULK (16)
-- 
cgit v1.2.3


From 3c4e420cb6536026ddd50eaaff5f30e4f144200d Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:11 +0300
Subject: bpf: move explored_state() closer to the beginning of verifier.c

Subsequent patches would make use of explored_state() function.
Move it up to avoid adding unnecessary prototype.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9bc5d4a25a1..e6232b5d3964 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1817,6 +1817,19 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	return 0;
 }
 
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+	return env->prog->len;
+}
+
+static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+{
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_func_state *state = cur->frame[cur->curframe];
+
+	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
+
 static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
 	while (st) {
@@ -15020,21 +15033,6 @@ enum {
 	BRANCH = 2,
 };
 
-static u32 state_htab_size(struct bpf_verifier_env *env)
-{
-	return env->prog->len;
-}
-
-static struct bpf_verifier_state_list **explored_state(
-					struct bpf_verifier_env *env,
-					int idx)
-{
-	struct bpf_verifier_state *cur = env->cur_state;
-	struct bpf_func_state *state = cur->frame[cur->curframe];
-
-	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
-}
-
 static void mark_prune_point(struct bpf_verifier_env *env, int idx)
 {
 	env->insn_aux_data[idx].prune_point = true;
-- 
cgit v1.2.3


From 4c97259abc9bc8df7712f76f58ce385581876857 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:12 +0300
Subject: bpf: extract same_callsites() as utility function

Extract same_callsites() from clean_live_states() as a utility function.
This function would be used by the next patch in the set.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-3-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e6232b5d3964..366029e484a0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1830,6 +1830,20 @@ static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *
 	return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
 }
 
+static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_state *b)
+{
+	int fr;
+
+	if (a->curframe != b->curframe)
+		return false;
+
+	for (fr = a->curframe; fr >= 0; fr--)
+		if (a->frame[fr]->callsite != b->frame[fr]->callsite)
+			return false;
+
+	return true;
+}
+
 static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
 	while (st) {
@@ -15909,18 +15923,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 			      struct bpf_verifier_state *cur)
 {
 	struct bpf_verifier_state_list *sl;
-	int i;
 
 	sl = *explored_state(env, insn);
 	while (sl) {
 		if (sl->state.branches)
 			goto next;
 		if (sl->state.insn_idx != insn ||
-		    sl->state.curframe != cur->curframe)
+		    !same_callsites(&sl->state, cur))
 			goto next;
-		for (i = 0; i <= cur->curframe; i++)
-			if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
-				goto next;
 		clean_verifier_state(env, &sl->state);
 next:
 		sl = sl->next;
-- 
cgit v1.2.3


From 2793a8b015f7f1caadb9bce9c63dc659f7522676 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:13 +0300
Subject: bpf: exact states comparison for iterator convergence checks

Convergence for open coded iterators is computed in is_state_visited()
by examining states with branches count > 1 and using states_equal().
states_equal() computes sub-state relation using read and precision marks.
Read and precision marks are propagated from children states,
thus are not guaranteed to be complete inside a loop when branches
count > 1. This could be demonstrated using the following unsafe program:

     1. r7 = -16
     2. r6 = bpf_get_prandom_u32()
     3. while (bpf_iter_num_next(&fp[-8])) {
     4.   if (r6 != 42) {
     5.     r7 = -32
     6.     r6 = bpf_get_prandom_u32()
     7.     continue
     8.   }
     9.   r0 = r10
    10.   r0 += r7
    11.   r8 = *(u64 *)(r0 + 0)
    12.   r6 = bpf_get_prandom_u32()
    13. }

Here verifier would first visit path 1-3, create a checkpoint at 3
with r7=-16, continue to 4-7,3 with r7=-32.

Because instructions at 9-12 had not been visitied yet existing
checkpoint at 3 does not have read or precision mark for r7.
Thus states_equal() would return true and verifier would discard
current state, thus unsafe memory access at 11 would not be caught.

This commit fixes this loophole by introducing exact state comparisons
for iterator convergence logic:
- registers are compared using regs_exact() regardless of read or
  precision marks;
- stack slots have to have identical type.

Unfortunately, this is too strict even for simple programs like below:

    i = 0;
    while(iter_next(&it))
      i++;

At each iteration step i++ would produce a new distinct state and
eventually instruction processing limit would be reached.

To avoid such behavior speculatively forget (widen) range for
imprecise scalar registers, if those registers were not precise at the
end of the previous iteration and do not match exactly.

This a conservative heuristic that allows to verify wide range of
programs, however it precludes verification of programs that conjure
an imprecise value on the first loop iteration and use it as precise
on the second.

Test case iter_task_vma_for_each() presents one of such cases:

        unsigned int seen = 0;
        ...
        bpf_for_each(task_vma, vma, task, 0) {
                if (seen >= 1000)
                        break;
                ...
                seen++;
        }

Here clang generates the following code:

<LBB0_4>:
      24:       r8 = r6                          ; stash current value of
                ... body ...                       'seen'
      29:       r1 = r10
      30:       r1 += -0x8
      31:       call bpf_iter_task_vma_next
      32:       r6 += 0x1                        ; seen++;
      33:       if r0 == 0x0 goto +0x2 <LBB0_6>  ; exit on next() == NULL
      34:       r7 += 0x10
      35:       if r8 < 0x3e7 goto -0xc <LBB0_4> ; loop on seen < 1000

<LBB0_6>:
      ... exit ...

Note that counter in r6 is copied to r8 and then incremented,
conditional jump is done using r8. Because of this precision mark for
r6 lags one state behind of precision mark on r8 and widening logic
kicks in.

Adding barrier_var(seen) after conditional is sufficient to force
clang use the same register for both counting and conditional jump.

This issue was discussed in the thread [1] which was started by
Andrew Werner <awerner32@gmail.com> demonstrating a similar bug
in callback functions handling. The callbacks would be addressed
in a followup patch.

[1] https://lore.kernel.org/bpf/97a90da09404c65c8e810cf83c94ac703705dc0e.camel@gmail.com/

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-4-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |   1 +
 kernel/bpf/verifier.c                              | 218 ++++++++++++++++++---
 tools/testing/selftests/bpf/progs/iters_task_vma.c |   1 +
 3 files changed, 189 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e67cd45a85be..38b788228594 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -384,6 +384,7 @@ struct bpf_verifier_state {
 	 */
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
+	u32 dfs_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 366029e484a0..6573e8c77329 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1802,6 +1802,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
+	dst_state->dfs_depth = src->dfs_depth;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -7723,6 +7724,81 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
 	return 0;
 }
 
+/* Look for a previous loop entry at insn_idx: nearest parent state
+ * stopped at insn_idx with callsites matching those in cur->frame.
+ */
+static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
+						  struct bpf_verifier_state *cur,
+						  int insn_idx)
+{
+	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state *st;
+
+	/* Explored states are pushed in stack order, most recent states come first */
+	sl = *explored_state(env, insn_idx);
+	for (; sl; sl = sl->next) {
+		/* If st->branches != 0 state is a part of current DFS verification path,
+		 * hence cur & st for a loop.
+		 */
+		st = &sl->state;
+		if (st->insn_idx == insn_idx && st->branches && same_callsites(st, cur) &&
+		    st->dfs_depth < cur->dfs_depth)
+			return st;
+	}
+
+	return NULL;
+}
+
+static void reset_idmap_scratch(struct bpf_verifier_env *env);
+static bool regs_exact(const struct bpf_reg_state *rold,
+		       const struct bpf_reg_state *rcur,
+		       struct bpf_idmap *idmap);
+
+static void maybe_widen_reg(struct bpf_verifier_env *env,
+			    struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
+			    struct bpf_idmap *idmap)
+{
+	if (rold->type != SCALAR_VALUE)
+		return;
+	if (rold->type != rcur->type)
+		return;
+	if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap))
+		return;
+	__mark_reg_unknown(env, rcur);
+}
+
+static int widen_imprecise_scalars(struct bpf_verifier_env *env,
+				   struct bpf_verifier_state *old,
+				   struct bpf_verifier_state *cur)
+{
+	struct bpf_func_state *fold, *fcur;
+	int i, fr;
+
+	reset_idmap_scratch(env);
+	for (fr = old->curframe; fr >= 0; fr--) {
+		fold = old->frame[fr];
+		fcur = cur->frame[fr];
+
+		for (i = 0; i < MAX_BPF_REG; i++)
+			maybe_widen_reg(env,
+					&fold->regs[i],
+					&fcur->regs[i],
+					&env->idmap_scratch);
+
+		for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+			if (!is_spilled_reg(&fold->stack[i]) ||
+			    !is_spilled_reg(&fcur->stack[i]))
+				continue;
+
+			maybe_widen_reg(env,
+					&fold->stack[i].spilled_ptr,
+					&fcur->stack[i].spilled_ptr,
+					&env->idmap_scratch);
+		}
+	}
+	return 0;
+}
+
 /* process_iter_next_call() is called when verifier gets to iterator's next
  * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
  * to it as just "iter_next()" in comments below.
@@ -7764,25 +7840,47 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id
  * is some statically known limit on number of iterations (e.g., if there is
  * an explicit `if n > 100 then break;` statement somewhere in the loop).
  *
- * One very subtle but very important aspect is that we *always* simulate NULL
- * condition first (as the current state) before we simulate non-NULL case.
- * This has to do with intricacies of scalar precision tracking. By simulating
- * "exit condition" of iter_next() returning NULL first, we make sure all the
- * relevant precision marks *that will be set **after** we exit iterator loop*
- * are propagated backwards to common parent state of NULL and non-NULL
- * branches. Thanks to that, state equivalence checks done later in forked
- * state, when reaching iter_next() for ACTIVE iterator, can assume that
- * precision marks are finalized and won't change. Because simulating another
- * ACTIVE iterator iteration won't change them (because given same input
- * states we'll end up with exactly same output states which we are currently
- * comparing; and verification after the loop already propagated back what
- * needs to be **additionally** tracked as precise). It's subtle, grok
- * precision tracking for more intuitive understanding.
+ * Iteration convergence logic in is_state_visited() relies on exact
+ * states comparison, which ignores read and precision marks.
+ * This is necessary because read and precision marks are not finalized
+ * while in the loop. Exact comparison might preclude convergence for
+ * simple programs like below:
+ *
+ *     i = 0;
+ *     while(iter_next(&it))
+ *       i++;
+ *
+ * At each iteration step i++ would produce a new distinct state and
+ * eventually instruction processing limit would be reached.
+ *
+ * To avoid such behavior speculatively forget (widen) range for
+ * imprecise scalar registers, if those registers were not precise at the
+ * end of the previous iteration and do not match exactly.
+ *
+ * This is a conservative heuristic that allows to verify wide range of programs,
+ * however it precludes verification of programs that conjure an
+ * imprecise value on the first loop iteration and use it as precise on a second.
+ * For example, the following safe program would fail to verify:
+ *
+ *     struct bpf_num_iter it;
+ *     int arr[10];
+ *     int i = 0, a = 0;
+ *     bpf_iter_num_new(&it, 0, 10);
+ *     while (bpf_iter_num_next(&it)) {
+ *       if (a == 0) {
+ *         a = 1;
+ *         i = 7; // Because i changed verifier would forget
+ *                // it's range on second loop entry.
+ *       } else {
+ *         arr[i] = 42; // This would fail to verify.
+ *       }
+ *     }
+ *     bpf_iter_num_destroy(&it);
  */
 static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 				  struct bpf_kfunc_call_arg_meta *meta)
 {
-	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st;
 	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
 	struct bpf_reg_state *cur_iter, *queued_iter;
 	int iter_frameno = meta->iter.frameno;
@@ -7800,6 +7898,19 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 	}
 
 	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* Because iter_next() call is a checkpoint is_state_visitied()
+		 * should guarantee parent state with same call sites and insn_idx.
+		 */
+		if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
+		    !same_callsites(cur_st->parent, cur_st)) {
+			verbose(env, "bug: bad parent state for iter next call");
+			return -EFAULT;
+		}
+		/* Note cur_st->parent in the call below, it is necessary to skip
+		 * checkpoint created for cur_st by is_state_visited()
+		 * right at this instruction.
+		 */
+		prev_st = find_prev_entry(env, cur_st->parent, insn_idx);
 		/* branch out active iter state */
 		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
 		if (!queued_st)
@@ -7808,6 +7919,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
 		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
 		queued_iter->iter.depth++;
+		if (prev_st)
+			widen_imprecise_scalars(env, prev_st, queued_st);
 
 		queued_fr = queued_st->frame[queued_st->curframe];
 		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
@@ -15948,8 +16061,11 @@ static bool regs_exact(const struct bpf_reg_state *rold,
 
 /* Returns true if (rold safe implies rcur safe) */
 static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
-		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap)
+		    struct bpf_reg_state *rcur, struct bpf_idmap *idmap, bool exact)
 {
+	if (exact)
+		return regs_exact(rold, rcur, idmap);
+
 	if (!(rold->live & REG_LIVE_READ))
 		/* explored state didn't use this */
 		return true;
@@ -16066,7 +16182,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 }
 
 static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
-		      struct bpf_func_state *cur, struct bpf_idmap *idmap)
+		      struct bpf_func_state *cur, struct bpf_idmap *idmap, bool exact)
 {
 	int i, spi;
 
@@ -16079,7 +16195,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 
 		spi = i / BPF_REG_SIZE;
 
-		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
+		if (exact &&
+		    old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			return false;
+
+		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) && !exact) {
 			i += BPF_REG_SIZE - 1;
 			/* explored state didn't use this */
 			continue;
@@ -16129,7 +16250,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			 * return false to continue verification of this path
 			 */
 			if (!regsafe(env, &old->stack[spi].spilled_ptr,
-				     &cur->stack[spi].spilled_ptr, idmap))
+				     &cur->stack[spi].spilled_ptr, idmap, exact))
 				return false;
 			break;
 		case STACK_DYNPTR:
@@ -16211,16 +16332,16 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
  * the current state will reach 'bpf_exit' instruction safely
  */
 static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
-			      struct bpf_func_state *cur)
+			      struct bpf_func_state *cur, bool exact)
 {
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (!regsafe(env, &old->regs[i], &cur->regs[i],
-			     &env->idmap_scratch))
+			     &env->idmap_scratch, exact))
 			return false;
 
-	if (!stacksafe(env, old, cur, &env->idmap_scratch))
+	if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
 		return false;
 
 	if (!refsafe(old, cur, &env->idmap_scratch))
@@ -16229,17 +16350,23 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 	return true;
 }
 
+static void reset_idmap_scratch(struct bpf_verifier_env *env)
+{
+	env->idmap_scratch.tmp_id_gen = env->id_gen;
+	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+}
+
 static bool states_equal(struct bpf_verifier_env *env,
 			 struct bpf_verifier_state *old,
-			 struct bpf_verifier_state *cur)
+			 struct bpf_verifier_state *cur,
+			 bool exact)
 {
 	int i;
 
 	if (old->curframe != cur->curframe)
 		return false;
 
-	env->idmap_scratch.tmp_id_gen = env->id_gen;
-	memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map));
+	reset_idmap_scratch(env);
 
 	/* Verification state from speculative execution simulation
 	 * must never prune a non-speculative execution one.
@@ -16269,7 +16396,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 	for (i = 0; i <= old->curframe; i++) {
 		if (old->frame[i]->callsite != cur->frame[i]->callsite)
 			return false;
-		if (!func_states_equal(env, old->frame[i], cur->frame[i]))
+		if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
 			return false;
 	}
 	return true;
@@ -16524,7 +16651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
-	int i, j, err, states_cnt = 0;
+	int i, j, n, err, states_cnt = 0;
 	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
 	bool add_new_state = force_new_state;
 
@@ -16579,9 +16706,33 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * It's safe to assume that iterator loop will finish, taking into
 			 * account iter_next() contract of eventually returning
 			 * sticky NULL result.
+			 *
+			 * Note, that states have to be compared exactly in this case because
+			 * read and precision marks might not be finalized inside the loop.
+			 * E.g. as in the program below:
+			 *
+			 *     1. r7 = -16
+			 *     2. r6 = bpf_get_prandom_u32()
+			 *     3. while (bpf_iter_num_next(&fp[-8])) {
+			 *     4.   if (r6 != 42) {
+			 *     5.     r7 = -32
+			 *     6.     r6 = bpf_get_prandom_u32()
+			 *     7.     continue
+			 *     8.   }
+			 *     9.   r0 = r10
+			 *    10.   r0 += r7
+			 *    11.   r8 = *(u64 *)(r0 + 0)
+			 *    12.   r6 = bpf_get_prandom_u32()
+			 *    13. }
+			 *
+			 * Here verifier would first visit path 1-3, create a checkpoint at 3
+			 * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does
+			 * not have read or precision mark for r7 yet, thus inexact states
+			 * comparison would discard current state with r7=-32
+			 * => unsafe memory access at 11 would not be caught.
 			 */
 			if (is_iter_next_insn(env, insn_idx)) {
-				if (states_equal(env, &sl->state, cur)) {
+				if (states_equal(env, &sl->state, cur, true)) {
 					struct bpf_func_state *cur_frame;
 					struct bpf_reg_state *iter_state, *iter_reg;
 					int spi;
@@ -16604,7 +16755,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			}
 			/* attempt to detect infinite loop to avoid unnecessary doomed work */
 			if (states_maybe_looping(&sl->state, cur) &&
-			    states_equal(env, &sl->state, cur) &&
+			    states_equal(env, &sl->state, cur, false) &&
 			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
@@ -16629,7 +16780,7 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		if (states_equal(env, &sl->state, cur)) {
+		if (states_equal(env, &sl->state, cur, false)) {
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -16668,8 +16819,12 @@ miss:
 		 * to keep checking from state equivalence point of view.
 		 * Higher numbers increase max_states_per_insn and verification time,
 		 * but do not meaningfully decrease insn_processed.
+		 * 'n' controls how many times state could miss before eviction.
+		 * Use bigger 'n' for checkpoints because evicting checkpoint states
+		 * too early would hinder iterator convergence.
 		 */
-		if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+		n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3;
+		if (sl->miss_cnt > sl->hit_cnt * n + n) {
 			/* the state is unlikely to be useful. Remove it to
 			 * speed up verification
 			 */
@@ -16743,6 +16898,7 @@ next:
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
+	cur->dfs_depth = new->dfs_depth + 1;
 	clear_jmp_history(cur);
 	new_sl->next = *explored_state(env, insn_idx);
 	*explored_state(env, insn_idx) = new_sl;
diff --git a/tools/testing/selftests/bpf/progs/iters_task_vma.c b/tools/testing/selftests/bpf/progs/iters_task_vma.c
index 44edecfdfaee..e085a51d153e 100644
--- a/tools/testing/selftests/bpf/progs/iters_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/iters_task_vma.c
@@ -30,6 +30,7 @@ int iter_task_vma_for_each(const void *ctx)
 	bpf_for_each(task_vma, vma, task, 0) {
 		if (seen >= 1000)
 			break;
+		barrier_var(seen);
 
 		vm_ranges[seen].vm_start = vma->vm_start;
 		vm_ranges[seen].vm_end = vma->vm_end;
-- 
cgit v1.2.3


From 2a0992829ea3864939d917a5c7b48be6629c6217 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:15 +0300
Subject: bpf: correct loop detection for iterators convergence

It turns out that .branches > 0 in is_state_visited() is not a
sufficient condition to identify if two verifier states form a loop
when iterators convergence is computed. This commit adds logic to
distinguish situations like below:

 (I)            initial       (II)            initial
                  |                             |
                  V                             V
     .---------> hdr                           ..
     |            |                             |
     |            V                             V
     |    .------...                    .------..
     |    |       |                     |       |
     |    V       V                     V       V
     |   ...     ...               .-> hdr     ..
     |    |       |                |    |       |
     |    V       V                |    V       V
     |   succ <- cur               |   succ <- cur
     |    |                        |    |
     |    V                        |    V
     |   ...                       |   ...
     |    |                        |    |
     '----'                        '----'

For both (I) and (II) successor 'succ' of the current state 'cur' was
previously explored and has branches count at 0. However, loop entry
'hdr' corresponding to 'succ' might be a part of current DFS path.
If that is the case 'succ' and 'cur' are members of the same loop
and have to be compared exactly.

Co-developed-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Co-developed-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Reviewed-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  15 ++++
 kernel/bpf/verifier.c        | 207 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 218 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 38b788228594..24213a99cc79 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -373,10 +373,25 @@ struct bpf_verifier_state {
 	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
+	/* If this state was ever pointed-to by other state's loop_entry field
+	 * this flag would be set to true. Used to avoid freeing such states
+	 * while they are still in use.
+	 */
+	bool used_as_loop_entry;
 
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
 	u32 last_insn_idx;
+	/* If this state is a part of states loop this field points to some
+	 * parent of this state such that:
+	 * - it is also a member of the same states loop;
+	 * - DFS states traversal starting from initial state visits loop_entry
+	 *   state before this state.
+	 * Used to compute topmost loop entry for state loops.
+	 * State loops might appear because of open coded iterators logic.
+	 * See get_loop_entry() for more information.
+	 */
+	struct bpf_verifier_state *loop_entry;
 	/* jmp history recorded from first to last.
 	 * backtracking is using it to go from last to first.
 	 * For most states jmp_history_cnt is [0-3].
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6573e8c77329..f23fbfe82c59 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1803,6 +1803,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
 	dst_state->dfs_depth = src->dfs_depth;
+	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1845,11 +1846,176 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
 	return true;
 }
 
+/* Open coded iterators allow back-edges in the state graph in order to
+ * check unbounded loops that iterators.
+ *
+ * In is_state_visited() it is necessary to know if explored states are
+ * part of some loops in order to decide whether non-exact states
+ * comparison could be used:
+ * - non-exact states comparison establishes sub-state relation and uses
+ *   read and precision marks to do so, these marks are propagated from
+ *   children states and thus are not guaranteed to be final in a loop;
+ * - exact states comparison just checks if current and explored states
+ *   are identical (and thus form a back-edge).
+ *
+ * Paper "A New Algorithm for Identifying Loops in Decompilation"
+ * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
+ * algorithm for loop structure detection and gives an overview of
+ * relevant terminology. It also has helpful illustrations.
+ *
+ * [1] https://api.semanticscholar.org/CorpusID:15784067
+ *
+ * We use a similar algorithm but because loop nested structure is
+ * irrelevant for verifier ours is significantly simpler and resembles
+ * strongly connected components algorithm from Sedgewick's textbook.
+ *
+ * Define topmost loop entry as a first node of the loop traversed in a
+ * depth first search starting from initial state. The goal of the loop
+ * tracking algorithm is to associate topmost loop entries with states
+ * derived from these entries.
+ *
+ * For each step in the DFS states traversal algorithm needs to identify
+ * the following situations:
+ *
+ *          initial                     initial                   initial
+ *            |                           |                         |
+ *            V                           V                         V
+ *           ...                         ...           .---------> hdr
+ *            |                           |            |            |
+ *            V                           V            |            V
+ *           cur                     .-> succ          |    .------...
+ *            |                      |    |            |    |       |
+ *            V                      |    V            |    V       V
+ *           succ                    '-- cur           |   ...     ...
+ *                                                     |    |       |
+ *                                                     |    V       V
+ *                                                     |   succ <- cur
+ *                                                     |    |
+ *                                                     |    V
+ *                                                     |   ...
+ *                                                     |    |
+ *                                                     '----'
+ *
+ *  (A) successor state of cur   (B) successor state of cur or it's entry
+ *      not yet traversed            are in current DFS path, thus cur and succ
+ *                                   are members of the same outermost loop
+ *
+ *                      initial                  initial
+ *                        |                        |
+ *                        V                        V
+ *                       ...                      ...
+ *                        |                        |
+ *                        V                        V
+ *                .------...               .------...
+ *                |       |                |       |
+ *                V       V                V       V
+ *           .-> hdr     ...              ...     ...
+ *           |    |       |                |       |
+ *           |    V       V                V       V
+ *           |   succ <- cur              succ <- cur
+ *           |    |                        |
+ *           |    V                        V
+ *           |   ...                      ...
+ *           |    |                        |
+ *           '----'                       exit
+ *
+ * (C) successor state of cur is a part of some loop but this loop
+ *     does not include cur or successor state is not in a loop at all.
+ *
+ * Algorithm could be described as the following python code:
+ *
+ *     traversed = set()   # Set of traversed nodes
+ *     entries = {}        # Mapping from node to loop entry
+ *     depths = {}         # Depth level assigned to graph node
+ *     path = set()        # Current DFS path
+ *
+ *     # Find outermost loop entry known for n
+ *     def get_loop_entry(n):
+ *         h = entries.get(n, None)
+ *         while h in entries and entries[h] != h:
+ *             h = entries[h]
+ *         return h
+ *
+ *     # Update n's loop entry if h's outermost entry comes
+ *     # before n's outermost entry in current DFS path.
+ *     def update_loop_entry(n, h):
+ *         n1 = get_loop_entry(n) or n
+ *         h1 = get_loop_entry(h) or h
+ *         if h1 in path and depths[h1] <= depths[n1]:
+ *             entries[n] = h1
+ *
+ *     def dfs(n, depth):
+ *         traversed.add(n)
+ *         path.add(n)
+ *         depths[n] = depth
+ *         for succ in G.successors(n):
+ *             if succ not in traversed:
+ *                 # Case A: explore succ and update cur's loop entry
+ *                 #         only if succ's entry is in current DFS path.
+ *                 dfs(succ, depth + 1)
+ *                 h = get_loop_entry(succ)
+ *                 update_loop_entry(n, h)
+ *             else:
+ *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
+ *                 update_loop_entry(n, succ)
+ *         path.remove(n)
+ *
+ * To adapt this algorithm for use with verifier:
+ * - use st->branch == 0 as a signal that DFS of succ had been finished
+ *   and cur's loop entry has to be updated (case A), handle this in
+ *   update_branch_counts();
+ * - use st->branch > 0 as a signal that st is in the current DFS path;
+ * - handle cases B and C in is_state_visited();
+ * - update topmost loop entry for intermediate states in get_loop_entry().
+ */
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+{
+	struct bpf_verifier_state *topmost = st->loop_entry, *old;
+
+	while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+		topmost = topmost->loop_entry;
+	/* Update loop entries for intermediate states to avoid this
+	 * traversal in future get_loop_entry() calls.
+	 */
+	while (st && st->loop_entry != topmost) {
+		old = st->loop_entry;
+		st->loop_entry = topmost;
+		st = old;
+	}
+	return topmost;
+}
+
+static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+{
+	struct bpf_verifier_state *cur1, *hdr1;
+
+	cur1 = get_loop_entry(cur) ?: cur;
+	hdr1 = get_loop_entry(hdr) ?: hdr;
+	/* The head1->branches check decides between cases B and C in
+	 * comment for get_loop_entry(). If hdr1->branches == 0 then
+	 * head's topmost loop entry is not in current DFS path,
+	 * hence 'cur' and 'hdr' are not in the same loop and there is
+	 * no need to update cur->loop_entry.
+	 */
+	if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+		cur->loop_entry = hdr;
+		hdr->used_as_loop_entry = true;
+	}
+}
+
 static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
 	while (st) {
 		u32 br = --st->branches;
 
+		/* br == 0 signals that DFS exploration for 'st' is finished,
+		 * thus it is necessary to update parent's loop entry if it
+		 * turned out that st is a part of some loop.
+		 * This is a part of 'case A' in get_loop_entry() comment.
+		 */
+		if (br == 0 && st->parent && st->loop_entry)
+			update_loop_entry(st->parent, st->loop_entry);
+
 		/* WARN_ON(br > 1) technically makes sense here,
 		 * but see comment in push_stack(), hence:
 		 */
@@ -16650,10 +16816,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl, **pprev;
-	struct bpf_verifier_state *cur = env->cur_state, *new;
+	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
 	int i, j, n, err, states_cnt = 0;
 	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
 	bool add_new_state = force_new_state;
+	bool force_exact;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -16748,8 +16915,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 					 */
 					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
 					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
-					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
+						update_loop_entry(cur, &sl->state);
 						goto hit;
+					}
 				}
 				goto skip_inf_loop_check;
 			}
@@ -16780,7 +16949,36 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		if (states_equal(env, &sl->state, cur, false)) {
+		/* If sl->state is a part of a loop and this loop's entry is a part of
+		 * current verification path then states have to be compared exactly.
+		 * 'force_exact' is needed to catch the following case:
+		 *
+		 *                initial     Here state 'succ' was processed first,
+		 *                  |         it was eventually tracked to produce a
+		 *                  V         state identical to 'hdr'.
+		 *     .---------> hdr        All branches from 'succ' had been explored
+		 *     |            |         and thus 'succ' has its .branches == 0.
+		 *     |            V
+		 *     |    .------...        Suppose states 'cur' and 'succ' correspond
+		 *     |    |       |         to the same instruction + callsites.
+		 *     |    V       V         In such case it is necessary to check
+		 *     |   ...     ...        if 'succ' and 'cur' are states_equal().
+		 *     |    |       |         If 'succ' and 'cur' are a part of the
+		 *     |    V       V         same loop exact flag has to be set.
+		 *     |   succ <- cur        To check if that is the case, verify
+		 *     |    |                 if loop entry of 'succ' is in current
+		 *     |    V                 DFS path.
+		 *     |   ...
+		 *     |    |
+		 *     '----'
+		 *
+		 * Additional details are in the comment before get_loop_entry().
+		 */
+		loop_entry = get_loop_entry(&sl->state);
+		force_exact = loop_entry && loop_entry->branches > 0;
+		if (states_equal(env, &sl->state, cur, force_exact)) {
+			if (force_exact)
+				update_loop_entry(cur, loop_entry);
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -16829,7 +17027,8 @@ miss:
 			 * speed up verification
 			 */
 			*pprev = sl->next;
-			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+			if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
+			    !sl->state.used_as_loop_entry) {
 				u32 br = sl->state.branches;
 
 				WARN_ONCE(br,
-- 
cgit v1.2.3


From b4d8239534fddc036abe4a0fdbf474d9894d4641 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 24 Oct 2023 03:09:17 +0300
Subject: bpf: print full verifier states on infinite loop detection

Additional logging in is_state_visited(): if infinite loop is detected
print full verifier state for both current and equivalent states.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231024000917.12153-8-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f23fbfe82c59..98f9d0f35931 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -16928,6 +16928,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+				verbose(env, "cur state:");
+				print_verifier_state(env, cur->frame[cur->curframe], true);
+				verbose(env, "old state:");
+				print_verifier_state(env, sl->state.frame[cur->curframe], true);
 				return -EINVAL;
 			}
 			/* if the verifier is processing a loop, avoid adding new state
-- 
cgit v1.2.3


From 3f6074cf467f6c2beb7db0b0d685fb53f25ae9aa Mon Sep 17 00:00:00 2001
From: Li kunyu <kunyu@nfschina.com>
Date: Mon, 23 Oct 2023 14:23:59 +0800
Subject: printk: printk: Remove unnecessary statements＇len = 0;＇
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the following two functions, len has already been assigned a value of
0 when defining the variable, so remove 'len=0;'.

Signed-off-by: Li kunyu <kunyu@nfschina.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20231023062359.130633-1-kunyu@nfschina.com
---
 kernel/printk/printk.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 96fc38cb2e84..1c6ed598b480 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1671,7 +1671,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 
 	prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
 
-	len = 0;
 	prb_for_each_record(seq, prb, seq, &r) {
 		int textlen;
 
@@ -4190,7 +4189,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
 
 	prb_rec_init_rd(&r, &info, buf, size);
 
-	len = 0;
 	prb_for_each_record(seq, prb, seq, &r) {
 		if (r.info->seq >= iter->next_seq)
 			break;
-- 
cgit v1.2.3


From 5ebde09d91707a4a9bec1e3d213e3c12ffde348f Mon Sep 17 00:00:00 2001
From: Hao Jia <jiahao.os@bytedance.com>
Date: Thu, 12 Oct 2023 17:00:03 +0800
Subject: sched/core: Fix RQCF_ACT_SKIP leak

Igor Raits and Bagas Sanjaya report a RQCF_ACT_SKIP leak warning.

This warning may be triggered in the following situations:

    CPU0                                      CPU1

__schedule()
  *rq->clock_update_flags <<= 1;*   unregister_fair_sched_group()
  pick_next_task_fair+0x4a/0x410      destroy_cfs_bandwidth()
    newidle_balance+0x115/0x3e0       for_each_possible_cpu(i) *i=0*
      rq_unpin_lock(this_rq, rf)      __cfsb_csd_unthrottle()
      raw_spin_rq_unlock(this_rq)
                                      rq_lock(*CPU0_rq*, &rf)
                                      rq_clock_start_loop_update()
                                      rq->clock_update_flags & RQCF_ACT_SKIP <--
      raw_spin_rq_lock(this_rq)

The purpose of RQCF_ACT_SKIP is to skip the update rq clock,
but the update is very early in __schedule(), but we clear
RQCF_*_SKIP very late, causing it to span that gap above
and triggering this warning.

In __schedule() we can clear the RQCF_*_SKIP flag immediately
after update_rq_clock() to avoid this RQCF_ACT_SKIP leak warning.
And set rq->clock_update_flags to RQCF_UPDATED to avoid
rq->clock_update_flags < RQCF_ACT_SKIP warning that may be triggered later.

Fixes: ebb83d84e49b ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()")
Closes: https://lore.kernel.org/all/20230913082424.73252-1-jiahao.os@bytedance.com
Reported-by: Igor Raits <igor.raits@gmail.com>
Reported-by: Bagas Sanjaya <bagasdotme@gmail.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/a5dd536d-041a-2ce9-f4b7-64d8d85c86dc@gmail.com
---
 kernel/sched/core.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 264c2eb380d7..dc724f59e495 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5361,8 +5361,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	/* switch_mm_cid() requires the memory barriers above. */
 	switch_mm_cid(rq, prev, next);
 
-	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 	prepare_lock_switch(rq, next, rf);
 
 	/* Here we just switch the register state and the stack. */
@@ -6600,6 +6598,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	/* Promote REQ to ACT */
 	rq->clock_update_flags <<= 1;
 	update_rq_clock(rq);
+	rq->clock_update_flags = RQCF_UPDATED;
 
 	switch_count = &prev->nivcsw;
 
@@ -6679,8 +6678,6 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-
 		rq_unpin_lock(rq, &rf);
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
-- 
cgit v1.2.3


From b95303e0aeaf446b65169dd4142cacdaeb7d4c8b Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Thu, 19 Oct 2023 11:33:21 +0800
Subject: sched: Add cpus_share_resources API

Add cpus_share_resources() API. This is the preparation for the
optimization of select_idle_cpu() on platforms with cluster scheduler
level.

On a machine with clusters cpus_share_resources() will test whether
two cpus are within the same cluster. On a non-cluster machine it
will behaves the same as cpus_share_cache(). So we use "resources"
here for cache resources.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-and-reviewed-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20231019033323.54147-2-yangyicong@huawei.com
---
 include/linux/sched/sd_flags.h |  7 +++++++
 include/linux/sched/topology.h |  8 +++++++-
 kernel/sched/core.c            | 12 ++++++++++++
 kernel/sched/sched.h           |  1 +
 kernel/sched/topology.c        | 13 +++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index fad77b5172e2..a8b28647aafc 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..4c14fe127223 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif	/* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dc724f59e495..5e1fb8a63b2e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3939,6 +3939,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
+/*
+ * Whether CPUs are share cache resources, which means LLC on non-cluster
+ * machines and LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 65cad0e5729e..998f03d02de0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1853,6 +1853,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a63729f87c21..dbb8c328e8ad 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -668,6 +668,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -693,6 +694,17 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+
+	/*
+	 * This assignment should be placed after the sd_llc_id as
+	 * we want this id equals to cluster id on cluster machines
+	 * but equals to LLC id on non-Cluster machines.
+	 */
+	per_cpu(sd_share_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
@@ -1550,6 +1562,7 @@ static struct cpumask		***sched_domains_numa_masks;
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)
-- 
cgit v1.2.3


From 8881e1639f1f899b64e9bccf6cc14d51c1d3c822 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Thu, 19 Oct 2023 11:33:22 +0800
Subject: sched/fair: Scan cluster before scanning LLC in wake-up path

For platforms having clusters like Kunpeng920, CPUs within the same cluster
have lower latency when synchronizing and accessing shared resources like
cache. Thus, this patch tries to find an idle cpu within the cluster of the
target CPU before scanning the whole LLC to gain lower latency. This
will be implemented in 2 steps in select_idle_sibling():
1. When the prev_cpu/recent_used_cpu are good wakeup candidates, use them
   if they're sharing cluster with the target CPU. Otherwise trying to
   scan for an idle CPU in the target's cluster.
2. Scanning the cluster prior to the LLC of the target CPU for an
   idle CPU to wakeup.

Testing has been done on Kunpeng920 by pinning tasks to one numa and two
numa. On Kunpeng920, Each numa has 8 clusters and each cluster has 4 CPUs.

With this patch, We noticed enhancement on tbench and netperf within one
numa or cross two numa on top of tip-sched-core commit
9b46f1abc6d4 ("sched/debug: Print 'tgid' in sched_show_task()")

tbench results (node 0):
            baseline                     patched
  1:        327.2833        372.4623 (   13.80%)
  4:       1320.5933       1479.8833 (   12.06%)
  8:       2638.4867       2921.5267 (   10.73%)
 16:       5282.7133       5891.5633 (   11.53%)
 32:       9810.6733       9877.3400 (    0.68%)
 64:       7408.9367       7447.9900 (    0.53%)
128:       6203.2600       6191.6500 (   -0.19%)
tbench results (node 0-1):
            baseline                     patched
  1:        332.0433        372.7223 (   12.25%)
  4:       1325.4667       1477.6733 (   11.48%)
  8:       2622.9433       2897.9967 (   10.49%)
 16:       5218.6100       5878.2967 (   12.64%)
 32:      10211.7000      11494.4000 (   12.56%)
 64:      13313.7333      16740.0333 (   25.74%)
128:      13959.1000      14533.9000 (    4.12%)

netperf results TCP_RR (node 0):
            baseline                     patched
  1:      76546.5033      90649.9867 (   18.42%)
  4:      77292.4450      90932.7175 (   17.65%)
  8:      77367.7254      90882.3467 (   17.47%)
 16:      78519.9048      90938.8344 (   15.82%)
 32:      72169.5035      72851.6730 (    0.95%)
 64:      25911.2457      25882.2315 (   -0.11%)
128:      10752.6572      10768.6038 (    0.15%)

netperf results TCP_RR (node 0-1):
            baseline                     patched
  1:      76857.6667      90892.2767 (   18.26%)
  4:      78236.6475      90767.3017 (   16.02%)
  8:      77929.6096      90684.1633 (   16.37%)
 16:      77438.5873      90502.5787 (   16.87%)
 32:      74205.6635      88301.5612 (   19.00%)
 64:      69827.8535      71787.6706 (    2.81%)
128:      25281.4366      25771.3023 (    1.94%)

netperf results UDP_RR (node 0):
            baseline                     patched
  1:      96869.8400     110800.8467 (   14.38%)
  4:      97744.9750     109680.5425 (   12.21%)
  8:      98783.9863     110409.9637 (   11.77%)
 16:      99575.0235     110636.2435 (   11.11%)
 32:      95044.7250      97622.8887 (    2.71%)
 64:      32925.2146      32644.4991 (   -0.85%)
128:      12859.2343      12824.0051 (   -0.27%)

netperf results UDP_RR (node 0-1):
            baseline                     patched
  1:      97202.4733     110190.1200 (   13.36%)
  4:      95954.0558     106245.7258 (   10.73%)
  8:      96277.1958     105206.5304 (    9.27%)
 16:      97692.7810     107927.2125 (   10.48%)
 32:      79999.6702     103550.2999 (   29.44%)
 64:      80592.7413      87284.0856 (    8.30%)
128:      27701.5770      29914.5820 (    7.99%)

Note neither Kunpeng920 nor x86 Jacobsville supports SMT, so the SMT branch
in the code has not been tested but it supposed to work.

Chen Yu also noticed this will improve the performance of tbench and
netperf on a 24 CPUs Jacobsville machine, there are 4 CPUs in one
cluster sharing L2 Cache.

[https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net]
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-and-reviewed-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Yicong Yang <yangyicong@hisilicon.com>
Link: https://lkml.kernel.org/r/20231019033323.54147-3-yangyicong@huawei.com
---
 kernel/sched/fair.c     | 40 ++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h    |  1 +
 kernel/sched/topology.c | 12 ++++++++++++
 3 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa2091a08034..c47b38eded19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7259,6 +7259,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		}
 	}
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_group *sg = sd->groups;
+
+		if (sg->flags & SD_CLUSTER) {
+			for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+
+				if (has_idle_core) {
+					i = select_idle_core(p, cpu, cpus, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
+				} else {
+					if (--nr <= 0)
+						return -1;
+					idle_cpu = __select_idle_cpu(cpu, p);
+					if ((unsigned int)idle_cpu < nr_cpumask_bits)
+						return idle_cpu;
+				}
+			}
+			cpumask_andnot(cpus, cpus, sched_group_span(sg));
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7266,7 +7290,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 				return i;
 
 		} else {
-			if (!--nr)
+			if (--nr <= 0)
 				return -1;
 			idle_cpu = __select_idle_cpu(cpu, p);
 			if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7395,8 +7419,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-	    asym_fits_cpu(task_util, util_min, util_max, prev))
-		return prev;
+	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+		if (!static_branch_unlikely(&sched_cluster_active) ||
+		    cpus_share_resources(prev, target))
+			return prev;
+	}
 
 	/*
 	 * Allow a per-cpu kthread to stack with the wakee if the
@@ -7423,7 +7451,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-		return recent_used_cpu;
+
+		if (!static_branch_unlikely(&sched_cluster_active) ||
+		    cpus_share_resources(recent_used_cpu, target))
+			return recent_used_cpu;
+
 	}
 
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 998f03d02de0..ef4fe7bcf740 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
 
 static __always_inline bool sched_asym_cpucap_active(void)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dbb8c328e8ad..10d1391e7416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
 
 		cpu_attach_domain(sd, d.rd, i);
+
+		if (lowest_flag_domain(i, SD_CLUSTER))
+			has_cluster = true;
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
+
 	if (rq && sched_debug_verbose) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
 
+	if (static_branch_unlikely(&sched_cluster_active))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
+
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
-- 
cgit v1.2.3


From 22165f61d0c4092adf40f967c899e5d8b8a0d703 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Thu, 19 Oct 2023 11:33:23 +0800
Subject: sched/fair: Use candidate prev/recent_used CPU if scanning failed for
 cluster wakeup

Chen Yu reports a hackbench regression of cluster wakeup when
hackbench threads equal to the CPU number [1]. Analysis shows
it's because we wake up more on the target CPU even if the
prev_cpu is a good wakeup candidate and leads to the decrease
of the CPU utilization.

Generally if the task's prev_cpu is idle we'll wake up the task
on it without scanning. On cluster machines we'll try to wake up
the task in the same cluster of the target for better cache
affinity, so if the prev_cpu is idle but not sharing the same
cluster with the target we'll still try to find an idle CPU within
the cluster. This will improve the performance at low loads on
cluster machines. But in the issue above, if the prev_cpu is idle
but not in the cluster with the target CPU, we'll try to scan an
idle one in the cluster. But since the system is busy, we're
likely to fail the scanning and use target instead, even if
the prev_cpu is idle. Then leads to the regression.

This patch solves this in 2 steps:
o record the prev_cpu/recent_used_cpu if they're good wakeup
  candidates but not sharing the cluster with the target.
o on scanning failure use the prev_cpu/recent_used_cpu if
  they're recorded as idle

[1] https://lore.kernel.org/all/ZGzDLuVaHR1PAYDt@chenyu5-mobl1/

Closes: https://lore.kernel.org/all/ZGsLy83wPIpamy6x@chenyu5-mobl1/
Reported-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Tested-and-reviewed-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20231019033323.54147-4-yangyicong@huawei.com
---
 kernel/sched/fair.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c47b38eded19..523b5aee2d6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,7 +7392,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	bool has_idle_core = false;
 	struct sched_domain *sd;
 	unsigned long task_util, util_min, util_max;
-	int i, recent_used_cpu;
+	int i, recent_used_cpu, prev_aff = -1;
 
 	/*
 	 * On asymmetric system, update task utilization because we will check
@@ -7424,6 +7424,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		if (!static_branch_unlikely(&sched_cluster_active) ||
 		    cpus_share_resources(prev, target))
 			return prev;
+
+		prev_aff = prev;
 	}
 
 	/*
@@ -7456,6 +7458,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		    cpus_share_resources(recent_used_cpu, target))
 			return recent_used_cpu;
 
+	} else {
+		recent_used_cpu = -1;
 	}
 
 	/*
@@ -7496,6 +7500,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	/*
+	 * For cluster machines which have lower sharing cache like L2 or
+	 * LLC Tag, we tend to find an idle CPU in the target's cluster
+	 * first. But prev_cpu or recent_used_cpu may also be a good candidate,
+	 * use them if possible when no idle CPU found in select_idle_cpu().
+	 */
+	if ((unsigned int)prev_aff < nr_cpumask_bits)
+		return prev_aff;
+	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+		return recent_used_cpu;
+
 	return target;
 }
 
-- 
cgit v1.2.3


From 984ffb6a4366752c949f7b39640aecdce222607f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Oct 2023 12:35:33 +0200
Subject: sched/fair: Remove SIS_PROP

SIS_UTIL seems to work well, lets remove the old thing.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20231020134337.GD33965@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h |  2 --
 kernel/sched/core.c            |  5 -----
 kernel/sched/fair.c            | 48 ------------------------------------------
 kernel/sched/features.h        |  1 -
 kernel/sched/sched.h           |  3 ---
 5 files changed, 59 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 4c14fe127223..de545ba85218 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -109,8 +109,6 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e1fb8a63b2e..7a0c16115b79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3792,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		if (rq->avg_idle > max)
 			rq->avg_idle = max;
 
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle / 2;
-
 		rq->idle_stamp = 0;
 	}
 #endif
@@ -9953,8 +9950,6 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 523b5aee2d6a..8767988242ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7209,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct sched_domain_shared *sd_share;
-	struct rq *this_rq = this_rq();
-	int this = smp_processor_id();
-	struct sched_domain *this_sd = NULL;
-	u64 time = 0;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-	if (sched_feat(SIS_PROP) && !has_idle_core) {
-		u64 avg_cost, avg_idle, span_avg;
-		unsigned long now = jiffies;
-
-		this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-		if (!this_sd)
-			return -1;
-
-		/*
-		 * If we're busy, the assumption that the last idle period
-		 * predicts the future is flawed; age away the remaining
-		 * predicted idle time.
-		 */
-		if (unlikely(this_rq->wake_stamp < now)) {
-			while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
-				this_rq->wake_stamp++;
-				this_rq->wake_avg_idle >>= 1;
-			}
-		}
-
-		avg_idle = this_rq->wake_avg_idle;
-		avg_cost = this_sd->avg_scan_cost + 1;
-
-		span_avg = sd->span_weight * avg_idle;
-		if (span_avg > 4*avg_cost)
-			nr = div_u64(span_avg, avg_cost);
-		else
-			nr = 4;
-
-		time = cpu_clock(this);
-	}
-
 	if (sched_feat(SIS_UTIL)) {
 		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
 		if (sd_share) {
@@ -7301,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	if (has_idle_core)
 		set_idle_cores(target, false);
 
-	if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
-		time = cpu_clock(this) - time;
-
-		/*
-		 * Account for the scan cost of wakeups against the average
-		 * idle time.
-		 */
-		this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
-		update_avg(&this_sd->avg_scan_cost, time);
-	}
-
 	return idle_cpu;
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index f770168230ae..a3ddf84de430 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
-SCHED_FEAT(SIS_PROP, false)
 SCHED_FEAT(SIS_UTIL, true)
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef4fe7bcf740..2e5a95486a42 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1059,9 +1059,6 @@ struct rq {
 	u64			idle_stamp;
 	u64			avg_idle;
 
-	unsigned long		wake_stamp;
-	u64			wake_avg_idle;
-
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;
 
-- 
cgit v1.2.3


From a71ef31485bb51b846e8db8b3a35e432cc15afb5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 24 Oct 2023 11:42:21 +0200
Subject: perf/core: Fix potential NULL deref

Smatch is awesome.

Fixes: 32671e3799ca ("perf: Disallow mis-matched inherited group reads")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d0663b9324e7..a2f2a9525d72 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13372,7 +13372,8 @@ static int inherit_group(struct perf_event *parent_event,
 		    !perf_get_aux_event(child_ctr, leader))
 			return -EINVAL;
 	}
-	leader->group_generation = parent_event->group_generation;
+	if (leader)
+		leader->group_generation = parent_event->group_generation;
 	return 0;
 }
 
-- 
cgit v1.2.3


From d35381aa73f7e1e8b25f3ed5283287a64d9ddff5 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 11 Oct 2023 22:57:41 -0700
Subject: bpf: Fix unnecessary -EBUSY from htab_lock_bucket

htab_lock_bucket uses the following logic to avoid recursion:

1. preempt_disable();
2. check percpu counter htab->map_locked[hash] for recursion;
   2.1. if map_lock[hash] is already taken, return -BUSY;
3. raw_spin_lock_irqsave();

However, if an IRQ hits between 2 and 3, BPF programs attached to the IRQ
logic will not able to access the same hash of the hashtab and get -EBUSY.

This -EBUSY is not really necessary. Fix it by disabling IRQ before
checking map_locked:

1. preempt_disable();
2. local_irq_save();
3. check percpu counter htab->map_locked[hash] for recursion;
   3.1. if map_lock[hash] is already taken, return -BUSY;
4. raw_spin_lock().

Similarly, use raw_spin_unlock() and local_irq_restore() in
htab_unlock_bucket().

Fixes: 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked")
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/7a9576222aa40b1c84ad3a9ba3e64011d1a04d41.camel@linux.ibm.com
Link: https://lore.kernel.org/bpf/20231012055741.3375999-1-song@kernel.org
---
 kernel/bpf/hashtab.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a8c7e1c5abfa..fd8d4b0addfc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -155,13 +155,15 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
 	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
 
 	preempt_disable();
+	local_irq_save(flags);
 	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
 		__this_cpu_dec(*(htab->map_locked[hash]));
+		local_irq_restore(flags);
 		preempt_enable();
 		return -EBUSY;
 	}
 
-	raw_spin_lock_irqsave(&b->raw_lock, flags);
+	raw_spin_lock(&b->raw_lock);
 	*pflags = flags;
 
 	return 0;
@@ -172,8 +174,9 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
 				      unsigned long flags)
 {
 	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
-	raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+	raw_spin_unlock(&b->raw_lock);
 	__this_cpu_dec(*(htab->map_locked[hash]));
+	local_irq_restore(flags);
 	preempt_enable();
 }
 
-- 
cgit v1.2.3


From 06646da01458682023321bdc7553b8140e95d077 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 18 Oct 2023 15:28:32 -0700
Subject: bpf: Fold smp_mb__before_atomic() into atomic_set_release()

The bpf_user_ringbuf_drain() BPF_CALL function uses an atomic_set()
immediately preceded by smp_mb__before_atomic() so as to order storing
of ring-buffer consumer and producer positions prior to the atomic_set()
call's clearing of the ->busy flag, as follows:

        smp_mb__before_atomic();
        atomic_set(&rb->busy, 0);

Although this works given current architectures and implementations, and
given that this only needs to order prior writes against a later write.
However, it does so by accident because the smp_mb__before_atomic()
is only guaranteed to work with read-modify-write atomic operations, and
not at all with things like atomic_set() and atomic_read().

Note especially that smp_mb__before_atomic() will not, repeat *not*,
order the prior write to "a" before the subsequent non-read-modify-write
atomic read from "b", even on strongly ordered systems such as x86:

        WRITE_ONCE(a, 1);
        smp_mb__before_atomic();
        r1 = atomic_read(&b);

Therefore, replace the smp_mb__before_atomic() and atomic_set() with
atomic_set_release() as follows:

        atomic_set_release(&rb->busy, 0);

This is no slower (and sometimes is faster) than the original, and also
provides a formal guarantee of ordering that the original lacks.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/ec86d38e-cfb4-44aa-8fdb-6c925922d93c@paulmck-laptop
---
 kernel/bpf/ringbuf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index f045fde632e5..0ee653a936ea 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -770,8 +770,7 @@ schedule_work_return:
 	/* Prevent the clearing of the busy-bit from being reordered before the
 	 * storing of any rb consumer or producer positions.
 	 */
-	smp_mb__before_atomic();
-	atomic_set(&rb->busy, 0);
+	atomic_set_release(&rb->busy, 0);
 
 	if (flags & BPF_RB_FORCE_WAKEUP)
 		irq_work_queue(&rb->work);
-- 
cgit v1.2.3


From 42d31dd601fa43b9afdf069d1ba410b2306a4c76 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sun, 22 Oct 2023 13:57:37 -0700
Subject: bpf: Improve JEQ/JNE branch taken logic

When determining if an if/else branch will always or never be taken, use
signed range knowledge in addition to currently used unsigned range knowledge.
If either signed or unsigned range suggests that condition is always/never
taken, return corresponding branch_taken verdict.

Current use of unsigned range for this seems arbitrary and unnecessarily
incomplete. It is possible for *signed* operations to be performed on
register, which could "invalidate" unsigned range for that register. In such
case branch_taken will be artificially useless, even if we can still tell
that some constant is outside of register value range based on its signed
bounds.

veristat-based validation shows zero differences across selftests, Cilium,
and Meta-internal BPF object files.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/bpf/20231022205743.72352-2-andrii@kernel.org
---
 kernel/bpf/verifier.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 98f9d0f35931..857d76694517 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14031,12 +14031,16 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
 			return !!tnum_equals_const(subreg, val);
 		else if (val < reg->u32_min_value || val > reg->u32_max_value)
 			return 0;
+		else if (sval < reg->s32_min_value || sval > reg->s32_max_value)
+			return 0;
 		break;
 	case BPF_JNE:
 		if (tnum_is_const(subreg))
 			return !tnum_equals_const(subreg, val);
 		else if (val < reg->u32_min_value || val > reg->u32_max_value)
 			return 1;
+		else if (sval < reg->s32_min_value || sval > reg->s32_max_value)
+			return 1;
 		break;
 	case BPF_JSET:
 		if ((~subreg.mask & subreg.value) & val)
@@ -14108,12 +14112,16 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 			return !!tnum_equals_const(reg->var_off, val);
 		else if (val < reg->umin_value || val > reg->umax_value)
 			return 0;
+		else if (sval < reg->smin_value || sval > reg->smax_value)
+			return 0;
 		break;
 	case BPF_JNE:
 		if (tnum_is_const(reg->var_off))
 			return !tnum_equals_const(reg->var_off, val);
 		else if (val < reg->umin_value || val > reg->umax_value)
 			return 1;
+		else if (sval < reg->smin_value || sval > reg->smax_value)
+			return 1;
 		break;
 	case BPF_JSET:
 		if ((~reg->var_off.mask & reg->var_off.value) & val)
-- 
cgit v1.2.3


From 35dfaad7188cdc043fde31709c796f5a692ba2bd Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 24 Oct 2023 23:48:58 +0200
Subject: netkit, bpf: Add bpf programmable net device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work adds a new, minimal BPF-programmable device called "netkit"
(former PoC code-name "meta") we recently presented at LSF/MM/BPF. The
core idea is that BPF programs are executed within the drivers xmit routine
and therefore e.g. in case of containers/Pods moving BPF processing closer
to the source.

One of the goals was that in case of Pod egress traffic, this allows to
move BPF programs from hostns tcx ingress into the device itself, providing
earlier drop or forward mechanisms, for example, if the BPF program
determines that the skb must be sent out of the node, then a redirect to
the physical device can take place directly without going through per-CPU
backlog queue. This helps to shift processing for such traffic from softirq
to process context, leading to better scheduling decisions/performance (see
measurements in the slides).

In this initial version, the netkit device ships as a pair, but we plan to
extend this further so it can also operate in single device mode. The pair
comes with a primary and a peer device. Only the primary device, typically
residing in hostns, can manage BPF programs for itself and its peer. The
peer device is designated for containers/Pods and cannot attach/detach
BPF programs. Upon the device creation, the user can set the default policy
to 'pass' or 'drop' for the case when no BPF program is attached.

Additionally, the device can be operated in L3 (default) or L2 mode. The
management of BPF programs is done via bpf_mprog, so that multi-attach is
supported right from the beginning with similar API and dependency controls
as tcx. For details on the latter see commit 053c8e1f235d ("bpf: Add generic
attach/detach/query API for multi-progs"). tc BPF compatibility is provided,
so that existing programs can be easily migrated.

Going forward, we plan to use netkit devices in Cilium as the main device
type for connecting Pods. They will be operated in L3 mode in order to
simplify a Pod's neighbor management and the peer will operate in default
drop mode, so that no traffic is leaving between the time when a Pod is
brought up by the CNI plugin and programs attached by the agent.
Additionally, the programs we attach via tcx on the physical devices are
using bpf_redirect_peer() for inbound traffic into netkit device, hence the
latter is also supporting the ndo_get_peer_dev callback. Similarly, we use
bpf_redirect_neigh() for the way out, pushing from netkit peer to phys device
directly. Also, BIG TCP is supported on netkit device. For the follow-up
work in single device mode, we plan to convert Cilium's cilium_host/_net
devices into a single one.

An extensive test suite for checking device operations and the BPF program
and link management API comes as BPF selftests in this series.

Co-developed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://github.com/borkmann/iproute2/tree/pr/netkit
Link: http://vger.kernel.org/bpfconf2023_material/tcx_meta_netdev_borkmann.pdf (24ff.)
Link: https://lore.kernel.org/r/20231024214904.29825-2-daniel@iogearbox.net
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 MAINTAINERS                    |   9 +
 drivers/net/Kconfig            |   9 +
 drivers/net/Makefile           |   1 +
 drivers/net/netkit.c           | 940 +++++++++++++++++++++++++++++++++++++++++
 include/net/netkit.h           |  38 ++
 include/uapi/linux/bpf.h       |  14 +
 include/uapi/linux/if_link.h   |  24 ++
 kernel/bpf/syscall.c           |  30 +-
 tools/include/uapi/linux/bpf.h |  14 +
 9 files changed, 1074 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/netkit.c
 create mode 100644 include/net/netkit.h

(limited to 'kernel')

diff --git a/MAINTAINERS b/MAINTAINERS
index ed33b9df8b3d..43be6197e655 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3795,6 +3795,15 @@ L:	bpf@vger.kernel.org
 S:	Odd Fixes
 K:	(?:\b|_)bpf(?:\b|_)
 
+BPF [NETKIT] (BPF-programmable network device)
+M:	Daniel Borkmann <daniel@iogearbox.net>
+M:	Nikolay Aleksandrov <razor@blackwall.org>
+L:	bpf@vger.kernel.org
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/netkit.c
+F:	include/net/netkit.h
+
 BPF [NETWORKING] (struct_ops, reuseport)
 M:	Martin KaFai Lau <martin.lau@linux.dev>
 L:	bpf@vger.kernel.org
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 44eeb5d61ba9..af0da4bb429b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -448,6 +448,15 @@ config NLMON
 	  diagnostics, etc. This is mostly intended for developers or support
 	  to debug netlink issues. If unsure, say N.
 
+config NETKIT
+	bool "BPF-programmable network device"
+	depends on BPF_SYSCALL
+	help
+	  The netkit device is a virtual networking device where BPF programs
+	  can be attached to the device(s) transmission routine in order to
+	  implement the driver's internal logic. The device can be configured
+	  to operate in L3 or L2 mode. If unsure, say N.
+
 config NET_VRF
 	tristate "Virtual Routing and Forwarding (Lite)"
 	depends on IP_MULTIPLE_TABLES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 8a83db32509d..7cab36f94782 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_MDIO) += mdio.o
 obj-$(CONFIG_NET) += loopback.o
 obj-$(CONFIG_NETDEV_LEGACY_INIT) += Space.o
 obj-$(CONFIG_NETCONSOLE) += netconsole.o
+obj-$(CONFIG_NETKIT) += netkit.o
 obj-y += phy/
 obj-y += pse-pd/
 obj-y += mdio/
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
new file mode 100644
index 000000000000..7e484f9fd3ae
--- /dev/null
+++ b/drivers/net/netkit.c
@@ -0,0 +1,940 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2023 Isovalent */
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/filter.h>
+#include <linux/netfilter_netdev.h>
+#include <linux/bpf_mprog.h>
+
+#include <net/netkit.h>
+#include <net/dst.h>
+#include <net/tcx.h>
+
+#define DRV_NAME "netkit"
+
+struct netkit {
+	/* Needed in fast-path */
+	struct net_device __rcu *peer;
+	struct bpf_mprog_entry __rcu *active;
+	enum netkit_action policy;
+	struct bpf_mprog_bundle	bundle;
+
+	/* Needed in slow-path */
+	enum netkit_mode mode;
+	bool primary;
+	u32 headroom;
+};
+
+struct netkit_link {
+	struct bpf_link link;
+	struct net_device *dev;
+	u32 location;
+};
+
+static __always_inline int
+netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+	   enum netkit_action ret)
+{
+	const struct bpf_mprog_fp *fp;
+	const struct bpf_prog *prog;
+
+	bpf_mprog_foreach_prog(entry, fp, prog) {
+		bpf_compute_data_pointers(skb);
+		ret = bpf_prog_run(prog, skb);
+		if (ret != NETKIT_NEXT)
+			break;
+	}
+	return ret;
+}
+
+static void netkit_prep_forward(struct sk_buff *skb, bool xnet)
+{
+	skb_scrub_packet(skb, xnet);
+	skb->priority = 0;
+	nf_skip_egress(skb, true);
+}
+
+static struct netkit *netkit_priv(const struct net_device *dev)
+{
+	return netdev_priv(dev);
+}
+
+static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	enum netkit_action ret = READ_ONCE(nk->policy);
+	netdev_tx_t ret_dev = NET_XMIT_SUCCESS;
+	const struct bpf_mprog_entry *entry;
+	struct net_device *peer;
+
+	rcu_read_lock();
+	peer = rcu_dereference(nk->peer);
+	if (unlikely(!peer || !(peer->flags & IFF_UP) ||
+		     !pskb_may_pull(skb, ETH_HLEN) ||
+		     skb_orphan_frags(skb, GFP_ATOMIC)))
+		goto drop;
+	netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)));
+	skb->dev = peer;
+	entry = rcu_dereference(nk->active);
+	if (entry)
+		ret = netkit_run(entry, skb, ret);
+	switch (ret) {
+	case NETKIT_NEXT:
+	case NETKIT_PASS:
+		skb->protocol = eth_type_trans(skb, skb->dev);
+		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+		__netif_rx(skb);
+		break;
+	case NETKIT_REDIRECT:
+		skb_do_redirect(skb);
+		break;
+	case NETKIT_DROP:
+	default:
+drop:
+		kfree_skb(skb);
+		dev_core_stats_tx_dropped_inc(dev);
+		ret_dev = NET_XMIT_DROP;
+		break;
+	}
+	rcu_read_unlock();
+	return ret_dev;
+}
+
+static int netkit_open(struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	if (!peer)
+		return -ENOTCONN;
+	if (peer->flags & IFF_UP) {
+		netif_carrier_on(dev);
+		netif_carrier_on(peer);
+	}
+	return 0;
+}
+
+static int netkit_close(struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	netif_carrier_off(dev);
+	if (peer)
+		netif_carrier_off(peer);
+	return 0;
+}
+
+static int netkit_get_iflink(const struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer;
+	int iflink = 0;
+
+	rcu_read_lock();
+	peer = rcu_dereference(nk->peer);
+	if (peer)
+		iflink = peer->ifindex;
+	rcu_read_unlock();
+	return iflink;
+}
+
+static void netkit_set_multicast(struct net_device *dev)
+{
+	/* Nothing to do, we receive whatever gets pushed to us! */
+}
+
+static void netkit_set_headroom(struct net_device *dev, int headroom)
+{
+	struct netkit *nk = netkit_priv(dev), *nk2;
+	struct net_device *peer;
+
+	if (headroom < 0)
+		headroom = NET_SKB_PAD;
+
+	rcu_read_lock();
+	peer = rcu_dereference(nk->peer);
+	if (unlikely(!peer))
+		goto out;
+
+	nk2 = netkit_priv(peer);
+	nk->headroom = headroom;
+	headroom = max(nk->headroom, nk2->headroom);
+
+	peer->needed_headroom = headroom;
+	dev->needed_headroom = headroom;
+out:
+	rcu_read_unlock();
+}
+
+static struct net_device *netkit_peer_dev(struct net_device *dev)
+{
+	return rcu_dereference(netkit_priv(dev)->peer);
+}
+
+static void netkit_uninit(struct net_device *dev);
+
+static const struct net_device_ops netkit_netdev_ops = {
+	.ndo_open		= netkit_open,
+	.ndo_stop		= netkit_close,
+	.ndo_start_xmit		= netkit_xmit,
+	.ndo_set_rx_mode	= netkit_set_multicast,
+	.ndo_set_rx_headroom	= netkit_set_headroom,
+	.ndo_get_iflink		= netkit_get_iflink,
+	.ndo_get_peer_dev	= netkit_peer_dev,
+	.ndo_uninit		= netkit_uninit,
+	.ndo_features_check	= passthru_features_check,
+};
+
+static void netkit_get_drvinfo(struct net_device *dev,
+			       struct ethtool_drvinfo *info)
+{
+	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
+}
+
+static const struct ethtool_ops netkit_ethtool_ops = {
+	.get_drvinfo		= netkit_get_drvinfo,
+};
+
+static void netkit_setup(struct net_device *dev)
+{
+	static const netdev_features_t netkit_features_hw_vlan =
+		NETIF_F_HW_VLAN_CTAG_TX |
+		NETIF_F_HW_VLAN_CTAG_RX |
+		NETIF_F_HW_VLAN_STAG_TX |
+		NETIF_F_HW_VLAN_STAG_RX;
+	static const netdev_features_t netkit_features =
+		netkit_features_hw_vlan |
+		NETIF_F_SG |
+		NETIF_F_FRAGLIST |
+		NETIF_F_HW_CSUM |
+		NETIF_F_RXCSUM |
+		NETIF_F_SCTP_CRC |
+		NETIF_F_HIGHDMA |
+		NETIF_F_GSO_SOFTWARE |
+		NETIF_F_GSO_ENCAP_ALL;
+
+	ether_setup(dev);
+	dev->max_mtu = ETH_MAX_MTU;
+
+	dev->flags |= IFF_NOARP;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	dev->priv_flags |= IFF_PHONY_HEADROOM;
+	dev->priv_flags |= IFF_NO_QUEUE;
+
+	dev->ethtool_ops = &netkit_ethtool_ops;
+	dev->netdev_ops  = &netkit_netdev_ops;
+
+	dev->features |= netkit_features | NETIF_F_LLTX;
+	dev->hw_features = netkit_features;
+	dev->hw_enc_features = netkit_features;
+	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
+	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
+
+	dev->needs_free_netdev = true;
+
+	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
+}
+
+static struct net *netkit_get_link_net(const struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	return peer ? dev_net(peer) : dev_net(dev);
+}
+
+static int netkit_check_policy(int policy, struct nlattr *tb,
+			       struct netlink_ext_ack *extack)
+{
+	switch (policy) {
+	case NETKIT_PASS:
+	case NETKIT_DROP:
+		return 0;
+	default:
+		NL_SET_ERR_MSG_ATTR(extack, tb,
+				    "Provided default xmit policy not supported");
+		return -EINVAL;
+	}
+}
+
+static int netkit_check_mode(int mode, struct nlattr *tb,
+			     struct netlink_ext_ack *extack)
+{
+	switch (mode) {
+	case NETKIT_L2:
+	case NETKIT_L3:
+		return 0;
+	default:
+		NL_SET_ERR_MSG_ATTR(extack, tb,
+				    "Provided device mode can only be L2 or L3");
+		return -EINVAL;
+	}
+}
+
+static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attr = tb[IFLA_ADDRESS];
+
+	if (!attr)
+		return 0;
+	NL_SET_ERR_MSG_ATTR(extack, attr,
+			    "Setting Ethernet address is not supported");
+	return -EOPNOTSUPP;
+}
+
+static struct rtnl_link_ops netkit_link_ops;
+
+static int netkit_new_link(struct net *src_net, struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr;
+	enum netkit_action default_prim = NETKIT_PASS;
+	enum netkit_action default_peer = NETKIT_PASS;
+	enum netkit_mode mode = NETKIT_L3;
+	unsigned char ifname_assign_type;
+	struct ifinfomsg *ifmp = NULL;
+	struct net_device *peer;
+	char ifname[IFNAMSIZ];
+	struct netkit *nk;
+	struct net *net;
+	int err;
+
+	if (data) {
+		if (data[IFLA_NETKIT_MODE]) {
+			attr = data[IFLA_NETKIT_MODE];
+			mode = nla_get_u32(attr);
+			err = netkit_check_mode(mode, attr, extack);
+			if (err < 0)
+				return err;
+		}
+		if (data[IFLA_NETKIT_PEER_INFO]) {
+			attr = data[IFLA_NETKIT_PEER_INFO];
+			ifmp = nla_data(attr);
+			err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
+			if (err < 0)
+				return err;
+			err = netkit_validate(peer_tb, NULL, extack);
+			if (err < 0)
+				return err;
+			tbp = peer_tb;
+		}
+		if (data[IFLA_NETKIT_POLICY]) {
+			attr = data[IFLA_NETKIT_POLICY];
+			default_prim = nla_get_u32(attr);
+			err = netkit_check_policy(default_prim, attr, extack);
+			if (err < 0)
+				return err;
+		}
+		if (data[IFLA_NETKIT_PEER_POLICY]) {
+			attr = data[IFLA_NETKIT_PEER_POLICY];
+			default_peer = nla_get_u32(attr);
+			err = netkit_check_policy(default_peer, attr, extack);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	if (ifmp && tbp[IFLA_IFNAME]) {
+		nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
+		ifname_assign_type = NET_NAME_USER;
+	} else {
+		strscpy(ifname, "nk%d", IFNAMSIZ);
+		ifname_assign_type = NET_NAME_ENUM;
+	}
+
+	net = rtnl_link_get_net(src_net, tbp);
+	if (IS_ERR(net))
+		return PTR_ERR(net);
+
+	peer = rtnl_create_link(net, ifname, ifname_assign_type,
+				&netkit_link_ops, tbp, extack);
+	if (IS_ERR(peer)) {
+		put_net(net);
+		return PTR_ERR(peer);
+	}
+
+	netif_inherit_tso_max(peer, dev);
+
+	if (mode == NETKIT_L2)
+		eth_hw_addr_random(peer);
+	if (ifmp && dev->ifindex)
+		peer->ifindex = ifmp->ifi_index;
+
+	nk = netkit_priv(peer);
+	nk->primary = false;
+	nk->policy = default_peer;
+	nk->mode = mode;
+	bpf_mprog_bundle_init(&nk->bundle);
+	RCU_INIT_POINTER(nk->active, NULL);
+	RCU_INIT_POINTER(nk->peer, NULL);
+
+	err = register_netdevice(peer);
+	put_net(net);
+	if (err < 0)
+		goto err_register_peer;
+	netif_carrier_off(peer);
+	if (mode == NETKIT_L2)
+		dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+
+	err = rtnl_configure_link(peer, NULL, 0, NULL);
+	if (err < 0)
+		goto err_configure_peer;
+
+	if (mode == NETKIT_L2)
+		eth_hw_addr_random(dev);
+	if (tb[IFLA_IFNAME])
+		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
+	else
+		strscpy(dev->name, "nk%d", IFNAMSIZ);
+
+	nk = netkit_priv(dev);
+	nk->primary = true;
+	nk->policy = default_prim;
+	nk->mode = mode;
+	bpf_mprog_bundle_init(&nk->bundle);
+	RCU_INIT_POINTER(nk->active, NULL);
+	RCU_INIT_POINTER(nk->peer, NULL);
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto err_configure_peer;
+	netif_carrier_off(dev);
+	if (mode == NETKIT_L2)
+		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
+
+	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
+	rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+	return 0;
+err_configure_peer:
+	unregister_netdevice(peer);
+	return err;
+err_register_peer:
+	free_netdev(peer);
+	return err;
+}
+
+static struct bpf_mprog_entry *netkit_entry_fetch(struct net_device *dev,
+						  bool bundle_fallback)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct bpf_mprog_entry *entry;
+
+	ASSERT_RTNL();
+	entry = rcu_dereference_rtnl(nk->active);
+	if (entry)
+		return entry;
+	if (bundle_fallback)
+		return &nk->bundle.a;
+	return NULL;
+}
+
+static void netkit_entry_update(struct net_device *dev,
+				struct bpf_mprog_entry *entry)
+{
+	struct netkit *nk = netkit_priv(dev);
+
+	ASSERT_RTNL();
+	rcu_assign_pointer(nk->active, entry);
+}
+
+static void netkit_entry_sync(void)
+{
+	synchronize_rcu();
+}
+
+static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 which)
+{
+	struct net_device *dev;
+	struct netkit *nk;
+
+	ASSERT_RTNL();
+
+	switch (which) {
+	case BPF_NETKIT_PRIMARY:
+	case BPF_NETKIT_PEER:
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+	if (dev->netdev_ops != &netkit_netdev_ops)
+		return ERR_PTR(-ENXIO);
+
+	nk = netkit_priv(dev);
+	if (!nk->primary)
+		return ERR_PTR(-EACCES);
+	if (which == BPF_NETKIT_PEER) {
+		dev = rcu_dereference_rtnl(nk->peer);
+		if (!dev)
+			return ERR_PTR(-ENODEV);
+	}
+	return dev;
+}
+
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct bpf_prog *replace_prog = NULL;
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
+			       attr->attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	entry = netkit_entry_fetch(dev, true);
+	if (attr->attach_flags & BPF_F_REPLACE) {
+		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd,
+						 prog->type);
+		if (IS_ERR(replace_prog)) {
+			ret = PTR_ERR(replace_prog);
+			replace_prog = NULL;
+			goto out;
+		}
+	}
+	ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog,
+			       attr->attach_flags, attr->relative_fd,
+			       attr->expected_revision);
+	if (!ret) {
+		if (entry != entry_new) {
+			netkit_entry_update(dev, entry_new);
+			netkit_entry_sync();
+		}
+		bpf_mprog_commit(entry);
+	}
+out:
+	if (replace_prog)
+		bpf_prog_put(replace_prog);
+	rtnl_unlock();
+	return ret;
+}
+
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns, attr->target_ifindex,
+			       attr->attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags,
+			       attr->relative_fd, attr->expected_revision);
+	if (!ret) {
+		if (!bpf_mprog_total(entry_new))
+			entry_new = NULL;
+		netkit_entry_update(dev, entry_new);
+		netkit_entry_sync();
+		bpf_mprog_commit(entry);
+	}
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns,
+			       attr->query.target_ifindex,
+			       attr->query.attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	ret = bpf_mprog_query(attr, uattr, netkit_entry_fetch(dev, false));
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static struct netkit_link *netkit_link(const struct bpf_link *link)
+{
+	return container_of(link, struct netkit_link, link);
+}
+
+static int netkit_link_prog_attach(struct bpf_link *link, u32 flags,
+				   u32 id_or_fd, u64 revision)
+{
+	struct netkit_link *nkl = netkit_link(link);
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev = nkl->dev;
+	int ret;
+
+	ASSERT_RTNL();
+	entry = netkit_entry_fetch(dev, true);
+	ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags,
+			       id_or_fd, revision);
+	if (!ret) {
+		if (entry != entry_new) {
+			netkit_entry_update(dev, entry_new);
+			netkit_entry_sync();
+		}
+		bpf_mprog_commit(entry);
+	}
+	return ret;
+}
+
+static void netkit_link_release(struct bpf_link *link)
+{
+	struct netkit_link *nkl = netkit_link(link);
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev;
+	int ret = 0;
+
+	rtnl_lock();
+	dev = nkl->dev;
+	if (!dev)
+		goto out;
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0);
+	if (!ret) {
+		if (!bpf_mprog_total(entry_new))
+			entry_new = NULL;
+		netkit_entry_update(dev, entry_new);
+		netkit_entry_sync();
+		bpf_mprog_commit(entry);
+		nkl->dev = NULL;
+	}
+out:
+	WARN_ON_ONCE(ret);
+	rtnl_unlock();
+}
+
+static int netkit_link_update(struct bpf_link *link, struct bpf_prog *nprog,
+			      struct bpf_prog *oprog)
+{
+	struct netkit_link *nkl = netkit_link(link);
+	struct bpf_mprog_entry *entry, *entry_new;
+	struct net_device *dev;
+	int ret = 0;
+
+	rtnl_lock();
+	dev = nkl->dev;
+	if (!dev) {
+		ret = -ENOLINK;
+		goto out;
+	}
+	if (oprog && link->prog != oprog) {
+		ret = -EPERM;
+		goto out;
+	}
+	oprog = link->prog;
+	if (oprog == nprog) {
+		bpf_prog_put(nprog);
+		goto out;
+	}
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry) {
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog,
+			       BPF_F_REPLACE | BPF_F_ID,
+			       link->prog->aux->id, 0);
+	if (!ret) {
+		WARN_ON_ONCE(entry != entry_new);
+		oprog = xchg(&link->prog, nprog);
+		bpf_prog_put(oprog);
+		bpf_mprog_commit(entry);
+	}
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static void netkit_link_dealloc(struct bpf_link *link)
+{
+	kfree(netkit_link(link));
+}
+
+static void netkit_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+	const struct netkit_link *nkl = netkit_link(link);
+	u32 ifindex = 0;
+
+	rtnl_lock();
+	if (nkl->dev)
+		ifindex = nkl->dev->ifindex;
+	rtnl_unlock();
+
+	seq_printf(seq, "ifindex:\t%u\n", ifindex);
+	seq_printf(seq, "attach_type:\t%u (%s)\n",
+		   nkl->location,
+		   nkl->location == BPF_NETKIT_PRIMARY ? "primary" : "peer");
+}
+
+static int netkit_link_fill_info(const struct bpf_link *link,
+				 struct bpf_link_info *info)
+{
+	const struct netkit_link *nkl = netkit_link(link);
+	u32 ifindex = 0;
+
+	rtnl_lock();
+	if (nkl->dev)
+		ifindex = nkl->dev->ifindex;
+	rtnl_unlock();
+
+	info->netkit.ifindex = ifindex;
+	info->netkit.attach_type = nkl->location;
+	return 0;
+}
+
+static int netkit_link_detach(struct bpf_link *link)
+{
+	netkit_link_release(link);
+	return 0;
+}
+
+static const struct bpf_link_ops netkit_link_lops = {
+	.release	= netkit_link_release,
+	.detach		= netkit_link_detach,
+	.dealloc	= netkit_link_dealloc,
+	.update_prog	= netkit_link_update,
+	.show_fdinfo	= netkit_link_fdinfo,
+	.fill_link_info	= netkit_link_fill_info,
+};
+
+static int netkit_link_init(struct netkit_link *nkl,
+			    struct bpf_link_primer *link_primer,
+			    const union bpf_attr *attr,
+			    struct net_device *dev,
+			    struct bpf_prog *prog)
+{
+	bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
+		      &netkit_link_lops, prog);
+	nkl->location = attr->link_create.attach_type;
+	nkl->dev = dev;
+	return bpf_link_prime(&nkl->link, link_primer);
+}
+
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct netkit_link *nkl;
+	struct net_device *dev;
+	int ret;
+
+	rtnl_lock();
+	dev = netkit_dev_fetch(current->nsproxy->net_ns,
+			       attr->link_create.target_ifindex,
+			       attr->link_create.attach_type);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+	nkl = kzalloc(sizeof(*nkl), GFP_KERNEL_ACCOUNT);
+	if (!nkl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = netkit_link_init(nkl, &link_primer, attr, dev, prog);
+	if (ret) {
+		kfree(nkl);
+		goto out;
+	}
+	ret = netkit_link_prog_attach(&nkl->link,
+				      attr->link_create.flags,
+				      attr->link_create.netkit.relative_fd,
+				      attr->link_create.netkit.expected_revision);
+	if (ret) {
+		nkl->dev = NULL;
+		bpf_link_cleanup(&link_primer);
+		goto out;
+	}
+	ret = bpf_link_settle(&link_primer);
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static void netkit_release_all(struct net_device *dev)
+{
+	struct bpf_mprog_entry *entry;
+	struct bpf_tuple tuple = {};
+	struct bpf_mprog_fp *fp;
+	struct bpf_mprog_cp *cp;
+
+	entry = netkit_entry_fetch(dev, false);
+	if (!entry)
+		return;
+	netkit_entry_update(dev, NULL);
+	netkit_entry_sync();
+	bpf_mprog_foreach_tuple(entry, fp, cp, tuple) {
+		if (tuple.link)
+			netkit_link(tuple.link)->dev = NULL;
+		else
+			bpf_prog_put(tuple.prog);
+	}
+}
+
+static void netkit_uninit(struct net_device *dev)
+{
+	netkit_release_all(dev);
+}
+
+static void netkit_del_link(struct net_device *dev, struct list_head *head)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	RCU_INIT_POINTER(nk->peer, NULL);
+	unregister_netdevice_queue(dev, head);
+	if (peer) {
+		nk = netkit_priv(peer);
+		RCU_INIT_POINTER(nk->peer, NULL);
+		unregister_netdevice_queue(peer, head);
+	}
+}
+
+static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
+			      struct nlattr *data[],
+			      struct netlink_ext_ack *extack)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+	enum netkit_action policy;
+	struct nlattr *attr;
+	int err;
+
+	if (!nk->primary) {
+		NL_SET_ERR_MSG(extack,
+			       "netkit link settings can be changed only through the primary device");
+		return -EACCES;
+	}
+
+	if (data[IFLA_NETKIT_MODE]) {
+		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_MODE],
+				    "netkit link operating mode cannot be changed after device creation");
+		return -EACCES;
+	}
+
+	if (data[IFLA_NETKIT_POLICY]) {
+		attr = data[IFLA_NETKIT_POLICY];
+		policy = nla_get_u32(attr);
+		err = netkit_check_policy(policy, attr, extack);
+		if (err)
+			return err;
+		WRITE_ONCE(nk->policy, policy);
+	}
+
+	if (data[IFLA_NETKIT_PEER_POLICY]) {
+		err = -EOPNOTSUPP;
+		attr = data[IFLA_NETKIT_PEER_POLICY];
+		policy = nla_get_u32(attr);
+		if (peer)
+			err = netkit_check_policy(policy, attr, extack);
+		if (err)
+			return err;
+		nk = netkit_priv(peer);
+		WRITE_ONCE(nk->policy, policy);
+	}
+
+	return 0;
+}
+
+static size_t netkit_get_size(const struct net_device *dev)
+{
+	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */
+	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
+	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */
+	       0;
+}
+
+static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct net_device *peer = rtnl_dereference(nk->peer);
+
+	if (nla_put_u8(skb, IFLA_NETKIT_PRIMARY, nk->primary))
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_NETKIT_POLICY, nk->policy))
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode))
+		return -EMSGSIZE;
+
+	if (peer) {
+		nk = netkit_priv(peer);
+		if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy))
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
+static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
+	[IFLA_NETKIT_PEER_INFO]		= { .len = sizeof(struct ifinfomsg) },
+	[IFLA_NETKIT_POLICY]		= { .type = NLA_U32 },
+	[IFLA_NETKIT_MODE]		= { .type = NLA_U32 },
+	[IFLA_NETKIT_PEER_POLICY]	= { .type = NLA_U32 },
+	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
+					    .reject_message = "Primary attribute is read-only" },
+};
+
+static struct rtnl_link_ops netkit_link_ops = {
+	.kind		= DRV_NAME,
+	.priv_size	= sizeof(struct netkit),
+	.setup		= netkit_setup,
+	.newlink	= netkit_new_link,
+	.dellink	= netkit_del_link,
+	.changelink	= netkit_change_link,
+	.get_link_net	= netkit_get_link_net,
+	.get_size	= netkit_get_size,
+	.fill_info	= netkit_fill_info,
+	.policy		= netkit_policy,
+	.validate	= netkit_validate,
+	.maxtype	= IFLA_NETKIT_MAX,
+};
+
+static __init int netkit_init(void)
+{
+	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
+		     (int)NETKIT_PASS != (int)TCX_PASS ||
+		     (int)NETKIT_DROP != (int)TCX_DROP ||
+		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
+
+	return rtnl_link_register(&netkit_link_ops);
+}
+
+static __exit void netkit_exit(void)
+{
+	rtnl_link_unregister(&netkit_link_ops);
+}
+
+module_init(netkit_init);
+module_exit(netkit_exit);
+
+MODULE_DESCRIPTION("BPF-programmable network device");
+MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
+MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);
diff --git a/include/net/netkit.h b/include/net/netkit.h
new file mode 100644
index 000000000000..0ba2e6b847ca
--- /dev/null
+++ b/include/net/netkit.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2023 Isovalent */
+#ifndef __NET_NETKIT_H
+#define __NET_NETKIT_H
+
+#include <linux/bpf.h>
+
+#ifdef CONFIG_NETKIT
+int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
+int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int netkit_prog_attach(const union bpf_attr *attr,
+				     struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int netkit_link_attach(const union bpf_attr *attr,
+				     struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int netkit_prog_detach(const union bpf_attr *attr,
+				     struct bpf_prog *prog)
+{
+	return -EINVAL;
+}
+
+static inline int netkit_prog_query(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_NETKIT */
+#endif /* __NET_NETKIT_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7ba61b75bc0e..0f6cdf52b1da 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1052,6 +1052,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_RECVMSG,
 	BPF_CGROUP_UNIX_GETPEERNAME,
 	BPF_CGROUP_UNIX_GETSOCKNAME,
+	BPF_NETKIT_PRIMARY,
+	BPF_NETKIT_PEER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1071,6 +1073,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_NETFILTER = 10,
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
+	BPF_LINK_TYPE_NETKIT = 13,
 	MAX_BPF_LINK_TYPE,
 };
 
@@ -1656,6 +1659,13 @@ union bpf_attr {
 				__u32		flags;
 				__u32		pid;
 			} uprobe_multi;
+			struct {
+				union {
+					__u32	relative_fd;
+					__u32	relative_id;
+				};
+				__u64		expected_revision;
+			} netkit;
 		};
 	} link_create;
 
@@ -6576,6 +6586,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} tcx;
+		struct {
+			__u32 ifindex;
+			__u32 attach_type;
+		} netkit;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index fac351a93aed..a0aa05a28cf2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -756,6 +756,30 @@ struct tunnel_msg {
 	__u32 ifindex;
 };
 
+/* netkit section */
+enum netkit_action {
+	NETKIT_NEXT	= -1,
+	NETKIT_PASS	= 0,
+	NETKIT_DROP	= 2,
+	NETKIT_REDIRECT	= 7,
+};
+
+enum netkit_mode {
+	NETKIT_L2,
+	NETKIT_L3,
+};
+
+enum {
+	IFLA_NETKIT_UNSPEC,
+	IFLA_NETKIT_PEER_INFO,
+	IFLA_NETKIT_PRIMARY,
+	IFLA_NETKIT_POLICY,
+	IFLA_NETKIT_PEER_POLICY,
+	IFLA_NETKIT_MODE,
+	__IFLA_NETKIT_MAX,
+};
+#define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
+
 /* VXLAN section */
 
 /* include statistics in the dump */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a9b2cb500bf7..0ed286b8a0f0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,8 +35,9 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
-#include <net/netfilter/nf_bpf_link.h>
 
+#include <net/netfilter/nf_bpf_link.h>
+#include <net/netkit.h>
 #include <net/tcx.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -3730,6 +3731,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_LSM;
 	case BPF_TCX_INGRESS:
 	case BPF_TCX_EGRESS:
+	case BPF_NETKIT_PRIMARY:
+	case BPF_NETKIT_PEER:
 		return BPF_PROG_TYPE_SCHED_CLS;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
@@ -3781,7 +3784,9 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 		return 0;
 	case BPF_PROG_TYPE_SCHED_CLS:
 		if (attach_type != BPF_TCX_INGRESS &&
-		    attach_type != BPF_TCX_EGRESS)
+		    attach_type != BPF_TCX_EGRESS &&
+		    attach_type != BPF_NETKIT_PRIMARY &&
+		    attach_type != BPF_NETKIT_PEER)
 			return -EINVAL;
 		return 0;
 	default:
@@ -3864,7 +3869,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 			ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
-		ret = tcx_prog_attach(attr, prog);
+		if (attr->attach_type == BPF_TCX_INGRESS ||
+		    attr->attach_type == BPF_TCX_EGRESS)
+			ret = tcx_prog_attach(attr, prog);
+		else
+			ret = netkit_prog_attach(attr, prog);
 		break;
 	default:
 		ret = -EINVAL;
@@ -3925,7 +3934,11 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ret = cgroup_bpf_prog_detach(attr, ptype);
 		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
-		ret = tcx_prog_detach(attr, prog);
+		if (attr->attach_type == BPF_TCX_INGRESS ||
+		    attr->attach_type == BPF_TCX_EGRESS)
+			ret = tcx_prog_detach(attr, prog);
+		else
+			ret = netkit_prog_detach(attr, prog);
 		break;
 	default:
 		ret = -EINVAL;
@@ -3992,6 +4005,9 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_TCX_INGRESS:
 	case BPF_TCX_EGRESS:
 		return tcx_prog_query(attr, uattr);
+	case BPF_NETKIT_PRIMARY:
+	case BPF_NETKIT_PEER:
+		return netkit_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
@@ -4973,7 +4989,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
-		ret = tcx_link_attach(attr, prog);
+		if (attr->link_create.attach_type == BPF_TCX_INGRESS ||
+		    attr->link_create.attach_type == BPF_TCX_EGRESS)
+			ret = tcx_link_attach(attr, prog);
+		else
+			ret = netkit_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_NETFILTER:
 		ret = bpf_nf_link_attach(attr, prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7ba61b75bc0e..0f6cdf52b1da 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1052,6 +1052,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_RECVMSG,
 	BPF_CGROUP_UNIX_GETPEERNAME,
 	BPF_CGROUP_UNIX_GETSOCKNAME,
+	BPF_NETKIT_PRIMARY,
+	BPF_NETKIT_PEER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1071,6 +1073,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_NETFILTER = 10,
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
+	BPF_LINK_TYPE_NETKIT = 13,
 	MAX_BPF_LINK_TYPE,
 };
 
@@ -1656,6 +1659,13 @@ union bpf_attr {
 				__u32		flags;
 				__u32		pid;
 			} uprobe_multi;
+			struct {
+				union {
+					__u32	relative_fd;
+					__u32	relative_id;
+				};
+				__u64		expected_revision;
+			} netkit;
 		};
 	} link_create;
 
@@ -6576,6 +6586,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} tcx;
+		struct {
+			__u32 ifindex;
+			__u32 attach_type;
+		} netkit;
 	};
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3


From d5090484b021794271280ab64d20253883b7f6fd Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Date: Wed, 25 Oct 2023 10:44:25 +0200
Subject: swiotlb: do not try to allocate a TLB bigger than MAX_ORDER pages

When allocating a new pool at runtime, reduce the number of slabs so
that the allocation order is at most MAX_ORDER.  This avoids a kernel
warning in __alloc_pages().

The warning is relatively benign, because the pool size is subsequently
reduced when allocation fails, but it is silly to start with a request
that is known to fail, especially since this is the default behavior if
the kernel is built with CONFIG_SWIOTLB_DYNAMIC=y and booted without any
swiotlb= parameter.

Reported-by: Ben Greear <greearb@candelatech.com>
Closes: https://lore.kernel.org/netdev/4f173dd2-324a-0240-ff8d-abf5c191be18@candelatech.com/
Fixes: 1aaa736815eb ("swiotlb: allocate a new memory pool when existing pools are full")
Signed-off-by: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 01637677736f..dff067bd56b1 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -678,6 +678,11 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
 	size_t pool_size;
 	size_t tlb_size;
 
+	if (nslabs > SLABS_PER_PAGE << MAX_ORDER) {
+		nslabs = SLABS_PER_PAGE << MAX_ORDER;
+		nareas = limit_nareas(nareas, nslabs);
+	}
+
 	pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas);
 	pool = kzalloc(pool_size, gfp);
 	if (!pool)
-- 
cgit v1.2.3


From a0b0bad10587ae2948a7c36ca4ffc206007fbcf3 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Fri, 20 Oct 2023 15:25:22 +0800
Subject: genirq/matrix: Exclude managed interrupts in irq_matrix_allocated()

When a CPU is about to be offlined, x86 validates that all active
interrupts which are targeted to this CPU can be migrated to the remaining
online CPUs. If not, the offline operation is aborted.

The validation uses irq_matrix_allocated() to retrieve the number of
vectors which are allocated on the outgoing CPU. The returned number of
allocated vectors includes also vectors which are associated to managed
interrupts.

That's overaccounting because managed interrupts are:

  - not migrated when the affinity mask of the interrupt targets only
    the outgoing CPU

  - migrated to another CPU, but in that case the vector is already
    pre-allocated on the potential target CPUs and must not be taken into
    account.

As a consequence the check whether the remaining online CPUs have enough
capacity for migrating the allocated vectors from the outgoing CPU might
fail incorrectly.

Let irq_matrix_allocated() return only the number of allocated non-managed
interrupts to make this validation check correct.

[ tglx: Amend changelog and fixup kernel-doc comment ]

Fixes: 2f75d9e1c905 ("genirq: Implement bitmap matrix allocator")
Reported-by: Wendy Wang <wendy.wang@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231020072522.557846-1-yu.c.chen@intel.com
---
 kernel/irq/matrix.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 1698e77645ac..75d0ae490e29 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -466,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m)
 }
 
 /**
- * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU
  * @m:		Pointer to the matrix to search
  *
- * This returns number of allocated irqs
+ * This returns number of allocated non-managed interrupts.
  */
 unsigned int irq_matrix_allocated(struct irq_matrix *m)
 {
 	struct cpumap *cm = this_cpu_ptr(m->maps);
 
-	return cm->allocated;
+	return cm->allocated - cm->managed_allocated;
 }
 
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
-- 
cgit v1.2.3


From 0b201c3624ae9f58ebfff8484f304f3008fb01b8 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:07:55 +0800
Subject: sched/fair: use folio_xchg_access_time() in numa_hint_fault_latency()

Convert to use folio_xchg_access_time() in numa_hint_fault_latency().

Link: https://lkml.kernel.org/r/20231018140806.2783514-9-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 42aefe7e6fdc..3ee9d3564c20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1727,7 +1727,7 @@ static int numa_hint_fault_latency(struct folio *folio)
 	int last_time, time;
 
 	time = jiffies_to_msecs(jiffies);
-	last_time = xchg_page_access_time(&folio->page, time);
+	last_time = folio_xchg_access_time(folio, time);
 
 	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
-- 
cgit v1.2.3


From 1b143cc77f2074dd43b610d6bfffc822d20b6e16 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 18 Oct 2023 22:08:00 +0800
Subject: sched/fair: use folio_xchg_last_cpupid() in
 should_numa_migrate_memory()

Convert to use folio_xchg_last_cpupid() in should_numa_migrate_memory().

Link: https://lkml.kernel.org/r/20231018140806.2783514-14-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3ee9d3564c20..d1a765cdf6e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1823,7 +1823,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 	}
 
 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
-	last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid);
+	last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
 
 	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
 	    !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
-- 
cgit v1.2.3


From 47846d51348dd62e5231a83be040981b17c955fa Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 9 Oct 2023 13:18:49 -0400
Subject: audit: don't take task_lock() in audit_exe_compare() code path

The get_task_exe_file() function locks the given task with task_lock()
which when used inside audit_exe_compare() can cause deadlocks on
systems that generate audit records when the task_lock() is held. We
resolve this problem with two changes: ignoring those cases where the
task being audited is not the current task, and changing our approach
to obtaining the executable file struct to not require task_lock().

With the intent of the audit exe filter being to filter on audit events
generated by processes started by the specified executable, it makes
sense that we would only want to use the exe filter on audit records
associated with the currently executing process, e.g. @current.  If
we are asked to filter records using a non-@current task_struct we can
safely ignore the exe filter without negatively impacting the admin's
expectations for the exe filter.

Knowing that we only have to worry about filtering the currently
executing task in audit_exe_compare() we can do away with the
task_lock() and call get_mm_exe_file() with @current->mm directly.

Cc: <stable@vger.kernel.org>
Fixes: 5efc244346f9 ("audit: fix exe_file access in audit_exe_compare")
Reported-by: Andreas Steinmetz <anstein99@googlemail.com>
Reviewed-by: John Johansen <john.johanse@canonical.com>
Reviewed-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_watch.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 65075f1e4ac8..91e82e34b51e 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -527,11 +527,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	unsigned long ino;
 	dev_t dev;
 
-	exe_file = get_task_exe_file(tsk);
+	/* only do exe filtering if we are recording @current events/records */
+	if (tsk != current)
+		return 0;
+
+	if (WARN_ON_ONCE(!current->mm))
+		return 0;
+	exe_file = get_mm_exe_file(current->mm);
 	if (!exe_file)
 		return 0;
 	ino = file_inode(exe_file)->i_ino;
 	dev = file_inode(exe_file)->i_sb->s_dev;
 	fput(exe_file);
+
 	return audit_mark_compare(mark, ino, dev);
 }
-- 
cgit v1.2.3


From b56ebe7c896dc78b5865ec2c4b1dae3c93537517 Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@valinux.co.jp>
Date: Thu, 26 Oct 2023 12:20:36 +0900
Subject: x86/apic/msi: Fix misconfigured non-maskable MSI quirk

commit ef8dd01538ea ("genirq/msi: Make interrupt allocation less
convoluted"), reworked the code so that the x86 specific quirk for affinity
setting of non-maskable PCI/MSI interrupts is not longer activated if
necessary.

This could be solved by restoring the original logic in the core MSI code,
but after a deeper analysis it turned out that the quirk flag is not
required at all.

The quirk is only required when the PCI/MSI device cannot mask the MSI
interrupts, which in turn also prevents reservation mode from being enabled
for the affected interrupt.

This allows ot remove the NOMASK quirk bit completely as msi_set_affinity()
can instead check whether reservation mode is enabled for the interrupt,
which gives exactly the same answer.

Even in the momentary non-existing case that the reservation mode would be
not set for a maskable MSI interrupt this would not cause any harm as it
just would cause msi_set_affinity() to go needlessly through the
functionaly equivalent slow path, which works perfectly fine with maskable
interrupts as well.

Rework msi_set_affinity() to query the reservation mode and remove all
NOMASK quirk logic from the core code.

[ tglx: Massaged changelog ]

Fixes: ef8dd01538ea ("genirq/msi: Make interrupt allocation less convoluted")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Koichiro Den <den@valinux.co.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20231026032036.2462428-1-den@valinux.co.jp
---
 arch/x86/kernel/apic/msi.c |  8 +++-----
 include/linux/irq.h        | 26 ++++----------------------
 include/linux/msi.h        |  6 ------
 kernel/irq/debugfs.c       |  1 -
 kernel/irq/msi.c           | 12 +-----------
 5 files changed, 8 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 6b6b711678fe..d9651f15ae4f 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -55,14 +55,14 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
 	 * caused by the non-atomic update of the address/data pair.
 	 *
 	 * Direct update is possible when:
-	 * - The MSI is maskable (remapped MSI does not use this code path)).
-	 *   The quirk bit is not set in this case.
+	 * - The MSI is maskable (remapped MSI does not use this code path).
+	 *   The reservation mode bit is set in this case.
 	 * - The new vector is the same as the old vector
 	 * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
 	 * - The interrupt is not yet started up
 	 * - The new destination CPU is the same as the old destination CPU
 	 */
-	if (!irqd_msi_nomask_quirk(irqd) ||
+	if (!irqd_can_reserve(irqd) ||
 	    cfg->vector == old_cfg.vector ||
 	    old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
 	    !irqd_is_started(irqd) ||
@@ -215,8 +215,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 		if (WARN_ON_ONCE(domain != real_parent))
 			return false;
 		info->chip->irq_set_affinity = msi_set_affinity;
-		/* See msi_set_affinity() for the gory details */
-		info->flags |= MSI_FLAG_NOMASK_QUIRK;
 		break;
 	case DOMAIN_BUS_DMAR:
 	case DOMAIN_BUS_AMDVI:
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d8a6fdce9373..90081afa10ce 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -215,8 +215,6 @@ struct irq_data {
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
  * IRQD_CAN_RESERVE		- Can use reservation mode
- * IRQD_MSI_NOMASK_QUIRK	- Non-maskable MSI quirk for affinity change
- *				  required
  * IRQD_HANDLE_ENFORCE_IRQCTX	- Enforce that handle_irq_*() is only invoked
  *				  from actual interrupt context.
  * IRQD_AFFINITY_ON_ACTIVATE	- Affinity is set on activation. Don't call
@@ -247,11 +245,10 @@ enum {
 	IRQD_SINGLE_TARGET		= BIT(24),
 	IRQD_DEFAULT_TRIGGER_SET	= BIT(25),
 	IRQD_CAN_RESERVE		= BIT(26),
-	IRQD_MSI_NOMASK_QUIRK		= BIT(27),
-	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(28),
-	IRQD_AFFINITY_ON_ACTIVATE	= BIT(29),
-	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(30),
-	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(31),
+	IRQD_HANDLE_ENFORCE_IRQCTX	= BIT(27),
+	IRQD_AFFINITY_ON_ACTIVATE	= BIT(28),
+	IRQD_IRQ_ENABLED_ON_SUSPEND	= BIT(29),
+	IRQD_RESEND_WHEN_IN_PROGRESS    = BIT(30),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -426,21 +423,6 @@ static inline bool irqd_can_reserve(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_CAN_RESERVE;
 }
 
-static inline void irqd_set_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) |= IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline void irqd_clr_msi_nomask_quirk(struct irq_data *d)
-{
-	__irqd_to_state(d) &= ~IRQD_MSI_NOMASK_QUIRK;
-}
-
-static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
-{
-	return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
-}
-
 static inline void irqd_set_affinity_on_activate(struct irq_data *d)
 {
 	__irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index a50ea79522f8..ddace8c34dcf 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -547,12 +547,6 @@ enum {
 	MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS	= (1 << 5),
 	/* Free MSI descriptors */
 	MSI_FLAG_FREE_MSI_DESCS		= (1 << 6),
-	/*
-	 * Quirk to handle MSI implementations which do not provide
-	 * masking. Currently known to affect x86, but has to be partially
-	 * handled in the core MSI code.
-	 */
-	MSI_FLAG_NOMASK_QUIRK		= (1 << 7),
 
 	/* Mask for the generic functionality */
 	MSI_GENERIC_FLAGS_MASK		= GENMASK(15, 0),
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 5971a66be034..aae0402507ed 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -121,7 +121,6 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
 	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
-	BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
 
 	BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index b4c31a5c1147..79b4a58ba9c3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1204,7 +1204,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
 
 #define VIRQ_CAN_RESERVE	0x01
 #define VIRQ_ACTIVATE		0x02
-#define VIRQ_NOMASK_QUIRK	0x04
 
 static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
 {
@@ -1213,8 +1212,6 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
 
 	if (!(vflags & VIRQ_CAN_RESERVE)) {
 		irqd_clr_can_reserve(irqd);
-		if (vflags & VIRQ_NOMASK_QUIRK)
-			irqd_set_msi_nomask_quirk(irqd);
 
 		/*
 		 * If the interrupt is managed but no CPU is available to
@@ -1275,15 +1272,8 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
 	 * Interrupt can use a reserved vector and will not occupy
 	 * a real device vector until the interrupt is requested.
 	 */
-	if (msi_check_reservation_mode(domain, info, dev)) {
+	if (msi_check_reservation_mode(domain, info, dev))
 		vflags |= VIRQ_CAN_RESERVE;
-		/*
-		 * MSI affinity setting requires a special quirk (X86) when
-		 * reservation mode is active.
-		 */
-		if (info->flags & MSI_FLAG_NOMASK_QUIRK)
-			vflags |= VIRQ_NOMASK_QUIRK;
-	}
 
 	xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
 		if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
-- 
cgit v1.2.3


From c421c12586b3f00fb96b5c9af15c9a051a9090b1 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Sat, 21 Oct 2023 09:49:59 +0800
Subject: bpf: Add more WARN_ON_ONCE checks for mismatched alloc and free

There are two possible mismatched alloc and free cases in BPF memory
allocator:

1) allocate from cache X but free by cache Y with a different unit_size
2) allocate from per-cpu cache but free by kmalloc cache or vice versa

So add more WARN_ON_ONCE checks in free_bulk() and __free_by_rcu() to
spot these mismatched alloc and free early.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231021014959.3563841-1-houtao@huaweicloud.com
---
 kernel/bpf/memalloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5308e386380a..63b909d277d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -340,6 +340,7 @@ static void free_bulk(struct bpf_mem_cache *c)
 	int cnt;
 
 	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
 
 	do {
 		inc_active(c, &flags);
@@ -365,6 +366,9 @@ static void __free_by_rcu(struct rcu_head *head)
 	struct bpf_mem_cache *tgt = c->tgt;
 	struct llist_node *llnode;
 
+	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);
+
 	llnode = llist_del_all(&c->waiting_for_gp);
 	if (!llnode)
 		goto out;
-- 
cgit v1.2.3


From 5e7afb2eb7b2a7c81e9f608cbdf74a07606fd1b5 Mon Sep 17 00:00:00 2001
From: Herve Codina <herve.codina@bootlin.com>
Date: Tue, 24 Oct 2023 17:03:35 +0200
Subject: genirq/generic_chip: Make irq_remove_generic_chip() irqdomain aware

irq_remove_generic_chip() calculates the Linux interrupt number for removing the
handler and interrupt chip based on gc::irq_base as a linear function of
the bit positions of set bits in the @msk argument.

When the generic chip is present in an irq domain, i.e. created with a call
to irq_alloc_domain_generic_chips(), gc::irq_base contains not the base
Linux interrupt number.  It contains the base hardware interrupt for this
chip. It is set to 0 for the first chip in the domain, 0 + N for the next
chip, where $N is the number of hardware interrupts per chip.

That means the Linux interrupt number cannot be calculated based on
gc::irq_base for irqdomain based chips without a domain map lookup, which
is currently missing.

Rework the code to take the irqdomain case into account and calculate the
Linux interrupt number by a irqdomain lookup of the domain specific
hardware interrupt number.

[ tglx: Massage changelog. Reshuffle the logic and add a proper comment. ]

Fixes: cfefd21e693d ("genirq: Add chip suspend and resume callbacks")
Signed-off-by: Herve Codina <herve.codina@bootlin.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20231024150335.322282-1-herve.codina@bootlin.com
---
 kernel/irq/generic-chip.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 81ecca08caad..d39a40bc542b 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -548,21 +548,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
 void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			     unsigned int clr, unsigned int set)
 {
-	unsigned int i = gc->irq_base;
+	unsigned int i, virq;
 
 	raw_spin_lock(&gc_lock);
 	list_del(&gc->list);
 	raw_spin_unlock(&gc_lock);
 
-	for (; msk; msk >>= 1, i++) {
+	for (i = 0; msk; msk >>= 1, i++) {
 		if (!(msk & 0x01))
 			continue;
 
+		/*
+		 * Interrupt domain based chips store the base hardware
+		 * interrupt number in gc::irq_base. Otherwise gc::irq_base
+		 * contains the base Linux interrupt number.
+		 */
+		if (gc->domain) {
+			virq = irq_find_mapping(gc->domain, gc->irq_base + i);
+			if (!virq)
+				continue;
+		} else {
+			virq = gc->irq_base + i;
+		}
+
 		/* Remove handler first. That will mask the irq line */
-		irq_set_handler(i, NULL);
-		irq_set_chip(i, &no_irq_chip);
-		irq_set_chip_data(i, NULL);
-		irq_modify_status(i, clr, set);
+		irq_set_handler(virq, NULL);
+		irq_set_chip(virq, &no_irq_chip);
+		irq_set_chip_data(virq, NULL);
+		irq_modify_status(virq, clr, set);
 	}
 }
 EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
-- 
cgit v1.2.3


From c73801ae4f22b390228ebf471d55668e824198b6 Mon Sep 17 00:00:00 2001
From: Ben Wolsieffer <ben.wolsieffer@hefring.com>
Date: Thu, 19 Oct 2023 16:45:49 -0400
Subject: futex: Don't include process MM in futex key on no-MMU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On no-MMU, all futexes are treated as private because there is no need
to map a virtual address to physical to match the futex across
processes. This doesn't quite work though, because private futexes
include the current process's mm_struct as part of their key. This makes
it impossible for one process to wake up a shared futex being waited on
in another process.

Fix this bug by excluding the mm_struct from the key. With
a single address space, the futex address is already a unique key.

Fixes: 784bdf3bb694 ("futex: Assume all mappings are private on !MMU systems")
Signed-off-by: Ben Wolsieffer <ben.wolsieffer@hefring.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: André Almeida <andrealmeid@igalia.com>
Link: https://lore.kernel.org/r/20231019204548.1236437-2-ben.wolsieffer@hefring.com
---
 kernel/futex/core.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ade7c731972d..52695c59d041 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -252,7 +252,17 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
 	 *        but access_ok() should be faster than find_vma()
 	 */
 	if (!fshared) {
-		key->private.mm = mm;
+		/*
+		 * On no-MMU, shared futexes are treated as private, therefore
+		 * we must not include the current process in the key. Since
+		 * there is only one address space, the address is a unique key
+		 * on its own.
+		 */
+		if (IS_ENABLED(CONFIG_MMU))
+			key->private.mm = mm;
+		else
+			key->private.mm = NULL;
+
 		key->private.address = address;
 		return 0;
 	}
-- 
cgit v1.2.3


From 7ddc21e317b360c3444de3023bcc83b85fabae2f Mon Sep 17 00:00:00 2001
From: WangJinchao <wangjinchao@xfusion.com>
Date: Mon, 16 Oct 2023 09:15:21 +0800
Subject: padata: Fix refcnt handling in padata_free_shell()

In a high-load arm64 environment, the pcrypt_aead01 test in LTP can lead
to system UAF (Use-After-Free) issues. Due to the lengthy analysis of
the pcrypt_aead01 function call, I'll describe the problem scenario
using a simplified model:

Suppose there's a user of padata named `user_function` that adheres to
the padata requirement of calling `padata_free_shell` after `serial()`
has been invoked, as demonstrated in the following code:

```c
struct request {
    struct padata_priv padata;
    struct completion *done;
};

void parallel(struct padata_priv *padata) {
    do_something();
}

void serial(struct padata_priv *padata) {
    struct request *request = container_of(padata,
    				struct request,
				padata);
    complete(request->done);
}

void user_function() {
    DECLARE_COMPLETION(done)
    padata->parallel = parallel;
    padata->serial = serial;
    padata_do_parallel();
    wait_for_completion(&done);
    padata_free_shell();
}
```

In the corresponding padata.c file, there's the following code:

```c
static void padata_serial_worker(struct work_struct *serial_work) {
    ...
    cnt = 0;

    while (!list_empty(&local_list)) {
        ...
        padata->serial(padata);
        cnt++;
    }

    local_bh_enable();

    if (refcount_sub_and_test(cnt, &pd->refcnt))
        padata_free_pd(pd);
}
```

Because of the high system load and the accumulation of unexecuted
softirq at this moment, `local_bh_enable()` in padata takes longer
to execute than usual. Subsequently, when accessing `pd->refcnt`,
`pd` has already been released by `padata_free_shell()`, resulting
in a UAF issue with `pd->refcnt`.

The fix is straightforward: add `refcount_dec_and_test` before calling
`padata_free_pd` in `padata_free_shell`.

Fixes: 07928d9bfc81 ("padata: Remove broken queue flushing")

Signed-off-by: WangJinchao <wangjinchao@xfusion.com>
Acked-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Acked-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 81c8183f3176..179fb1518070 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1102,12 +1102,16 @@ EXPORT_SYMBOL(padata_alloc_shell);
  */
 void padata_free_shell(struct padata_shell *ps)
 {
+	struct parallel_data *pd;
+
 	if (!ps)
 		return;
 
 	mutex_lock(&ps->pinst->lock);
 	list_del(&ps->list);
-	padata_free_pd(rcu_dereference_protected(ps->pd, 1));
+	pd = rcu_dereference_protected(ps->pd, 1);
+	if (refcount_dec_and_test(&pd->refcnt))
+		padata_free_pd(pd);
 	mutex_unlock(&ps->pinst->lock);
 
 	kfree(ps);
-- 
cgit v1.2.3


From 446b1e0b7b39e2bf2187c58ba2a1cc60fb01de8b Mon Sep 17 00:00:00 2001
From: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Date: Sun, 22 Oct 2023 19:22:07 +0100
Subject: module: enable automatic module signing with FIPS 202 SHA-3

Add Kconfig options to use SHA-3 for kernel module signing. 256 size
for RSA only, and higher sizes for RSA and NIST P-384.

Signed-off-by: Dimitri John Ledkov <dimitri.ledkov@canonical.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 certs/Kconfig         |  2 +-
 kernel/module/Kconfig | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/certs/Kconfig b/certs/Kconfig
index 84582de66b7d..69d192a32dda 100644
--- a/certs/Kconfig
+++ b/certs/Kconfig
@@ -30,7 +30,7 @@ config MODULE_SIG_KEY_TYPE_RSA
 config MODULE_SIG_KEY_TYPE_ECDSA
 	bool "ECDSA"
 	select CRYPTO_ECDSA
-	depends on MODULE_SIG_SHA384 || MODULE_SIG_SHA512
+	depends on !(MODULE_SIG_SHA256 || MODULE_SIG_SHA3_256)
 	help
 	 Use an elliptic curve key (NIST P384) for module signing. Use
 	 a strong hash of same or higher bit length, i.e. sha384 or
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 9d7d45525fc4..0ea1b2970a23 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -248,6 +248,18 @@ config MODULE_SIG_SHA512
 	bool "Sign modules with SHA-512"
 	select CRYPTO_SHA512
 
+config MODULE_SIG_SHA3_256
+	bool "Sign modules with SHA3-256"
+	select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_384
+	bool "Sign modules with SHA3-384"
+	select CRYPTO_SHA3
+
+config MODULE_SIG_SHA3_512
+	bool "Sign modules with SHA3-512"
+	select CRYPTO_SHA3
+
 endchoice
 
 config MODULE_SIG_HASH
@@ -256,6 +268,9 @@ config MODULE_SIG_HASH
 	default "sha256" if MODULE_SIG_SHA256
 	default "sha384" if MODULE_SIG_SHA384
 	default "sha512" if MODULE_SIG_SHA512
+	default "sha3-256" if MODULE_SIG_SHA3_256
+	default "sha3-384" if MODULE_SIG_SHA3_384
+	default "sha3-512" if MODULE_SIG_SHA3_512
 
 choice
 	prompt "Module compression mode"
-- 
cgit v1.2.3


From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 25 Oct 2023 13:16:19 -0700
Subject: perf: Add branch stack counters

Currently, the additional information of a branch entry is stored in a
u64 space. With more and more information added, the space is running
out. For example, the information of occurrences of events will be added
for each branch.

Two places were suggested to append the counters.
https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/
One place is right after the flags of each branch entry. It changes the
existing struct perf_branch_entry. The later ARCH specific
implementation has to be really careful to consistently pick
the right struct.
The other place is right after the entire struct perf_branch_stack.
The disadvantage is that the pointer of the extra space has to be
recorded. The common interface perf_sample_save_brstack() has to be
updated.

The latter is much straightforward, and should be easily understood and
maintained. It is implemented in the patch.

Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate
the event which is recorded in the branch info.

The "u64 counters" may store the occurrences of several events. The
information regarding the number of events/counters and the width of
each counter should be exposed via sysfs as a reference for the perf
tool. Define the branch_counter_nr and branch_counter_width ABI here.
The support will be implemented later in the Intel-specific patch.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com
---
 .../ABI/testing/sysfs-bus-event_source-devices-caps     |  6 ++++++
 arch/powerpc/perf/core-book3s.c                         |  2 +-
 arch/x86/events/amd/core.c                              |  2 +-
 arch/x86/events/core.c                                  |  2 +-
 arch/x86/events/intel/core.c                            |  2 +-
 arch/x86/events/intel/ds.c                              |  4 ++--
 include/linux/perf_event.h                              | 17 ++++++++++++++++-
 include/uapi/linux/perf_event.h                         | 10 ++++++++++
 kernel/events/core.c                                    |  8 ++++++++
 9 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
index 8757dcf41c08..a5f506f7d481 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-caps
@@ -16,3 +16,9 @@ Description:
 		Example output in powerpc:
 		grep . /sys/bus/event_source/devices/cpu/caps/*
 		/sys/bus/event_source/devices/cpu/caps/pmu_name:POWER9
+
+		The "branch_counter_nr" in the supported platform exposes the
+		maximum number of counters which can be shown in the u64 counters
+		of PERF_SAMPLE_BRANCH_COUNTERS, while the "branch_counter_width"
+		exposes the width of each counter. Both of them can be used by
+		the perf tool to parse the logged counters in each branch.
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 8c1f7def596e..3c14596bbfaf 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2313,7 +2313,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 			struct cpu_hw_events *cpuhw;
 			cpuhw = this_cpu_ptr(&cpu_hw_events);
 			power_pmu_bhrb_read(event, cpuhw);
-			perf_sample_save_brstack(&data, event, &cpuhw->bhrb_stack);
+			perf_sample_save_brstack(&data, event, &cpuhw->bhrb_stack, NULL);
 		}
 
 		if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e24976593a29..4ee6390b45c9 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -940,7 +940,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
 			continue;
 
 		if (has_branch_stack(event))
-			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 40ad1425ffa2..40c9af124128 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1702,7 +1702,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
 		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (has_branch_stack(event))
-			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a08f794a0e79..41a164764a84 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3047,7 +3047,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 		perf_sample_data_init(&data, 0, event->hw.last_period);
 
 		if (has_branch_stack(event))
-			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
 
 		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index bf97ab904d40..cb3f329f8fa4 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1755,7 +1755,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
 		setup_pebs_time(event, data, pebs->tsc);
 
 	if (has_branch_stack(event))
-		perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+		perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
 }
 
 static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 
 		if (has_branch_stack(event)) {
 			intel_pmu_store_pebs_lbrs(lbr);
-			perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+			perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
 		}
 	}
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0367d748fae0..7897ef066027 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1139,6 +1139,10 @@ static inline bool branch_sample_priv(const struct perf_event *event)
 	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE;
 }
 
+static inline bool branch_sample_counters(const struct perf_event *event)
+{
+	return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS;
+}
 
 struct perf_sample_data {
 	/*
@@ -1173,6 +1177,7 @@ struct perf_sample_data {
 	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 	struct perf_branch_stack	*br_stack;
+	u64				*br_stack_cntr;
 	union perf_sample_weight	weight;
 	union  perf_mem_data_src	data_src;
 	u64				txn;
@@ -1250,7 +1255,8 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
 
 static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 					    struct perf_event *event,
-					    struct perf_branch_stack *brs)
+					    struct perf_branch_stack *brs,
+					    u64 *brs_cntr)
 {
 	int size = sizeof(u64); /* nr */
 
@@ -1258,7 +1264,16 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
 		size += sizeof(u64);
 	size += brs->nr * sizeof(struct perf_branch_entry);
 
+	/*
+	 * The extension space for counters is appended after the
+	 * struct perf_branch_stack. It is used to store the occurrences
+	 * of events of each branch.
+	 */
+	if (brs_cntr)
+		size += brs->nr * sizeof(u64);
+
 	data->br_stack = brs;
+	data->br_stack_cntr = brs_cntr;
 	data->dyn_size += size;
 	data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
 }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 39c6a250dd1b..4461f380425b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift {
 
 	PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT	= 18, /* save privilege mode */
 
+	PERF_SAMPLE_BRANCH_COUNTERS_SHIFT	= 19, /* save occurrences of events on a branch */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -235,6 +237,8 @@ enum perf_branch_sample_type {
 
 	PERF_SAMPLE_BRANCH_PRIV_SAVE	= 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT,
 
+	PERF_SAMPLE_BRANCH_COUNTERS	= 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
@@ -982,6 +986,12 @@ enum perf_event_type {
 	 *	{ u64                   nr;
 	 *	  { u64	hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX
 	 *        { u64 from, to, flags } lbr[nr];
+	 *        #
+	 *        # The format of the counters is decided by the
+	 *        # "branch_counter_nr" and "branch_counter_width",
+	 *        # which are defined in the ABI.
+	 *        #
+	 *        { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS
 	 *      } && PERF_SAMPLE_BRANCH_STACK
 	 *
 	 * 	{ u64			abi; # enum perf_sample_regs_abi
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3eb26c2c6e65..d27ffd80ed67 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7341,6 +7341,14 @@ void perf_output_sample(struct perf_output_handle *handle,
 			if (branch_sample_hw_index(event))
 				perf_output_put(handle, data->br_stack->hw_idx);
 			perf_output_copy(handle, data->br_stack->entries, size);
+			/*
+			 * Add the extension space which is appended
+			 * right after the struct perf_branch_stack.
+			 */
+			if (data->br_stack_cntr) {
+				size = data->br_stack->nr * sizeof(u64);
+				perf_output_copy(handle, data->br_stack_cntr, size);
+			}
 		} else {
 			/*
 			 * we always store at least the value of nr
-- 
cgit v1.2.3


From e0f831836cead677fb07d54bd6bf499df35640c2 Mon Sep 17 00:00:00 2001
From: Yujie Liu <yujie.liu@intel.com>
Date: Fri, 27 Oct 2023 12:13:14 +0800
Subject: tracing/kprobes: Fix the description of variable length arguments

Fix the following kernel-doc warnings:

kernel/trace/trace_kprobe.c:1029: warning: Excess function parameter 'args' description in '__kprobe_event_gen_cmd_start'
kernel/trace/trace_kprobe.c:1097: warning: Excess function parameter 'args' description in '__kprobe_event_add_fields'

Refer to the usage of variable length arguments elsewhere in the kernel
code, "@..." is the proper way to express it in the description.

Link: https://lore.kernel.org/all/20231027041315.2613166-1-yujie.liu@intel.com/

Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310190437.paI6LYJF-lkp@intel.com/
Signed-off-by: Yujie Liu <yujie.liu@intel.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_kprobe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a8fef6ab0872..95c5b0668cb7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1007,7 +1007,7 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
  * @name: The name of the kprobe event
  * @loc: The location of the kprobe event
  * @kretprobe: Is this a return probe?
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
  *
  * NOTE: Users normally won't want to call this function directly, but
  * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically
@@ -1080,7 +1080,7 @@ EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start);
 /**
  * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list
  * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
  *
  * NOTE: Users normally won't want to call this function directly, but
  * rather use the kprobe_event_add_fields() wrapper, which
-- 
cgit v1.2.3


From 926fe783c8a64b33997fec405cf1af3e61aed441 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 27 Oct 2023 16:31:26 -0700
Subject: tracing/kprobes: Fix symbol counting logic by looking at modules as
 well

Recent changes to count number of matching symbols when creating
a kprobe event failed to take into account kernel modules. As such, it
breaks kprobes on kernel module symbols, by assuming there is no match.

Fix this my calling module_kallsyms_on_each_symbol() in addition to
kallsyms_on_each_match_symbol() to perform a proper counting.

Link: https://lore.kernel.org/all/20231027233126.2073148-1-andrii@kernel.org/

Cc: Francis Laniel <flaniel@linux.microsoft.com>
Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_kprobe.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 95c5b0668cb7..e834f149695b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused)
 	return 0;
 }
 
+struct sym_count_ctx {
+	unsigned int count;
+	const char *name;
+};
+
+static int count_mod_symbols(void *data, const char *name, unsigned long unused)
+{
+	struct sym_count_ctx *ctx = data;
+
+	if (strcmp(name, ctx->name) == 0)
+		ctx->count++;
+
+	return 0;
+}
+
 static unsigned int number_of_same_symbols(char *func_name)
 {
-	unsigned int count;
+	struct sym_count_ctx ctx = { .count = 0, .name = func_name };
+
+	kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
 
-	count = 0;
-	kallsyms_on_each_match_symbol(count_symbols, func_name, &count);
+	module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
 
-	return count;
+	return ctx.count;
 }
 
 static int __trace_kprobe_create(int argc, const char *argv[])
-- 
cgit v1.2.3


From e017d304c74079c5169265c8ee9ac8abc8079145 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:22 +0200
Subject: PM: hibernate: Convert to bdev_open_by_dev()

Convert hibernation code to use bdev_open_by_dev().

CC: linux-pm@vger.kernel.org
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: "Rafael J. Wysocki" <rafael@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-16-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/power/swap.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 74edbce2320b..095a263da7c4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -222,7 +222,7 @@ int swsusp_swap_in_use(void)
  */
 
 static unsigned short root_swap = 0xffff;
-static struct block_device *hib_resume_bdev;
+static struct bdev_handle *hib_resume_bdev_handle;
 
 struct hib_bio_batch {
 	atomic_t		count;
@@ -276,7 +276,8 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
 	struct bio *bio;
 	int error = 0;
 
-	bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH);
+	bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf,
+			GFP_NOIO | __GFP_HIGH);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
@@ -356,14 +357,14 @@ static int swsusp_swap_check(void)
 		return res;
 	root_swap = res;
 
-	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+	hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
 			BLK_OPEN_WRITE, NULL, NULL);
-	if (IS_ERR(hib_resume_bdev))
-		return PTR_ERR(hib_resume_bdev);
+	if (IS_ERR(hib_resume_bdev_handle))
+		return PTR_ERR(hib_resume_bdev_handle);
 
-	res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
+	res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
 	if (res < 0)
-		blkdev_put(hib_resume_bdev, NULL);
+		bdev_release(hib_resume_bdev_handle);
 
 	return res;
 }
@@ -1522,10 +1523,10 @@ int swsusp_check(bool exclusive)
 	void *holder = exclusive ? &swsusp_holder : NULL;
 	int error;
 
-	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
-					    holder, NULL);
-	if (!IS_ERR(hib_resume_bdev)) {
-		set_blocksize(hib_resume_bdev, PAGE_SIZE);
+	hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device,
+				BLK_OPEN_READ, holder, NULL);
+	if (!IS_ERR(hib_resume_bdev_handle)) {
+		set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
 		error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
 					swsusp_header, NULL);
@@ -1550,11 +1551,11 @@ int swsusp_check(bool exclusive)
 
 put:
 		if (error)
-			blkdev_put(hib_resume_bdev, holder);
+			bdev_release(hib_resume_bdev_handle);
 		else
 			pr_debug("Image signature found, resuming\n");
 	} else {
-		error = PTR_ERR(hib_resume_bdev);
+		error = PTR_ERR(hib_resume_bdev_handle);
 	}
 
 	if (error)
@@ -1570,12 +1571,12 @@ put:
 
 void swsusp_close(bool exclusive)
 {
-	if (IS_ERR(hib_resume_bdev)) {
+	if (IS_ERR(hib_resume_bdev_handle)) {
 		pr_debug("Image device not initialised\n");
 		return;
 	}
 
-	blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL);
+	bdev_release(hib_resume_bdev_handle);
 }
 
 /**
-- 
cgit v1.2.3


From 93745df18e52157778a8a74cb888ac785844a7fe Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Sep 2023 11:34:23 +0200
Subject: PM: hibernate: Drop unused snapshot_test argument

snapshot_test argument is now unused in swsusp_close() and
load_image_and_restore(). Drop it

CC: linux-pm@vger.kernel.org
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: "Rafael J. Wysocki" <rafael@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230927093442.25915-17-jack@suse.cz
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/power/hibernate.c | 14 +++++++-------
 kernel/power/power.h     |  2 +-
 kernel/power/swap.c      |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8d35b9f9aaa3..dee341ae4ace 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -684,7 +684,7 @@ static void power_down(void)
 		cpu_relax();
 }
 
-static int load_image_and_restore(bool snapshot_test)
+static int load_image_and_restore(void)
 {
 	int error;
 	unsigned int flags;
@@ -694,12 +694,12 @@ static int load_image_and_restore(bool snapshot_test)
 	lock_device_hotplug();
 	error = create_basic_memory_bitmaps();
 	if (error) {
-		swsusp_close(snapshot_test);
+		swsusp_close();
 		goto Unlock;
 	}
 
 	error = swsusp_read(&flags);
-	swsusp_close(snapshot_test);
+	swsusp_close();
 	if (!error)
 		error = hibernation_restore(flags & SF_PLATFORM_MODE);
 
@@ -788,7 +788,7 @@ int hibernate(void)
 		pm_pr_dbg("Checking hibernation image\n");
 		error = swsusp_check(false);
 		if (!error)
-			error = load_image_and_restore(false);
+			error = load_image_and_restore();
 	}
 	thaw_processes();
 
@@ -952,7 +952,7 @@ static int software_resume(void)
 	/* The snapshot device should not be opened while we're running */
 	if (!hibernate_acquire()) {
 		error = -EBUSY;
-		swsusp_close(true);
+		swsusp_close();
 		goto Unlock;
 	}
 
@@ -973,7 +973,7 @@ static int software_resume(void)
 		goto Close_Finish;
 	}
 
-	error = load_image_and_restore(true);
+	error = load_image_and_restore();
 	thaw_processes();
  Finish:
 	pm_notifier_call_chain(PM_POST_RESTORE);
@@ -987,7 +987,7 @@ static int software_resume(void)
 	pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
 	return error;
  Close_Finish:
-	swsusp_close(true);
+	swsusp_close();
 	goto Finish;
 }
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index a98f95e309a3..17fd9aaaf084 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -172,7 +172,7 @@ int swsusp_check(bool exclusive);
 extern void swsusp_free(void);
 extern int swsusp_read(unsigned int *flags_p);
 extern int swsusp_write(unsigned int flags);
-void swsusp_close(bool exclusive);
+void swsusp_close(void);
 #ifdef CONFIG_SUSPEND
 extern int swsusp_unmark(void);
 #endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 095a263da7c4..68a5c2f06957 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -444,7 +444,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 err_rel:
 	release_swap_writer(handle);
 err_close:
-	swsusp_close(false);
+	swsusp_close();
 	return ret;
 }
 
@@ -509,7 +509,7 @@ static int swap_writer_finish(struct swap_map_handle *handle,
 	if (error)
 		free_all_swap_pages(root_swap);
 	release_swap_writer(handle);
-	swsusp_close(false);
+	swsusp_close();
 
 	return error;
 }
@@ -1569,7 +1569,7 @@ put:
  * @exclusive: Close the resume device which is exclusively opened.
  */
 
-void swsusp_close(bool exclusive)
+void swsusp_close(void)
 {
 	if (IS_ERR(hib_resume_bdev_handle)) {
 		pr_debug("Image device not initialised\n");
-- 
cgit v1.2.3


From dcc4e5728eeaeda84878ca0018758cff1abfca21 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 27 Oct 2023 08:56:38 -0700
Subject: seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()

Solve two ergonomic issues with struct seq_buf;

1) Too much boilerplate is required to initialize:

	struct seq_buf s;
	char buf[32];

	seq_buf_init(s, buf, sizeof(buf));

Instead, we can build this directly on the stack. Provide
DECLARE_SEQ_BUF() macro to do this:

	DECLARE_SEQ_BUF(s, 32);

2) %NUL termination is fragile and requires 2 steps to get a valid
   C String (and is a layering violation exposing the "internals" of
   seq_buf):

	seq_buf_terminate(s);
	do_something(s->buffer);

Instead, we can just return s->buffer directly after terminating it in
the refactored seq_buf_terminate(), now known as seq_buf_str():

	do_something(seq_buf_str(s));

Link: https://lore.kernel.org/linux-trace-kernel/20231027155634.make.260-kees@kernel.org
Link: https://lore.kernel.org/linux-trace-kernel/20231026194033.it.702-kees@kernel.org/

Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Yun Zhou <yun.zhou@windriver.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/seq_buf.h | 21 +++++++++++++++++----
 kernel/trace/trace.c    | 11 +----------
 lib/seq_buf.c           |  4 +---
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 8483e4b2d0d2..5fb1f12c33f9 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -21,9 +21,18 @@ struct seq_buf {
 	size_t			len;
 };
 
+#define DECLARE_SEQ_BUF(NAME, SIZE)			\
+	char __ ## NAME ## _buffer[SIZE] = "";		\
+	struct seq_buf NAME = {				\
+		.buffer = &__ ## NAME ## _buffer,	\
+		.size = SIZE,				\
+	}
+
 static inline void seq_buf_clear(struct seq_buf *s)
 {
 	s->len = 0;
+	if (s->size)
+		s->buffer[0] = '\0';
 }
 
 static inline void
@@ -69,8 +78,8 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
 }
 
 /**
- * seq_buf_terminate - Make sure buffer is nul terminated
- * @s: the seq_buf descriptor to terminate.
+ * seq_buf_str - get %NUL-terminated C string from seq_buf
+ * @s: the seq_buf handle
  *
  * This makes sure that the buffer in @s is nul terminated and
  * safe to read as a string.
@@ -81,16 +90,20 @@ static inline unsigned int seq_buf_used(struct seq_buf *s)
  *
  * After this function is called, s->buffer is safe to use
  * in string operations.
+ *
+ * Returns @s->buf after making sure it is terminated.
  */
-static inline void seq_buf_terminate(struct seq_buf *s)
+static inline const char *seq_buf_str(struct seq_buf *s)
 {
 	if (WARN_ON(s->size == 0))
-		return;
+		return "";
 
 	if (seq_buf_buffer_left(s))
 		s->buffer[s->len] = 0;
 	else
 		s->buffer[s->size - 1] = 0;
+
+	return s->buffer;
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d629065c2383..2539cfc20a97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3828,15 +3828,6 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
 	return false;
 }
 
-static const char *show_buffer(struct trace_seq *s)
-{
-	struct seq_buf *seq = &s->seq;
-
-	seq_buf_terminate(seq);
-
-	return seq->buffer;
-}
-
 static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
 
 static int test_can_verify_check(const char *fmt, ...)
@@ -3976,7 +3967,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 		 */
 		if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
 			      "fmt: '%s' current_buffer: '%s'",
-			      fmt, show_buffer(&iter->seq))) {
+			      fmt, seq_buf_str(&iter->seq.seq))) {
 			int ret;
 
 			/* Try to safely read the string */
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index b7477aefff53..23518f77ea9c 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -109,9 +109,7 @@ void seq_buf_do_printk(struct seq_buf *s, const char *lvl)
 	if (s->size == 0 || s->len == 0)
 		return;
 
-	seq_buf_terminate(s);
-
-	start = s->buffer;
+	start = seq_buf_str(s);
 	while ((lf = strchr(start, '\n'))) {
 		int len = lf - start + 1;
 
-- 
cgit v1.2.3


From 8b793bcda61f6c3ed4f5b2ded7530ef6749580cb Mon Sep 17 00:00:00 2001
From: Krister Johansen <kjlx@templeofstupid.com>
Date: Fri, 27 Oct 2023 14:46:53 -0700
Subject: watchdog: move softlockup_panic back to early_param

Setting softlockup_panic from do_sysctl_args() causes it to take effect
later in boot.  The lockup detector is enabled before SMP is brought
online, but do_sysctl_args runs afterwards.  If a user wants to set
softlockup_panic on boot and have it trigger should a softlockup occur
during onlining of the non-boot processors, they could do this prior to
commit f117955a2255 ("kernel/watchdog.c: convert {soft/hard}lockup boot
parameters to sysctl aliases").  However, after this commit the value
of softlockup_panic is set too late to be of help for this type of
problem.  Restore the prior behavior.

Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
Cc: stable@vger.kernel.org
Fixes: f117955a2255 ("kernel/watchdog.c: convert {soft/hard}lockup boot parameters to sysctl aliases")
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 fs/proc/proc_sysctl.c | 1 -
 kernel/watchdog.c     | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1c9635dddb70..de484195f49f 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1576,7 +1576,6 @@ static const struct sysctl_alias sysctl_aliases[] = {
 	{"hung_task_panic",			"kernel.hung_task_panic" },
 	{"numa_zonelist_order",			"vm.numa_zonelist_order" },
 	{"softlockup_all_cpu_backtrace",	"kernel.softlockup_all_cpu_backtrace" },
-	{"softlockup_panic",			"kernel.softlockup_panic" },
 	{ }
 };
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d145305d95fe..5cd6d4e26915 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -283,6 +283,13 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static unsigned long soft_lockup_nmi_warn;
 
+static int __init softlockup_panic_setup(char *str)
+{
+	softlockup_panic = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
 static int __init nowatchdog_setup(char *str)
 {
 	watchdog_user_enabled = 0;
-- 
cgit v1.2.3


From 3737df782c740b944912ed93420c57344b1cf864 Mon Sep 17 00:00:00 2001
From: Andrea Righi <andrea.righi@canonical.com>
Date: Wed, 30 Aug 2023 17:58:20 +0200
Subject: module/decompress: use vmalloc() for gzip decompression workspace

Use a similar approach as commit a419beac4a07 ("module/decompress: use
vmalloc() for zstd decompression workspace") and replace kmalloc() with
vmalloc() also for the gzip module decompression workspace.

In this case the workspace is represented by struct inflate_workspace
that can be fairly large for kmalloc() and it can potentially lead to
allocation errors on certain systems:

$ pahole inflate_workspace
struct inflate_workspace {
	struct inflate_state       inflate_state;        /*     0  9544 */
	/* --- cacheline 149 boundary (9536 bytes) was 8 bytes ago --- */
	unsigned char              working_window[32768]; /*  9544 32768 */

	/* size: 42312, cachelines: 662, members: 2 */
	/* last cacheline: 8 bytes */
};

Considering that there is no need to use continuous physical memory,
simply switch to vmalloc() to provide a more reliable in-kernel module
decompression.

Fixes: b1ae6dc41eaa ("module: add in-kernel support for decompressing")
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/decompress.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 87440f714c0c..4156d59be440 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -100,7 +100,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 	s.next_in = buf + gzip_hdr_len;
 	s.avail_in = size - gzip_hdr_len;
 
-	s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
+	s.workspace = vmalloc(zlib_inflate_workspacesize());
 	if (!s.workspace)
 		return -ENOMEM;
 
@@ -138,7 +138,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 out_inflate_end:
 	zlib_inflateEnd(&s);
 out:
-	kfree(s.workspace);
+	vfree(s.workspace);
 	return retval;
 }
 #elif defined(CONFIG_MODULE_COMPRESS_XZ)
-- 
cgit v1.2.3


From fd06da776130ec2611c30272a0868f6a54cdf9d2 Mon Sep 17 00:00:00 2001
From: Zhu Mao <zhumao001@208suo.com>
Date: Wed, 20 Sep 2023 17:13:09 -0700
Subject: module: Fix comment typo

Delete duplicated word in comment.

Signed-off-by: Zhu Mao <zhumao001@208suo.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/stats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module/stats.c b/kernel/module/stats.c
index 6ab2c94d6bc3..3ba0e98b3c91 100644
--- a/kernel/module/stats.c
+++ b/kernel/module/stats.c
@@ -126,7 +126,7 @@ static LIST_HEAD(dup_failed_modules);
  *     These typically should not happen unless your system is under memory
  *     pressure.
  *   * invalid_becoming_bytes: total number of bytes allocated and freed used
- *     used to read the kernel module userspace wants us to read before we
+ *     to read the kernel module userspace wants us to read before we
  *     promote it to be processed to be added to our @modules linked list. These
  *     failures can happen if we had a check in between a successful kernel_read_file_from_fd()
  *     call and right before we allocate the our private memory for the module
-- 
cgit v1.2.3


From ea0b0bcef4917a2640ecc100c768b8e785784834 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:52:53 -0700
Subject: module: Annotate struct module_notes_attrs with __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct module_notes_attrs.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: linux-modules@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index c921bf044050..d964167c6658 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -143,7 +143,7 @@ static void remove_sect_attrs(struct module *mod)
 struct module_notes_attrs {
 	struct kobject *dir;
 	unsigned int notes;
-	struct bin_attribute attrs[];
+	struct bin_attribute attrs[] __counted_by(notes);
 };
 
 static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
-- 
cgit v1.2.3


From 85d68222ddc5f4522e456d97d201166acb50f716 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 31 Oct 2023 09:53:08 +0100
Subject: rcu: Break rcu_node_0 --> &rq->__lock order

Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
do_set_cpus_allowed()") added a kfree() call to free any user
provided affinity mask, if present. It was changed later to use
kfree_rcu() in commit 9a5418bc48ba ("sched/core: Use kfree_rcu()
in do_set_cpus_allowed()") to avoid a circular locking dependency
problem.

It turns out that even kfree_rcu() isn't safe for avoiding
circular locking problem. As reported by kernel test robot,
the following circular locking dependency now exists:

  &rdp->nocb_lock --> rcu_node_0 --> &rq->__lock

Solve this by breaking the rcu_node_0 --> &rq->__lock chain by moving
the resched_cpu() out from under rcu_node lock.

[peterz: heavily borrowed from Waiman's Changelog]
[paulmck: applied Z qiang feedback]

Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
Reported-by: kernel test robot <oliver.sang@intel.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/oe-lkp/202310302207.a25f1a30-oliver.sang@intel.com
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tree.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 700524726079..77e7a0dc722a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -755,14 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
 }
 
 /*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through an dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
 	unsigned long jtsq;
+	int ret = 0;
 	struct rcu_node *rnp = rdp->mynode;
 
 	/*
@@ -848,8 +853,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	    (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
 	     rcu_state.cbovld)) {
 		WRITE_ONCE(rdp->rcu_urgent_qs, true);
-		resched_cpu(rdp->cpu);
 		WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+		ret = -1;
 	}
 
 	/*
@@ -862,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	if (time_after(jiffies, rcu_state.jiffies_resched)) {
 		if (time_after(jiffies,
 			       READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
-			resched_cpu(rdp->cpu);
 			WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+			ret = -1;
 		}
 		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
 		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
@@ -892,7 +897,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 		}
 	}
 
-	return 0;
+	return ret;
 }
 
 /* Trace-event wrapper function for trace_rcu_future_grace_period.  */
@@ -2271,15 +2276,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 {
 	int cpu;
 	unsigned long flags;
-	unsigned long mask;
-	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
 	rcu_state.cbovld = rcu_state.cbovldnext;
 	rcu_state.cbovldnext = false;
 	rcu_for_each_leaf_node(rnp) {
+		unsigned long mask = 0;
+		unsigned long rsmask = 0;
+
 		cond_resched_tasks_rcu_qs();
-		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		rcu_state.cbovldnext |= !!rnp->cbovldmask;
 		if (rnp->qsmask == 0) {
@@ -2297,11 +2302,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 			continue;
 		}
 		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+			struct rcu_data *rdp;
+			int ret;
+
 			rdp = per_cpu_ptr(&rcu_data, cpu);
-			if (f(rdp)) {
+			ret = f(rdp);
+			if (ret > 0) {
 				mask |= rdp->grpmask;
 				rcu_disable_urgency_upon_qs(rdp);
 			}
+			if (ret < 0)
+				rsmask |= rdp->grpmask;
 		}
 		if (mask != 0) {
 			/* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2310,6 +2321,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 			/* Nothing to do here, so just drop the lock. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		}
+
+		for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+			resched_cpu(cpu);
 	}
 }
 
-- 
cgit v1.2.3


From 2be4686d866ad5896f2bb94d82fe892197aea9c7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2023 16:40:47 +0200
Subject: rcu: Introduce rcu_cpu_online()

Export the RCU point of view as to when a CPU is considered offline
(ie: when does RCU consider that a CPU is sufficiently down in the
hotplug process to not feature any possible read side).

This will be used by RCU-tasks whose vision of an offline CPU should
reasonably match the one of RCU core.

Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/rcu.h  | 2 ++
 kernel/rcu/tree.c | 7 +++++++
 2 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d866eaa4cc8..b531c33e9545 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -500,6 +500,7 @@ static inline void rcu_expedite_gp(void) { }
 static inline void rcu_unexpedite_gp(void) { }
 static inline void rcu_async_hurry(void) { }
 static inline void rcu_async_relax(void) { }
+static inline bool rcu_cpu_online(int cpu) { return true; }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_gp_is_normal(void);     /* Internal RCU use. */
 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
@@ -509,6 +510,7 @@ void rcu_unexpedite_gp(void);
 void rcu_async_hurry(void);
 void rcu_async_relax(void);
 void rcupdate_announce_bootup_oddness(void);
+bool rcu_cpu_online(int cpu);
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void show_rcu_tasks_gp_kthreads(void);
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 77e7a0dc722a..c3359f4c8830 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4216,6 +4216,13 @@ static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
 	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
 }
 
+bool rcu_cpu_online(int cpu)
+{
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+	return rcu_rdp_cpu_online(rdp);
+}
+
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 
 /*
-- 
cgit v1.2.3


From 9715ed501b585d47444865071674c961c0cc0020 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2023 16:40:48 +0200
Subject: rcu/tasks: Handle new PF_IDLE semantics

The commit:

	cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")

has changed the semantics of what is to be considered an idle task in
such a way that CPU boot code preceding the actual idle loop is excluded
from it.

This has however introduced new potential RCU-tasks stalls when either:

1) Grace period is started before init/0 had a chance to set PF_IDLE,
   keeping it stuck in the holdout list until idle ever schedules.

2) Grace period is started when some possible CPUs have never been
   online, keeping their idle tasks stuck in the holdout list until the
   CPU ever boots up.

3) Similar to 1) but with secondary CPUs: Grace period is started
   concurrently with secondary CPU booting, putting its idle task in
   the holdout list because PF_IDLE isn't yet observed on it. It stays
   then stuck in the holdout list until that CPU ever schedules. The
   effect is mitigated here by the hotplug AP thread that must run to
   bring the CPU up.

Fix this with handling the new semantics of PF_IDLE, keeping in mind
that it may or may not be set on an idle task. Take advantage of that to
strengthen the coverage of an RCU-tasks quiescent state within an idle
task, excluding the CPU boot code from it. Only the code running within
the idle loop is now a quiescent state, along with offline CPUs.

Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")
Suggested-by: Joel Fernandes <joel@joelfernandes.org>
Suggested-by: Paul E . McKenney" <paulmck@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tasks.h | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 1fa631168594..1087f63b75b9 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -895,10 +895,36 @@ static void rcu_tasks_pregp_step(struct list_head *hop)
 	synchronize_rcu();
 }
 
+/* Check for quiescent states since the pregp's synchronize_rcu() */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+	int cpu;
+
+	/* Has the task been seen voluntarily sleeping? */
+	if (!READ_ONCE(t->on_rq))
+		return false;
+
+	/*
+	 * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+	 * quiescent states. But CPU boot code performed by the idle task
+	 * isn't a quiescent state.
+	 */
+	if (is_idle_task(t))
+		return false;
+
+	cpu = task_cpu(t);
+
+	/* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+	if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+		return false;
+
+	return true;
+}
+
 /* Per-task initial processing. */
 static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
 {
-	if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) {
+	if (t != current && rcu_tasks_is_holdout(t)) {
 		get_task_struct(t);
 		t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
 		WRITE_ONCE(t->rcu_tasks_holdout, true);
@@ -947,7 +973,7 @@ static void check_holdout_task(struct task_struct *t,
 
 	if (!READ_ONCE(t->rcu_tasks_holdout) ||
 	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
-	    !READ_ONCE(t->on_rq) ||
+	    !rcu_tasks_is_holdout(t) ||
 	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
 	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
 		WRITE_ONCE(t->rcu_tasks_holdout, false);
-- 
cgit v1.2.3


From a80712b9cc7e57830260ec5e1feb9cdb59e1da2f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2023 16:40:49 +0200
Subject: rcu/tasks-trace: Handle new PF_IDLE semantics

The commit:

	cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")

has changed the semantics of what is to be considered an idle task in
such a way that the idle task of an offline CPU may not carry the
PF_IDLE flag anymore.

However RCU-tasks-trace tests the opposite assertion, still assuming
that idle tasks carry the PF_IDLE flag during their whole lifecycle.

Remove this assumption to avoid spurious warnings but keep the initial
test verifying that the idle task is the current task on any offline
CPU.

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Fixes: cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup")
Suggested-by: Joel Fernandes <joel@joelfernandes.org>
Suggested-by: Paul E . McKenney" <paulmck@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 1087f63b75b9..f54d5782eca0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1551,7 +1551,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
 	} else {
 		// The task is not running, so C-language access is safe.
 		nesting = t->trc_reader_nesting;
-		WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+		WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
 		if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
 			n_heavy_reader_ofl_updates++;
 	}
-- 
cgit v1.2.3


From bb32500fb9b78215e4ef6ee8b4345c5f5d7eafb4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 31 Oct 2023 12:24:53 -0400
Subject: tracing: Have trace_event_file have ref counters

The following can crash the kernel:

 # cd /sys/kernel/tracing
 # echo 'p:sched schedule' > kprobe_events
 # exec 5>>events/kprobes/sched/enable
 # > kprobe_events
 # exec 5>&-

The above commands:

 1. Change directory to the tracefs directory
 2. Create a kprobe event (doesn't matter what one)
 3. Open bash file descriptor 5 on the enable file of the kprobe event
 4. Delete the kprobe event (removes the files too)
 5. Close the bash file descriptor 5

The above causes a crash!

 BUG: kernel NULL pointer dereference, address: 0000000000000028
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP PTI
 CPU: 6 PID: 877 Comm: bash Not tainted 6.5.0-rc4-test-00008-g2c6b6b1029d4-dirty #186
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
 RIP: 0010:tracing_release_file_tr+0xc/0x50

What happens here is that the kprobe event creates a trace_event_file
"file" descriptor that represents the file in tracefs to the event. It
maintains state of the event (is it enabled for the given instance?).
Opening the "enable" file gets a reference to the event "file" descriptor
via the open file descriptor. When the kprobe event is deleted, the file is
also deleted from the tracefs system which also frees the event "file"
descriptor.

But as the tracefs file is still opened by user space, it will not be
totally removed until the final dput() is called on it. But this is not
true with the event "file" descriptor that is already freed. If the user
does a write to or simply closes the file descriptor it will reference the
event "file" descriptor that was just freed, causing a use-after-free bug.

To solve this, add a ref count to the event "file" descriptor as well as a
new flag called "FREED". The "file" will not be freed until the last
reference is released. But the FREE flag will be set when the event is
removed to prevent any more modifications to that event from happening,
even if there's still a reference to the event "file" descriptor.

Link: https://lore.kernel.org/linux-trace-kernel/20231031000031.1e705592@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20231031122453.7a48b923@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Fixes: f5ca233e2e66d ("tracing: Increase trace array ref count on enable and filter files")
Reported-by: Beau Belgrave <beaub@linux.microsoft.com>
Tested-by: Beau Belgrave <beaub@linux.microsoft.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h       |  4 ++++
 kernel/trace/trace.c               | 15 +++++++++++++++
 kernel/trace/trace.h               |  3 +++
 kernel/trace/trace_events.c        | 31 +++++++++++++++++++++++++++----
 kernel/trace/trace_events_filter.c |  3 +++
 5 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 12207dc6722d..696f8dc4aa53 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -492,6 +492,7 @@ enum {
 	EVENT_FILE_FL_TRIGGER_COND_BIT,
 	EVENT_FILE_FL_PID_FILTER_BIT,
 	EVENT_FILE_FL_WAS_ENABLED_BIT,
+	EVENT_FILE_FL_FREED_BIT,
 };
 
 extern struct trace_event_file *trace_get_event_file(const char *instance,
@@ -630,6 +631,7 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);
  *  TRIGGER_COND  - When set, one or more triggers has an associated filter
  *  PID_FILTER    - When set, the event is filtered based on pid
  *  WAS_ENABLED   - Set when enabled to know to clear trace on module removal
+ *  FREED         - File descriptor is freed, all fields should be considered invalid
  */
 enum {
 	EVENT_FILE_FL_ENABLED		= (1 << EVENT_FILE_FL_ENABLED_BIT),
@@ -643,6 +645,7 @@ enum {
 	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
 	EVENT_FILE_FL_PID_FILTER	= (1 << EVENT_FILE_FL_PID_FILTER_BIT),
 	EVENT_FILE_FL_WAS_ENABLED	= (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
+	EVENT_FILE_FL_FREED		= (1 << EVENT_FILE_FL_FREED_BIT),
 };
 
 struct trace_event_file {
@@ -671,6 +674,7 @@ struct trace_event_file {
 	 * caching and such. Which is mostly OK ;-)
 	 */
 	unsigned long		flags;
+	atomic_t		ref;	/* ref count for opened files */
 	atomic_t		sm_ref;	/* soft-mode reference counter */
 	atomic_t		tm_ref;	/* trigger-mode reference counter */
 };
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2539cfc20a97..9aebf904ff97 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4978,6 +4978,20 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
+	mutex_lock(&event_mutex);
+
+	/* Fail if the file is marked for removal */
+	if (file->flags & EVENT_FILE_FL_FREED) {
+		trace_array_put(file->tr);
+		ret = -ENODEV;
+	} else {
+		event_file_get(file);
+	}
+
+	mutex_unlock(&event_mutex);
+	if (ret)
+		return ret;
+
 	filp->private_data = inode->i_private;
 
 	return 0;
@@ -4988,6 +5002,7 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
 	struct trace_event_file *file = inode->i_private;
 
 	trace_array_put(file->tr);
+	event_file_put(file);
 
 	return 0;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0e1405abf4f7..b7f4ea25a194 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1669,6 +1669,9 @@ extern void event_trigger_unregister(struct event_command *cmd_ops,
 				     char *glob,
 				     struct event_trigger_data *trigger_data);
 
+extern void event_file_get(struct trace_event_file *file);
+extern void event_file_put(struct trace_event_file *file);
+
 /**
  * struct event_trigger_ops - callbacks for trace event triggers
  *
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f9e3e24d8796..f29e815ca5b2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -990,13 +990,35 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 	}
 }
 
+void event_file_get(struct trace_event_file *file)
+{
+	atomic_inc(&file->ref);
+}
+
+void event_file_put(struct trace_event_file *file)
+{
+	if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+		if (file->flags & EVENT_FILE_FL_FREED)
+			kmem_cache_free(file_cachep, file);
+		return;
+	}
+
+	if (atomic_dec_and_test(&file->ref)) {
+		/* Count should only go to zero when it is freed */
+		if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
+			return;
+		kmem_cache_free(file_cachep, file);
+	}
+}
+
 static void remove_event_file_dir(struct trace_event_file *file)
 {
 	eventfs_remove_dir(file->ei);
 	list_del(&file->list);
 	remove_subsystem(file->system);
 	free_event_filter(file->filter);
-	kmem_cache_free(file_cachep, file);
+	file->flags |= EVENT_FILE_FL_FREED;
+	event_file_put(file);
 }
 
 /*
@@ -1369,7 +1391,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		flags = file->flags;
 	mutex_unlock(&event_mutex);
 
-	if (!file)
+	if (!file || flags & EVENT_FILE_FL_FREED)
 		return -ENODEV;
 
 	if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1403,7 +1425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		ret = -ENODEV;
 		mutex_lock(&event_mutex);
 		file = event_file_data(filp);
-		if (likely(file)) {
+		if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
 			ret = tracing_update_buffers(file->tr);
 			if (ret < 0) {
 				mutex_unlock(&event_mutex);
@@ -1683,7 +1705,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	mutex_lock(&event_mutex);
 	file = event_file_data(filp);
-	if (file)
+	if (file && !(file->flags & EVENT_FILE_FL_FREED))
 		print_event_filter(file, s);
 	mutex_unlock(&event_mutex);
 
@@ -2902,6 +2924,7 @@ trace_create_new_event(struct trace_event_call *call,
 	atomic_set(&file->tm_ref, 0);
 	INIT_LIST_HEAD(&file->triggers);
 	list_add(&file->list, &tr->events);
+	event_file_get(file);
 
 	return file;
 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 33264e510d16..0c611b281a5b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2349,6 +2349,9 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
 	struct event_filter *filter = NULL;
 	int err;
 
+	if (file->flags & EVENT_FILE_FL_FREED)
+		return -ENODEV;
+
 	if (!strcmp(strstrip(filter_string), "0")) {
 		filter_disable(file);
 		filter = event_filter(file);
-- 
cgit v1.2.3


From 4f7969bcd6d33042d62e249b41b5578161e4c868 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 31 Oct 2023 15:10:33 -0400
Subject: tracing: Have the user copy of synthetic event address use correct
 context

A synthetic event is created by the synthetic event interface that can
read both user or kernel address memory. In reality, it reads any
arbitrary memory location from within the kernel. If the address space is
in USER (where CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE is set) then
it uses strncpy_from_user_nofault() to copy strings otherwise it uses
strncpy_from_kernel_nofault().

But since both functions use the same variable there's no annotation to
what that variable is (ie. __user). This makes sparse complain.

Quiet sparse by typecasting the strncpy_from_user_nofault() variable to
a __user pointer.

Link: https://lore.kernel.org/linux-trace-kernel/20231031151033.73c42e23@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Fixes: 0934ae9977c2 ("tracing: Fix reading strings from synthetic events");
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311010013.fm8WTxa5-lkp@intel.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 14cb275a0bab..846e02c0fb59 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -452,7 +452,7 @@ static unsigned int trace_string(struct synth_trace_event *entry,
 
 #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 		if ((unsigned long)str_val < TASK_SIZE)
-			ret = strncpy_from_user_nofault(str_field, str_val, STR_VAR_LEN_MAX);
+			ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX);
 		else
 #endif
 			ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX);
-- 
cgit v1.2.3


From 05670f81d1287c40ec861186e4c4e3401013e7fb Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matttbe@kernel.org>
Date: Wed, 1 Nov 2023 19:16:01 +0100
Subject: bpf: fix compilation error without CGROUPS

Our MPTCP CI complained [1] -- and KBuild too -- that it was no longer
possible to build the kernel without CONFIG_CGROUPS:

  kernel/bpf/task_iter.c: In function 'bpf_iter_css_task_new':
  kernel/bpf/task_iter.c:919:14: error: 'CSS_TASK_ITER_PROCS' undeclared (first use in this function)
    919 |         case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
        |              ^~~~~~~~~~~~~~~~~~~
  kernel/bpf/task_iter.c:919:14: note: each undeclared identifier is reported only once for each function it appears in
  kernel/bpf/task_iter.c:919:36: error: 'CSS_TASK_ITER_THREADED' undeclared (first use in this function)
    919 |         case CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED:
        |                                    ^~~~~~~~~~~~~~~~~~~~~~
  kernel/bpf/task_iter.c:927:60: error: invalid application of 'sizeof' to incomplete type 'struct css_task_iter'
    927 |         kit->css_it = bpf_mem_alloc(&bpf_global_ma, sizeof(struct css_task_iter));
        |                                                            ^~~~~~
  kernel/bpf/task_iter.c:930:9: error: implicit declaration of function 'css_task_iter_start'; did you mean 'task_seq_start'? [-Werror=implicit-function-declaration]
    930 |         css_task_iter_start(css, flags, kit->css_it);
        |         ^~~~~~~~~~~~~~~~~~~
        |         task_seq_start
  kernel/bpf/task_iter.c: In function 'bpf_iter_css_task_next':
  kernel/bpf/task_iter.c:940:16: error: implicit declaration of function 'css_task_iter_next'; did you mean 'class_dev_iter_next'? [-Werror=implicit-function-declaration]
    940 |         return css_task_iter_next(kit->css_it);
        |                ^~~~~~~~~~~~~~~~~~
        |                class_dev_iter_next
  kernel/bpf/task_iter.c:940:16: error: returning 'int' from a function with return type 'struct task_struct *' makes pointer from integer without a cast [-Werror=int-conversion]
    940 |         return css_task_iter_next(kit->css_it);
        |                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  kernel/bpf/task_iter.c: In function 'bpf_iter_css_task_destroy':
  kernel/bpf/task_iter.c:949:9: error: implicit declaration of function 'css_task_iter_end' [-Werror=implicit-function-declaration]
    949 |         css_task_iter_end(kit->css_it);
        |         ^~~~~~~~~~~~~~~~~

This patch simply surrounds with a #ifdef the new code requiring CGroups
support. It seems enough for the compiler and this is similar to
bpf_iter_css_{new,next,destroy}() functions where no other #ifdef have
been added in kernel/bpf/helpers.c and in the selftests.

Fixes: 9c66dc94b62a ("bpf: Introduce css_task open-coded iterator kfuncs")
Link: https://github.com/multipath-tcp/mptcp_net-next/actions/runs/6665206927
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310260528.aHWgVFqq-lkp@intel.com/
Signed-off-by: Matthieu Baerts <matttbe@kernel.org>
[ added missing ifdefs for BTF_ID cgroup definitions ]
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231101181601.1493271-1-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c   | 8 +++++---
 kernel/bpf/task_iter.c | 4 ++++
 kernel/bpf/verifier.c  | 8 ++++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e46ac288a108..95449ea7cc1b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2564,15 +2564,17 @@ BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
+#ifdef CONFIG_CGROUPS
 BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
-BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
+#endif
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
 BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 654601dd6b49..8967d1ac7551 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -892,6 +892,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
 
 __diag_pop();
 
+#ifdef CONFIG_CGROUPS
+
 struct bpf_iter_css_task {
 	__u64 __opaque[1];
 } __attribute__((aligned(8)));
@@ -950,6 +952,8 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
 
 __diag_pop();
 
+#endif /* CONFIG_CGROUPS */
+
 struct bpf_iter_task {
 	__u64 __opaque[3];
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 857d76694517..8c91e0e54e27 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5388,7 +5388,9 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
 /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
 BTF_SET_START(rcu_protected_types)
 BTF_ID(struct, prog_test_ref_kfunc)
+#ifdef CONFIG_CGROUPS
 BTF_ID(struct, cgroup)
+#endif
 BTF_ID(struct, bpf_cpumask)
 BTF_ID(struct, task_struct)
 BTF_SET_END(rcu_protected_types)
@@ -10835,7 +10837,9 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+#ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
+#endif
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10861,7 +10865,11 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+#ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
+#else
+BTF_ID_UNUSED
+#endif
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
-- 
cgit v1.2.3


From 811c363645b33e6e22658634329e95f383dfc705 Mon Sep 17 00:00:00 2001
From: Hao Sun <sunhao.th@gmail.com>
Date: Wed, 1 Nov 2023 13:33:51 +0100
Subject: bpf: Fix check_stack_write_fixed_off() to correctly spill imm

In check_stack_write_fixed_off(), imm value is cast to u32 before being
spilled to the stack. Therefore, the sign information is lost, and the
range information is incorrect when load from the stack again.

For the following prog:
0: r2 = r10
1: *(u64*)(r2 -40) = -44
2: r0 = *(u64*)(r2 - 40)
3: if r0 s<= 0xa goto +2
4: r0 = 1
5: exit
6: r0  = 0
7: exit

The verifier gives:
func#0 @0
0: R1=ctx(off=0,imm=0) R10=fp0
0: (bf) r2 = r10                      ; R2_w=fp0 R10=fp0
1: (7a) *(u64 *)(r2 -40) = -44        ; R2_w=fp0 fp-40_w=4294967252
2: (79) r0 = *(u64 *)(r2 -40)         ; R0_w=4294967252 R2_w=fp0
fp-40_w=4294967252
3: (c5) if r0 s< 0xa goto pc+2
mark_precise: frame0: last_idx 3 first_idx 0 subseq_idx -1
mark_precise: frame0: regs=r0 stack= before 2: (79) r0 = *(u64 *)(r2 -40)
3: R0_w=4294967252
4: (b7) r0 = 1                        ; R0_w=1
5: (95) exit
verification time 7971 usec
stack depth 40
processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0
peak_states 0 mark_read 0

So remove the incorrect cast, since imm field is declared as s32, and
__mark_reg_known() takes u64, so imm would be correctly sign extended
by compiler.

Fixes: ecdf985d7615 ("bpf: track immediate values written to stack by BPF_ST instruction")
Cc: stable@vger.kernel.org
Signed-off-by: Hao Sun <sunhao.th@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231101-fix-check-stack-write-v3-1-f05c2b1473d5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c91e0e54e27..014b4d1ef408 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4674,7 +4674,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		   insn->imm != 0 && env->bpf_capable) {
 		struct bpf_reg_state fake_reg = {};
 
-		__mark_reg_known(&fake_reg, (u32)insn->imm);
+		__mark_reg_known(&fake_reg, insn->imm);
 		fake_reg.type = SCALAR_VALUE;
 		save_register_state(state, spi, &fake_reg, size);
 	} else if (reg && is_spillable_regtype(reg->type)) {
-- 
cgit v1.2.3


From 391145ba2accc48b596f3d438af1a6255b62a555 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 31 Oct 2023 14:56:24 -0700
Subject: bpf: Add __bpf_kfunc_{start,end}_defs macros

BPF kfuncs are meant to be called from BPF programs. Accordingly, most
kfuncs are not called from anywhere in the kernel, which the
-Wmissing-prototypes warning is unhappy about. We've peppered
__diag_ignore_all("-Wmissing-prototypes", ... everywhere kfuncs are
defined in the codebase to suppress this warning.

This patch adds two macros meant to bound one or many kfunc definitions.
All existing kfunc definitions which use these __diag calls to suppress
-Wmissing-prototypes are migrated to use the newly-introduced macros.
A new __diag_ignore_all - for "-Wmissing-declarations" - is added to the
__bpf_kfunc_start_defs macro based on feedback from Andrii on an earlier
version of this patch [0] and another recent mailing list thread [1].

In the future we might need to ignore different warnings or do other
kfunc-specific things. This change will make it easier to make such
modifications for all kfunc defs.

  [0]: https://lore.kernel.org/bpf/CAEf4BzaE5dRWtK6RPLnjTW-MW9sx9K3Fn6uwqCTChK2Dcb1Xig@mail.gmail.com/
  [1]: https://lore.kernel.org/bpf/ZT+2qCc%2FaXep0%2FLf@krava/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Cc: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20231031215625.2343848-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst     |  6 ++----
 include/linux/btf.h              |  9 +++++++++
 kernel/bpf/bpf_iter.c            |  6 ++----
 kernel/bpf/cgroup_iter.c         |  6 ++----
 kernel/bpf/cpumask.c             |  6 ++----
 kernel/bpf/helpers.c             |  6 ++----
 kernel/bpf/map_iter.c            |  6 ++----
 kernel/bpf/task_iter.c           | 18 ++++++------------
 kernel/trace/bpf_trace.c         |  6 ++----
 net/bpf/test_run.c               |  7 +++----
 net/core/filter.c                | 13 ++++---------
 net/core/xdp.c                   |  6 ++----
 net/ipv4/fou_bpf.c               |  6 ++----
 net/netfilter/nf_conntrack_bpf.c |  6 ++----
 net/netfilter/nf_nat_bpf.c       |  6 ++----
 net/xfrm/xfrm_interface_bpf.c    |  6 ++----
 16 files changed, 46 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 0d2647fb358d..723408e399ab 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -37,16 +37,14 @@ prototype in a header for the wrapper kfunc.
 An example is given below::
 
         /* Disables missing prototype warnings */
-        __diag_push();
-        __diag_ignore_all("-Wmissing-prototypes",
-                          "Global kfuncs as their definitions will be in BTF");
+        __bpf_kfunc_start_defs();
 
         __bpf_kfunc struct task_struct *bpf_find_get_task_by_vpid(pid_t nr)
         {
                 return find_get_task_by_vpid(nr);
         }
 
-        __diag_pop();
+        __bpf_kfunc_end_defs();
 
 A wrapper kfunc is often needed when we need to annotate parameters of the
 kfunc. Otherwise one may directly make the kfunc visible to the BPF program by
diff --git a/include/linux/btf.h b/include/linux/btf.h
index c2231c64d60b..dc5ce962f600 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -84,6 +84,15 @@
  */
 #define __bpf_kfunc __used noinline
 
+#define __bpf_kfunc_start_defs()					       \
+	__diag_push();							       \
+	__diag_ignore_all("-Wmissing-declarations",			       \
+			  "Global kfuncs as their definitions will be in BTF");\
+	__diag_ignore_all("-Wmissing-prototypes",			       \
+			  "Global kfuncs as their definitions will be in BTF")
+
+#define __bpf_kfunc_end_defs() __diag_pop()
+
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
  * example the structure gets renamed. In this way, developers have to revisit
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 833faa04461b..0fae79164187 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -782,9 +782,7 @@ struct bpf_iter_num_kern {
 	int end; /* final value, exclusive */
 } __aligned(8);
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
 {
@@ -843,4 +841,4 @@ __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
 	s->cur = s->end = 0;
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 209e5135f9fb..d1b5c5618dd7 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -305,9 +305,7 @@ struct bpf_iter_css_kern {
 	unsigned int flags;
 } __attribute__((aligned(8)));
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		"Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
 		struct cgroup_subsys_state *start, unsigned int flags)
@@ -358,4 +356,4 @@ __bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it)
 {
 }
 
-__diag_pop();
\ No newline at end of file
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 6983af8e093c..e01c741e54e7 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -34,9 +34,7 @@ static bool cpu_valid(u32 cpu)
 	return cpu < nr_cpu_ids;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global kfuncs as their definitions will be in BTF");
+__bpf_kfunc_start_defs();
 
 /**
  * bpf_cpumask_create() - Create a mutable BPF cpumask.
@@ -407,7 +405,7 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
 	return cpumask_any_and_distribute(src1, src2);
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(cpumask_kfunc_btf_ids)
 BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 95449ea7cc1b..abe82105e33e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1886,9 +1886,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 	}
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 {
@@ -2505,7 +2503,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(generic_btf_ids)
 #ifdef CONFIG_KEXEC_CORE
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6fc9dae9edc8..6abd7c5df4b3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -193,9 +193,7 @@ static int __init bpf_map_iter_init(void)
 
 late_initcall(bpf_map_iter_init);
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
 {
@@ -213,7 +211,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
 	return ret;
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_map_iter_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8967d1ac7551..4e156dca48de 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -822,9 +822,7 @@ struct bpf_iter_task_vma_kern {
 	struct bpf_iter_task_vma_kern_data *data;
 } __attribute__((aligned(8)));
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
 				      struct task_struct *task, u64 addr)
@@ -890,7 +888,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
 	}
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 #ifdef CONFIG_CGROUPS
 
@@ -902,9 +900,7 @@ struct bpf_iter_css_task_kern {
 	struct css_task_iter *css_it;
 } __attribute__((aligned(8)));
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
 		struct cgroup_subsys_state *css, unsigned int flags)
@@ -950,7 +946,7 @@ __bpf_kfunc void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it)
 	bpf_mem_free(&bpf_global_ma, kit->css_it);
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 #endif /* CONFIG_CGROUPS */
 
@@ -973,9 +969,7 @@ enum {
 	BPF_TASK_ITER_PROC_THREADS
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
 		struct task_struct *task__nullable, unsigned int flags)
@@ -1045,7 +1039,7 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
 {
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index df697c74d519..84e8a0f6e4e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1252,9 +1252,7 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
 };
 
 #ifdef CONFIG_KEYS
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "kfuncs which will be used in BPF programs");
+__bpf_kfunc_start_defs();
 
 /**
  * bpf_lookup_user_key - lookup a key by its serial
@@ -1404,7 +1402,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 }
 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(key_sig_kfunc_set)
 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0841f8d82419..c9fdcc5cdce1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -503,9 +503,8 @@ out:
  * architecture dependent calling conventions. 7+ can be supported in the
  * future.
  */
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
+
 __bpf_kfunc int bpf_fentry_test1(int a)
 {
 	return a + 1;
@@ -605,7 +604,7 @@ __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
 {
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_test_modify_return_ids)
 BTF_ID_FLAGS(func, bpf_modify_return_test)
diff --git a/net/core/filter.c b/net/core/filter.c
index 21d75108c2e9..383f96b0a1c7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11767,9 +11767,7 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 	return func;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
 				    struct bpf_dynptr_kern *ptr__uninit)
 {
@@ -11816,7 +11814,7 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
 
 	return 0;
 }
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
 			       struct bpf_dynptr_kern *ptr__uninit)
@@ -11879,10 +11877,7 @@ static int __init bpf_kfunc_init(void)
 }
 late_initcall(bpf_kfunc_init);
 
-/* Disables missing prototype warnings */
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
  *
@@ -11916,7 +11911,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
 	return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_sk_iter_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
diff --git a/net/core/xdp.c b/net/core/xdp.c
index df4789ab512d..b6f1d6dab3f2 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -696,9 +696,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
 	return nxdpf;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc_start_defs();
 
 /**
  * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp.
@@ -738,7 +736,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
 	return -EOPNOTSUPP;
 }
 
-__diag_pop();
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(xdp_metadata_kfunc_ids)
 #define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
index 3760a14b6b57..4da03bf45c9b 100644
--- a/net/ipv4/fou_bpf.c
+++ b/net/ipv4/fou_bpf.c
@@ -22,9 +22,7 @@ enum bpf_fou_encap_type {
 	FOU_BPF_ENCAP_GUE,
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_skb_set_fou_encap - Set FOU encap parameters
  *
@@ -100,7 +98,7 @@ __bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
 	return 0;
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(fou_kfunc_set)
 BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index b21799d468d2..475358ec8212 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -230,9 +230,7 @@ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
 	return 0;
 }
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in nf_conntrack BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_xdp_ct_alloc - Allocate a new CT entry
  *
@@ -467,7 +465,7 @@ __bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
 	return nf_ct_change_status_common(nfct, status);
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(nf_ct_kfunc_set)
 BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 141ee7783223..6e3b2f58855f 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -12,9 +12,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_nat.h>
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in nf_nat BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_ct_set_nat_info - Set source or destination nat address
  *
@@ -54,7 +52,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
 	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(nf_nat_kfunc_set)
 BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
index d74f3fd20f2b..7d5e920141e9 100644
--- a/net/xfrm/xfrm_interface_bpf.c
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -27,9 +27,7 @@ struct bpf_xfrm_info {
 	int link;
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in xfrm_interface BTF");
+__bpf_kfunc_start_defs();
 
 /* bpf_skb_get_xfrm_info - Get XFRM metadata
  *
@@ -93,7 +91,7 @@ __bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bp
 	return 0;
 }
 
-__diag_pop()
+__bpf_kfunc_end_defs();
 
 BTF_SET8_START(xfrm_ifc_kfunc_set)
 BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
-- 
cgit v1.2.3


From 15fb6f2b6c4c3c129adc2412ae12ec15e60a6adb Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 31 Oct 2023 14:56:25 -0700
Subject: bpf: Add __bpf_hook_{start,end} macros

Not all uses of __diag_ignore_all(...) in BPF-related code in order to
suppress warnings are wrapping kfunc definitions. Some "hook point"
definitions - small functions meant to be used as attach points for
fentry and similar BPF progs - need to suppress -Wmissing-declarations.

We could use __bpf_kfunc_{start,end}_defs added in the previous patch in
such cases, but this might be confusing to someone unfamiliar with BPF
internals. Instead, this patch adds __bpf_hook_{start,end} macros,
currently having the same effect as __bpf_kfunc_{start,end}_defs, then
uses them to suppress warnings for two hook points in the kernel itself
and some bpf_testmod hook points as well.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Cc: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20231031215625.2343848-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h                                   | 2 ++
 kernel/cgroup/rstat.c                                 | 9 +++------
 net/socket.c                                          | 8 ++------
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 6 ++----
 4 files changed, 9 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index dc5ce962f600..59d404e22814 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -92,6 +92,8 @@
 			  "Global kfuncs as their definitions will be in BTF")
 
 #define __bpf_kfunc_end_defs() __diag_pop()
+#define __bpf_hook_start() __bpf_kfunc_start_defs()
+#define __bpf_hook_end() __bpf_kfunc_end_defs()
 
 /*
  * Return the name of the passed struct, if exists, or halt the build if for
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d80d7a608141..c0adb7254b45 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -156,19 +156,16 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
  * optimize away the callsite. Therefore, __weak is needed to ensure that the
  * call is still emitted, by telling the compiler that we don't know what the
  * function might eventually be.
- *
- * __diag_* below are needed to dismiss the missing prototype warning.
  */
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "kfuncs which will be used in BPF programs");
+
+__bpf_hook_start();
 
 __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 				     struct cgroup *parent, int cpu)
 {
 }
 
-__diag_pop();
+__bpf_hook_end();
 
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
diff --git a/net/socket.c b/net/socket.c
index 0d1c4e78fc7f..3379c64217a4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1685,20 +1685,16 @@ struct file *__sys_socket_file(int family, int type, int protocol)
  *	Therefore, __weak is needed to ensure that the call is still
  *	emitted, by telling the compiler that we don't know what the
  *	function might eventually be.
- *
- *	__diag_* below are needed to dismiss the missing prototype warning.
  */
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "A fmod_ret entry point for BPF programs");
+__bpf_hook_start();
 
 __weak noinline int update_socket_protocol(int family, int type, int protocol)
 {
 	return protocol;
 }
 
-__diag_pop();
+__bpf_hook_end();
 
 int __sys_socket(int family, int type, int protocol)
 {
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index a5e246f7b202..91907b321f91 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -39,9 +39,7 @@ struct bpf_testmod_struct_arg_4 {
 	int b;
 };
 
-__diag_push();
-__diag_ignore_all("-Wmissing-prototypes",
-		  "Global functions as their definitions will be in bpf_testmod.ko BTF");
+__bpf_hook_start();
 
 noinline int
 bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) {
@@ -335,7 +333,7 @@ noinline int bpf_fentry_shadow_test(int a)
 }
 EXPORT_SYMBOL_GPL(bpf_fentry_shadow_test);
 
-__diag_pop();
+__bpf_hook_end();
 
 static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
 	.attr = { .name = "bpf_testmod", .mode = 0666, },
-- 
cgit v1.2.3


From fd381ce60a2d79cc967506208085336d3d268ae0 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 30 Oct 2023 14:36:16 +0800
Subject: bpf: Check map->usercnt after timer->timer is assigned

When there are concurrent uref release and bpf timer init operations,
the following sequence diagram is possible. It will break the guarantee
provided by bpf_timer: bpf_timer will still be alive after userspace
application releases or unpins the map. It also will lead to kmemleak
for old kernel version which doesn't release bpf_timer when map is
released.

bpf program X:

bpf_timer_init()
  lock timer->lock
    read timer->timer as NULL
    read map->usercnt != 0

                process Y:

                close(map_fd)
                  // put last uref
                  bpf_map_put_uref()
                    atomic_dec_and_test(map->usercnt)
                      array_map_free_timers()
                        bpf_timer_cancel_and_free()
                          // just return
                          read timer->timer is NULL

    t = bpf_map_kmalloc_node()
    timer->timer = t
  unlock timer->lock

Fix the problem by checking map->usercnt after timer->timer is assigned,
so when there are concurrent uref release and bpf timer init, either
bpf_timer_cancel_and_free() from uref release reads a no-NULL timer
or the newly-added atomic64_read() returns a zero usercnt.

Because atomic_dec_and_test(map->usercnt) and READ_ONCE(timer->timer)
in bpf_timer_cancel_and_free() are not protected by a lock, so add
a memory barrier to guarantee the order between map->usercnt and
timer->timer. Also use WRITE_ONCE(timer->timer, x) to match the lockless
read of timer->timer in bpf_timer_cancel_and_free().

Reported-by: Hsin-Wei Hung <hsinweih@uci.edu>
Closes: https://lore.kernel.org/bpf/CABcoxUaT2k9hWsS1tNgXyoU3E-=PuOgMn737qK984fbFmfYixQ@mail.gmail.com
Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231030063616.1653024-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index abe82105e33e..56b0c1f678ee 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1177,13 +1177,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
 		ret = -EBUSY;
 		goto out;
 	}
-	if (!atomic64_read(&map->usercnt)) {
-		/* maps with timers must be either held by user space
-		 * or pinned in bpffs.
-		 */
-		ret = -EPERM;
-		goto out;
-	}
 	/* allocate hrtimer via map_kmalloc to use memcg accounting */
 	t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
 	if (!t) {
@@ -1196,7 +1189,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
 	rcu_assign_pointer(t->callback_fn, NULL);
 	hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
 	t->timer.function = bpf_timer_cb;
-	timer->timer = t;
+	WRITE_ONCE(timer->timer, t);
+	/* Guarantee the order between timer->timer and map->usercnt. So
+	 * when there are concurrent uref release and bpf timer init, either
+	 * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
+	 * timer or atomic64_read() below returns a zero usercnt.
+	 */
+	smp_mb();
+	if (!atomic64_read(&map->usercnt)) {
+		/* maps with timers must be either held by user space
+		 * or pinned in bpffs.
+		 */
+		WRITE_ONCE(timer->timer, NULL);
+		kfree(t);
+		ret = -EPERM;
+	}
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	return ret;
@@ -1374,7 +1381,7 @@ void bpf_timer_cancel_and_free(void *val)
 	/* The subsequent bpf_timer_start/cancel() helpers won't be able to use
 	 * this timer, since it won't be initialized.
 	 */
-	timer->timer = NULL;
+	WRITE_ONCE(timer->timer, NULL);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	if (!t)
-- 
cgit v1.2.3


From 3091b667498b0a212e760e1033e5f9b8c33a948f Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Tue, 31 Oct 2023 13:04:36 +0800
Subject: bpf: Relax allowlist for css_task iter

The newly added open-coded css_task iter would try to hold the global
css_set_lock in bpf_iter_css_task_new, so the bpf side has to be careful in
where it allows to use this iter. The mainly concern is dead locking on
css_set_lock. check_css_task_iter_allowlist() in verifier enforced css_task
can only be used in bpf_lsm hooks and sleepable bpf_iter.

This patch relax the allowlist for css_task iter. Any lsm and any iter
(even non-sleepable) and any sleepable are safe since they would not hold
the css_set_lock before entering BPF progs context.

This patch also fixes the misused BPF_TRACE_ITER in
check_css_task_iter_allowlist which compared bpf_prog_type with
bpf_attach_type.

Fixes: 9c66dc94b62ae ("bpf: Introduce css_task open-coded iterator kfuncs")
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231031050438.93297-2-zhouchuyi@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                                  | 16 ++++++++++++----
 tools/testing/selftests/bpf/progs/iters_task_failure.c |  4 ++--
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 014b4d1ef408..def99b1a2b17 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11402,6 +11402,12 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
 						  &meta->arg_rbtree_root.field);
 }
 
+/*
+ * css_task iter allowlist is needed to avoid dead locking on css_set_lock.
+ * LSM hooks and iters (both sleepable and non-sleepable) are safe.
+ * Any sleepable progs are also safe since bpf_check_attach_target() enforce
+ * them can only be attached to some specific hook points.
+ */
 static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
@@ -11409,10 +11415,12 @@ static bool check_css_task_iter_allowlist(struct bpf_verifier_env *env)
 	switch (prog_type) {
 	case BPF_PROG_TYPE_LSM:
 		return true;
-	case BPF_TRACE_ITER:
-		return env->prog->aux->sleepable;
+	case BPF_PROG_TYPE_TRACING:
+		if (env->prog->expected_attach_type == BPF_TRACE_ITER)
+			return true;
+		fallthrough;
 	default:
-		return false;
+		return env->prog->aux->sleepable;
 	}
 }
 
@@ -11671,7 +11679,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_ITER:
 			if (meta->func_id == special_kfunc_list[KF_bpf_iter_css_task_new]) {
 				if (!check_css_task_iter_allowlist(env)) {
-					verbose(env, "css_task_iter is only allowed in bpf_lsm and bpf iter-s\n");
+					verbose(env, "css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs\n");
 					return -EINVAL;
 				}
 			}
diff --git a/tools/testing/selftests/bpf/progs/iters_task_failure.c b/tools/testing/selftests/bpf/progs/iters_task_failure.c
index c3bf96a67dba..6b1588d70652 100644
--- a/tools/testing/selftests/bpf/progs/iters_task_failure.c
+++ b/tools/testing/selftests/bpf/progs/iters_task_failure.c
@@ -84,8 +84,8 @@ int BPF_PROG(iter_css_lock_and_unlock)
 	return 0;
 }
 
-SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
-__failure __msg("css_task_iter is only allowed in bpf_lsm and bpf iter-s")
+SEC("?fentry/" SYS_PREFIX "sys_getpgid")
+__failure __msg("css_task_iter is only allowed in bpf_lsm, bpf_iter and sleepable progs")
 int BPF_PROG(iter_css_task_for_each)
 {
 	u64 cg_id = bpf_get_current_cgroup_id();
-- 
cgit v1.2.3


From 291d044fd51f8484066300ee42afecf8c8db7b3a Mon Sep 17 00:00:00 2001
From: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Date: Thu, 2 Nov 2023 13:39:03 +0800
Subject: bpf: Fix precision tracking for BPF_ALU | BPF_TO_BE | BPF_END
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BPF_END and BPF_NEG has a different specification for the source bit in
the opcode compared to other ALU/ALU64 instructions, and is either
reserved or use to specify the byte swap endianness. In both cases the
source bit does not encode source operand location, and src_reg is a
reserved field.

backtrack_insn() currently does not differentiate BPF_END and BPF_NEG
from other ALU/ALU64 instructions, which leads to r0 being incorrectly
marked as precise when processing BPF_ALU | BPF_TO_BE | BPF_END
instructions. This commit teaches backtrack_insn() to correctly mark
precision for such case.

While precise tracking of BPF_NEG and other BPF_END instructions are
correct and does not need fixing, this commit opt to process all BPF_NEG
and BPF_END instructions within the same if-clause to better align with
current convention used in the verifier (e.g. check_alu_op).

Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
Cc: stable@vger.kernel.org
Reported-by: Mohamed Mahmoud <mmahmoud@redhat.com>
Closes: https://lore.kernel.org/r/87jzrrwptf.fsf@toke.dk
Tested-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Tao Lyu <tao.lyu@epfl.ch>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231102053913.12004-2-shung-hsi.yu@suse.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index def99b1a2b17..bd1c42eb540f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3742,7 +3742,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 	if (class == BPF_ALU || class == BPF_ALU64) {
 		if (!bt_is_reg_set(bt, dreg))
 			return 0;
-		if (opcode == BPF_MOV) {
+		if (opcode == BPF_END || opcode == BPF_NEG) {
+			/* sreg is reserved and unused
+			 * dreg still need precision before this insn
+			 */
+			return 0;
+		} else if (opcode == BPF_MOV) {
 			if (BPF_SRC(insn->code) == BPF_X) {
 				/* dreg = sreg or dreg = (s8, s16, s32)sreg
 				 * dreg needs precision after this insn
-- 
cgit v1.2.3


From 17fc8084aa8f9d5235f252fc3978db657dd77e92 Mon Sep 17 00:00:00 2001
From: Andrea Righi <andrea.righi@canonical.com>
Date: Thu, 2 Nov 2023 09:19:14 +0100
Subject: module/decompress: use kvmalloc() consistently

We consistently switched from kmalloc() to vmalloc() in module
decompression to prevent potential memory allocation failures with large
modules, however vmalloc() is not as memory-efficient and fast as
kmalloc().

Since we don't know in general the size of the workspace required by the
decompression algorithm, it is more reasonable to use kvmalloc()
consistently, also considering that we don't have special memory
requirements here.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module/decompress.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index 4156d59be440..474e68f0f063 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -100,7 +100,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 	s.next_in = buf + gzip_hdr_len;
 	s.avail_in = size - gzip_hdr_len;
 
-	s.workspace = vmalloc(zlib_inflate_workspacesize());
+	s.workspace = kvmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
 	if (!s.workspace)
 		return -ENOMEM;
 
@@ -138,7 +138,7 @@ static ssize_t module_gzip_decompress(struct load_info *info,
 out_inflate_end:
 	zlib_inflateEnd(&s);
 out:
-	vfree(s.workspace);
+	kvfree(s.workspace);
 	return retval;
 }
 #elif defined(CONFIG_MODULE_COMPRESS_XZ)
@@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
 	}
 
 	wksp_size = zstd_dstream_workspace_bound(header.windowSize);
-	wksp = vmalloc(wksp_size);
+	wksp = kvmalloc(wksp_size, GFP_KERNEL);
 	if (!wksp) {
 		retval = -ENOMEM;
 		goto out;
@@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
 	retval = new_size;
 
  out:
-	vfree(wksp);
+	kvfree(wksp);
 	return retval;
 }
 #else
-- 
cgit v1.2.3


From a5e3b127455d073f146a2a4ea3e7117635d34c5c Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petrtesarik@huaweicloud.com>
Date: Thu, 2 Nov 2023 10:36:49 +0100
Subject: swiotlb: do not free decrypted pages if dynamic

Fix these two error paths:

1. When set_memory_decrypted() fails, pages may be left fully or partially
   decrypted.

2. Decrypted pages may be freed if swiotlb_alloc_tlb() determines that the
   physical address is too high.

To fix the first issue, call set_memory_encrypted() on the allocated region
after a failed decryption attempt. If that also fails, leak the pages.

To fix the second issue, check that the TLB physical address is below the
requested limit before decrypting.

Let the caller differentiate between unsuitable physical address (=> retry
from a lower zone) and allocation failures (=> no point in retrying).

Cc: stable@vger.kernel.org
Fixes: 79636caad361 ("swiotlb: if swiotlb is full, fall back to a transient memory pool")
Signed-off-by: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 26202274784f..71392c9fac10 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -558,29 +558,40 @@ void __init swiotlb_exit(void)
  * alloc_dma_pages() - allocate pages to be used for DMA
  * @gfp:	GFP flags for the allocation.
  * @bytes:	Size of the buffer.
+ * @phys_limit:	Maximum allowed physical address of the buffer.
  *
  * Allocate pages from the buddy allocator. If successful, make the allocated
  * pages decrypted that they can be used for DMA.
  *
- * Return: Decrypted pages, or %NULL on failure.
+ * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
+ * if the allocated physical address was above @phys_limit.
  */
-static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes)
+static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
 {
 	unsigned int order = get_order(bytes);
 	struct page *page;
+	phys_addr_t paddr;
 	void *vaddr;
 
 	page = alloc_pages(gfp, order);
 	if (!page)
 		return NULL;
 
-	vaddr = page_address(page);
+	paddr = page_to_phys(page);
+	if (paddr + bytes - 1 > phys_limit) {
+		__free_pages(page, order);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	vaddr = phys_to_virt(paddr);
 	if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
 		goto error;
 	return page;
 
 error:
-	__free_pages(page, order);
+	/* Intentional leak if pages cannot be encrypted again. */
+	if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
+		__free_pages(page, order);
 	return NULL;
 }
 
@@ -618,11 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
 	else if (phys_limit <= DMA_BIT_MASK(32))
 		gfp |= __GFP_DMA32;
 
-	while ((page = alloc_dma_pages(gfp, bytes)) &&
-	       page_to_phys(page) + bytes - 1 > phys_limit) {
-		/* allocated, but too high */
-		__free_pages(page, get_order(bytes));
-
+	while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
 		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
 		    phys_limit < DMA_BIT_MASK(64) &&
 		    !(gfp & (__GFP_DMA32 | __GFP_DMA)))
-- 
cgit v1.2.3


From 8ae0e970319ac0b516d285650a744bab4ed3dd37 Mon Sep 17 00:00:00 2001
From: Jia He <justin.he@arm.com>
Date: Sat, 28 Oct 2023 10:20:58 +0000
Subject: dma-mapping: move dma_addressing_limited() out of line

This patch moves dma_addressing_limited() out of line, serving as a
preliminary step to prevent the introduction of a new publicly accessible
low-level helper when validating whether all system RAM is mapped within
the DMA mapping range.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jia He <justin.he@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 19 +++++--------------
 kernel/dma/mapping.c        | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index f0ccca16a0ac..4a658de44ee9 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -144,6 +144,7 @@ bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
+bool dma_addressing_limited(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 size_t dma_opt_mapping_size(struct device *dev);
 bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
@@ -264,6 +265,10 @@ static inline u64 dma_get_required_mask(struct device *dev)
 {
 	return 0;
 }
+static inline bool dma_addressing_limited(struct device *dev)
+{
+	return false;
+}
 static inline size_t dma_max_mapping_size(struct device *dev)
 {
 	return 0;
@@ -465,20 +470,6 @@ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask)
 	return dma_set_mask_and_coherent(dev, mask);
 }
 
-/**
- * dma_addressing_limited - return if the device is addressing limited
- * @dev:	device to check
- *
- * Return %true if the devices DMA mask is too small to address all memory in
- * the system, else %false.  Lack of addressing bits is the prime reason for
- * bounce buffering, but might not be the only one.
- */
-static inline bool dma_addressing_limited(struct device *dev)
-{
-	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
-			    dma_get_required_mask(dev);
-}
-
 static inline unsigned int dma_get_max_seg_size(struct device *dev)
 {
 	if (dev->dma_parms && dev->dma_parms->max_segment_size)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index e323ca48f7f2..7789c86f7ba3 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -793,6 +793,21 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_coherent_mask);
 
+/**
+ * dma_addressing_limited - return if the device is addressing limited
+ * @dev:	device to check
+ *
+ * Return %true if the devices DMA mask is too small to address all memory in
+ * the system, else %false.  Lack of addressing bits is the prime reason for
+ * bounce buffering, but might not be the only one.
+ */
+bool dma_addressing_limited(struct device *dev)
+{
+	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+			    dma_get_required_mask(dev);
+}
+EXPORT_SYMBOL_GPL(dma_addressing_limited);
+
 size_t dma_max_mapping_size(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From a409d9600959f3c4b2a48946304c8e01b8d04072 Mon Sep 17 00:00:00 2001
From: Jia He <justin.he@arm.com>
Date: Sat, 28 Oct 2023 10:20:59 +0000
Subject: dma-mapping: fix dma_addressing_limited() if dma_range_map can't
 cover all system RAM

There is an unusual case that the range map covers right up to the top
of system RAM, but leaves a hole somewhere lower down. Then it prevents
the nvme device dma mapping in the checking path of phys_to_dma() and
causes the hangs at boot.

E.g. On an Armv8 Ampere server, the dsdt ACPI table is:
 Method (_DMA, 0, Serialized)  // _DMA: Direct Memory Access
            {
                Name (RBUF, ResourceTemplate ()
                {
                    QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
                        0x0000000000000000, // Granularity
                        0x0000000000000000, // Range Minimum
                        0x00000000FFFFFFFF, // Range Maximum
                        0x0000000000000000, // Translation Offset
                        0x0000000100000000, // Length
                        ,, , AddressRangeMemory, TypeStatic)
                    QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
                        0x0000000000000000, // Granularity
                        0x0000006010200000, // Range Minimum
                        0x000000602FFFFFFF, // Range Maximum
                        0x0000000000000000, // Translation Offset
                        0x000000001FE00000, // Length
                        ,, , AddressRangeMemory, TypeStatic)
                    QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
                        0x0000000000000000, // Granularity
                        0x00000060F0000000, // Range Minimum
                        0x00000060FFFFFFFF, // Range Maximum
                        0x0000000000000000, // Translation Offset
                        0x0000000010000000, // Length
                        ,, , AddressRangeMemory, TypeStatic)
                    QWordMemory (ResourceConsumer, PosDecode, MinFixed,
MaxFixed, Cacheable, ReadWrite,
                        0x0000000000000000, // Granularity
                        0x0000007000000000, // Range Minimum
                        0x000003FFFFFFFFFF, // Range Maximum
                        0x0000000000000000, // Translation Offset
                        0x0000039000000000, // Length
                        ,, , AddressRangeMemory, TypeStatic)
                })

But the System RAM ranges are:
cat /proc/iomem |grep -i ram
90000000-91ffffff : System RAM
92900000-fffbffff : System RAM
880000000-fffffffff : System RAM
8800000000-bff5990fff : System RAM
bff59d0000-bff5a4ffff : System RAM
bff8000000-bfffffffff : System RAM
So some RAM ranges are out of dma_range_map.

Fix it by checking whether each of the system RAM resources can be
properly encompassed within the dma_range_map.

Signed-off-by: Jia He <justin.he@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/direct.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 kernel/dma/direct.h  |  1 +
 kernel/dma/mapping.c | 11 +++++++++--
 3 files changed, 50 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index ed3056eb20b8..73c95815789a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -587,6 +587,46 @@ int dma_direct_supported(struct device *dev, u64 mask)
 	return mask >= phys_to_dma_unencrypted(dev, min_mask);
 }
 
+/*
+ * To check whether all ram resource ranges are covered by dma range map
+ * Returns 0 when further check is needed
+ * Returns 1 if there is some RAM range can't be covered by dma_range_map
+ */
+static int check_ram_in_range_map(unsigned long start_pfn,
+				  unsigned long nr_pages, void *data)
+{
+	unsigned long end_pfn = start_pfn + nr_pages;
+	const struct bus_dma_region *bdr = NULL;
+	const struct bus_dma_region *m;
+	struct device *dev = data;
+
+	while (start_pfn < end_pfn) {
+		for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+			unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+			if (start_pfn >= cpu_start_pfn &&
+			    start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) {
+				bdr = m;
+				break;
+			}
+		}
+		if (!bdr)
+			return 1;
+
+		start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size);
+	}
+
+	return 0;
+}
+
+bool dma_direct_all_ram_mapped(struct device *dev)
+{
+	if (!dev->dma_range_map)
+		return true;
+	return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev,
+				      check_ram_in_range_map);
+}
+
 size_t dma_direct_max_mapping_size(struct device *dev)
 {
 	/* If SWIOTLB is active, use its maximum mapping size */
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 97ec892ea0b5..18d346118fe8 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -20,6 +20,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
 bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr);
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
 		enum dma_data_direction dir, unsigned long attrs);
+bool dma_direct_all_ram_mapped(struct device *dev);
 size_t dma_direct_max_mapping_size(struct device *dev);
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 7789c86f7ba3..58db8fd70471 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -803,8 +803,15 @@ EXPORT_SYMBOL(dma_set_coherent_mask);
  */
 bool dma_addressing_limited(struct device *dev)
 {
-	return min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
-			    dma_get_required_mask(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) <
+			 dma_get_required_mask(dev))
+		return true;
+
+	if (unlikely(ops))
+		return false;
+	return !dma_direct_all_ram_mapped(dev);
 }
 EXPORT_SYMBOL_GPL(dma_addressing_limited);
 
-- 
cgit v1.2.3


From 23816724fdbd47c28bc998866fd7bc5ad9f0e535 Mon Sep 17 00:00:00 2001
From: Yuran Pereira <yuran.pereira@hotmail.com>
Date: Sun, 5 Nov 2023 13:18:08 +0530
Subject: kdb: Corrects comment for kdballocenv

This patch corrects the comment for the kdballocenv function.
The previous comment incorrectly described the function's
parameters and return values.

Signed-off-by: Yuran Pereira <yuran.pereira@hotmail.com>
Link: https://lore.kernel.org/r/DB3PR10MB6835B383B596133EDECEA98AE8ABA@DB3PR10MB6835.EURPRD10.PROD.OUTLOOK.COM
[daniel.thompson@linaro.org: fixed whitespace alignment in new lines]
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---
 kernel/debug/kdb/kdb_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 438b868cbfa9..6b213c8252d6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -272,11 +272,10 @@ char *kdbgetenv(const char *match)
  * kdballocenv - This function is used to allocate bytes for
  *	environment entries.
  * Parameters:
- *	match	A character string representing a numeric value
- * Outputs:
- *	*value  the unsigned long representation of the env variable 'match'
+ *	bytes	The number of bytes to allocate in the static buffer.
  * Returns:
- *	Zero on success, a kdb diagnostic on failure.
+ *	A pointer to the allocated space in the buffer on success.
+ *	NULL if bytes > size available in the envbuffer.
  * Remarks:
  *	We use a static environment buffer (envbuffer) to hold the values
  *	of dynamically generated environment variables (see kdb_set).  Buffer
-- 
cgit v1.2.3


From 0de4f50de25af79c2a46db55d70cdbd8f985c6d1 Mon Sep 17 00:00:00 2001
From: Chuyi Zhou <zhouchuyi@bytedance.com>
Date: Tue, 7 Nov 2023 21:22:03 +0800
Subject: bpf: Let verifier consider {task,cgroup} is trusted in bpf_iter_reg

BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) in verifier.c wanted to
teach BPF verifier that bpf_iter__task -> task is a trusted ptr. But it
doesn't work well.

The reason is, bpf_iter__task -> task would go through btf_ctx_access()
which enforces the reg_type of 'task' is ctx_arg_info->reg_type, and in
task_iter.c, we actually explicitly declare that the
ctx_arg_info->reg_type is PTR_TO_BTF_ID_OR_NULL.

Actually we have a previous case like this[1] where PTR_TRUSTED is added to
the arg flag for map_iter.

This patch sets ctx_arg_info->reg_type is PTR_TO_BTF_ID_OR_NULL |
PTR_TRUSTED in task_reg_info.

Similarly, bpf_cgroup_reg_info -> cgroup is also PTR_TRUSTED since we are
under the protection of cgroup_mutex and we would check cgroup_is_dead()
in __cgroup_iter_seq_show().

This patch is to improve the user experience of the newly introduced
bpf_iter_css_task kfunc before hitting the mainline. The Fixes tag is
pointing to the commit introduced the bpf_iter_css_task kfunc.

Link[1]:https://lore.kernel.org/all/20230706133932.45883-3-aspsk@isovalent.com/

Fixes: 9c66dc94b62a ("bpf: Introduce css_task open-coded iterator kfuncs")
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231107132204.912120-2-zhouchuyi@bytedance.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/cgroup_iter.c | 2 +-
 kernel/bpf/task_iter.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index d1b5c5618dd7..f04a468cf6a7 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -282,7 +282,7 @@ static struct bpf_iter_reg bpf_cgroup_reg_info = {
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__cgroup, cgroup),
-		  PTR_TO_BTF_ID_OR_NULL },
+		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
 	},
 	.seq_info		= &cgroup_iter_seq_info,
 };
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 4e156dca48de..26082b97894d 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -704,7 +704,7 @@ static struct bpf_iter_reg task_reg_info = {
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
 		{ offsetof(struct bpf_iter__task, task),
-		  PTR_TO_BTF_ID_OR_NULL },
+		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
 	},
 	.seq_info		= &task_seq_info,
 	.fill_link_info		= bpf_iter_fill_link_info,
-- 
cgit v1.2.3


From 53c87e846e335e3c18044c397cc35178163d7827 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Date: Wed, 8 Nov 2023 12:12:49 +0100
Subject: swiotlb: fix out-of-bounds TLB allocations with
 CONFIG_SWIOTLB_DYNAMIC

Limit the free list length to the size of the IO TLB. Transient pool can be
smaller than IO_TLB_SEGSIZE, but the free list is initialized with the
assumption that the total number of slots is a multiple of IO_TLB_SEGSIZE.
As a result, swiotlb_area_find_slots() may allocate slots past the end of
a transient IO TLB buffer.

Reported-by: Niklas Schnelle <schnelle@linux.ibm.com>
Closes: https://lore.kernel.org/linux-iommu/104a8c8fedffd1ff8a2890983e2ec1c26bff6810.camel@linux.ibm.com/
Fixes: 79636caad361 ("swiotlb: if swiotlb is full, fall back to a transient memory pool")
Cc: stable@vger.kernel.org
Signed-off-by: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Reviewed-by: Halil Pasic <pasic@linux.ibm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 71392c9fac10..33d942615be5 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -283,7 +283,8 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
 	}
 
 	for (i = 0; i < mem->nslabs; i++) {
-		mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+		mem->slots[i].list = min(IO_TLB_SEGSIZE - io_tlb_offset(i),
+					 mem->nslabs - i);
 		mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
 		mem->slots[i].alloc_size = 0;
 	}
-- 
cgit v1.2.3


From 96a2b48e5e1df6698f504969f0f51dc34e52ff3d Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:28 +0000
Subject: cgroup: Remove unnecessary list_empty()

The root hasn't been removed from the root_list, so the list can't be NULL.
However, if it had been removed, attempting to destroy it once more is not
possible. Let's replace this with WARN_ON_ONCE() for clarity.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1d5b9de3b1b9..3a436e4f0da1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1347,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 
 	spin_unlock_irq(&css_set_lock);
 
-	if (!list_empty(&root->root_list)) {
-		list_del(&root->root_list);
-		cgroup_root_count--;
-	}
+	WARN_ON_ONCE(list_empty(&root->root_list));
+	list_del(&root->root_list);
+	cgroup_root_count--;
 
 	if (!have_favordynmods)
 		cgroup_favor_dynmods(root, false);
-- 
cgit v1.2.3


From d23b5c577715892c87533b13923306acc6243f93 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:29 +0000
Subject: cgroup: Make operations on the cgroup root_list RCU safe

At present, when we perform operations on the cgroup root_list, we must
hold the cgroup_mutex, which is a relatively heavyweight lock. In reality,
we can make operations on this list RCU-safe, eliminating the need to hold
the cgroup_mutex during traversal. Modifications to the list only occur in
the cgroup root setup and destroy paths, which should be infrequent in a
production environment. In contrast, traversal may occur frequently.
Therefore, making it RCU-safe would be beneficial.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h     |  1 +
 kernel/cgroup/cgroup-internal.h |  3 ++-
 kernel/cgroup/cgroup.c          | 23 ++++++++++++++++-------
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4a6b6b77ccb6..4caab0c6b361 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -563,6 +563,7 @@ struct cgroup_root {
 
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
+	struct rcu_head rcu;
 
 	/* Hierarchy-specific flags */
 	unsigned int flags;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c56071f150f2..5e17f01ced9f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -170,7 +170,8 @@ extern struct list_head cgroup_roots;
 
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
-	list_for_each_entry((root), &cgroup_roots, root_list)
+	list_for_each_entry_rcu((root), &cgroup_roots, root_list,	\
+				lockdep_is_held(&cgroup_mutex))
 
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3a436e4f0da1..19784d44d615 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
 
 void cgroup_free_root(struct cgroup_root *root)
 {
-	kfree(root);
+	kfree_rcu(root, rcu);
 }
 
 static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1348,7 +1348,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	spin_unlock_irq(&css_set_lock);
 
 	WARN_ON_ONCE(list_empty(&root->root_list));
-	list_del(&root->root_list);
+	list_del_rcu(&root->root_list);
 	cgroup_root_count--;
 
 	if (!have_favordynmods)
@@ -1389,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
 		}
 	}
 
-	BUG_ON(!res_cgroup);
+	/*
+	 * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+	 * before we remove the cgroup root from the root_list. Consequently,
+	 * when accessing a cgroup root, the cset_link may have already been
+	 * freed, resulting in a NULL res_cgroup. However, by holding the
+	 * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+	 * If we don't hold cgroup_mutex in the caller, we must do the NULL
+	 * check.
+	 */
 	return res_cgroup;
 }
 
@@ -1448,7 +1456,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
 {
-	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_lock);
 
 	return __cset_cgroup_from_root(cset, root);
@@ -1456,7 +1463,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
  */
 struct cgroup *task_cgroup_from_root(struct task_struct *task,
 				     struct cgroup_root *root)
@@ -2031,7 +2040,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 	struct cgroup_root *root = ctx->root;
 	struct cgroup *cgrp = &root->cgrp;
 
-	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD_RCU(&root->root_list);
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
 	init_cgroup_housekeeping(cgrp);
@@ -2114,7 +2123,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * care of subsystems' refcounts, which are explicitly dropped in
 	 * the failure exit path.
 	 */
-	list_add(&root->root_list, &cgroup_roots);
+	list_add_rcu(&root->root_list, &cgroup_roots);
 	cgroup_root_count++;
 
 	/*
-- 
cgit v1.2.3


From 9067d90006df089b9a1da0d74f0cad232a5d726a Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:30 +0000
Subject: cgroup: Eliminate the need for cgroup_mutex in proc_cgroup_show()

The cgroup root_list is already RCU-safe. Therefore, we can replace the
cgroup_mutex with the RCU read lock in some particular paths. This change
will be particularly beneficial for frequent operations, such as
`cat /proc/self/cgroup`, in a cgroup1-based container environment.

I did stress tests with this change, as outlined below
(with CONFIG_PROVE_RCU_LIST enabled):

- Continuously mounting and unmounting named cgroups in some tasks,
  for example:

  cgrp_name=$1
  while true
  do
      mount -t cgroup -o none,name=$cgrp_name none /$cgrp_name
      umount /$cgrp_name
  done

- Continuously triggering proc_cgroup_show() in some tasks concurrently,
  for example:
  while true; do cat /proc/self/cgroup > /dev/null; done

They can ran successfully after implementing this change, with no RCU
warnings in dmesg.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 19784d44d615..9bb255e41cf2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6285,7 +6285,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	if (!buf)
 		goto out;
 
-	cgroup_lock();
+	rcu_read_lock();
 	spin_lock_irq(&css_set_lock);
 
 	for_each_root(root) {
@@ -6296,6 +6296,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
 			continue;
 
+		cgrp = task_cgroup_from_root(tsk, root);
+		/* The root has already been unmounted. */
+		if (!cgrp)
+			continue;
+
 		seq_printf(m, "%d:", root->hierarchy_id);
 		if (root != &cgrp_dfl_root)
 			for_each_subsys(ss, ssid)
@@ -6306,9 +6311,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
-
-		cgrp = task_cgroup_from_root(tsk, root);
-
 		/*
 		 * On traditional hierarchies, all zombie tasks show up as
 		 * belonging to the root cgroup.  On the default hierarchy,
@@ -6340,7 +6342,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	retval = 0;
 out_unlock:
 	spin_unlock_irq(&css_set_lock);
-	cgroup_unlock();
+	rcu_read_unlock();
 	kfree(buf);
 out:
 	return retval;
-- 
cgit v1.2.3


From 0008454e8fd30ed0017a9a35b8dd708f168931b8 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:31 +0000
Subject: cgroup: Add annotation for holding namespace_sem in
 current_cgns_cgroup_from_root()

When I initially examined the function current_cgns_cgroup_from_root(), I
was perplexed by its lack of holding cgroup_mutex. However, after Michal
explained the reason[0] to me, I realized that it already holds the
namespace_sem. I believe this intricacy could also confuse others, so it
would be advisable to include an annotation for clarification.

After we replace the cgroup_mutex with RCU read lock, if current doesn't
hold the namespace_sem, the root cgroup will be NULL. So let's add a
WARN_ON_ONCE() for it.

[0]. https://lore.kernel.org/bpf/afdnpo3jz2ic2ampud7swd6so5carkilts2mkygcaw67vbw6yh@5b5mncf7qyet

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Michal Koutny <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9bb255e41cf2..4e610863cc37 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1420,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
 
 	rcu_read_unlock();
 
+	/*
+	 * The namespace_sem is held by current, so the root cgroup can't
+	 * be umounted. Therefore, we can ensure that the res is non-NULL.
+	 */
+	WARN_ON_ONCE(!res);
 	return res;
 }
 
-- 
cgit v1.2.3


From aecd408b7e50742868b3305c24325a89024e2a30 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 29 Oct 2023 06:14:32 +0000
Subject: cgroup: Add a new helper for cgroup1 hierarchy

A new helper is added for cgroup1 hierarchy:

- task_get_cgroup1
  Acquires the associated cgroup of a task within a specific cgroup1
  hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID.

This helper function is added to facilitate the tracing of tasks within
a particular container or cgroup dir in BPF programs. It's important to
note that this helper is designed specifically for cgroup1 only.

tj: Use irsqsave/restore as suggested by Hou Tao <houtao@huaweicloud.com>.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Hou Tao <houtao@huaweicloud.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h          |  4 +++-
 kernel/cgroup/cgroup-internal.h |  1 -
 kernel/cgroup/cgroup-v1.c       | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0ef0af66080e..34aaf0e87def 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -69,6 +69,7 @@ struct css_task_iter {
 extern struct file_system_type cgroup_fs_type;
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
+extern spinlock_t css_set_lock;
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
@@ -386,7 +387,6 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		rcu_read_lock_sched_held() ||				\
@@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 #endif /* CONFIG_CGROUP_BPF */
 
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5e17f01ced9f..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)						\
 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 
-extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 76db6c67e39a..04d11a7dd95f 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
 	return ret;
 }
 
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+	struct cgroup *cgrp = ERR_PTR(-ENOENT);
+	struct cgroup_root *root;
+	unsigned long flags;
+
+	rcu_read_lock();
+	for_each_root(root) {
+		/* cgroup1 only*/
+		if (root == &cgrp_dfl_root)
+			continue;
+		if (root->hierarchy_id != hierarchy_id)
+			continue;
+		spin_lock_irqsave(&css_set_lock, flags);
+		cgrp = task_cgroup_from_root(tsk, root);
+		if (!cgrp || !cgroup_tryget(cgrp))
+			cgrp = ERR_PTR(-ENOENT);
+		spin_unlock_irqrestore(&css_set_lock, flags);
+		break;
+	}
+	rcu_read_unlock();
+	return cgrp;
+}
+
 static int __init cgroup1_wq_init(void)
 {
 	/*
-- 
cgit v1.2.3


From 93f7378734b595fb61e89b802002fb7e3a1267d2 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:45 -0700
Subject: bpf: derive smin/smax from umin/max bounds

Add smin/smax derivation from appropriate umin/umax values. Previously the
logic was surprisingly asymmetric, trying to derive umin/umax from smin/smax
(if possible), but not trying to do the same in the other direction. A simple
addition to __reg64_deduce_bounds() fixes this.

Added also generic comment about u64/s64 ranges and their relationship.
Hopefully that helps readers to understand all the bounds deductions
a bit better.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bd1c42eb540f..1a5c389b951a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2358,6 +2358,77 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 
 static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
 {
+	/* If u64 range forms a valid s64 range (due to matching sign bit),
+	 * try to learn from that. Let's do a bit of ASCII art to see when
+	 * this is happening. Let's take u64 range first:
+	 *
+	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
+	 * |-------------------------------|--------------------------------|
+	 *
+	 * Valid u64 range is formed when umin and umax are anywhere in the
+	 * range [0, U64_MAX], and umin <= umax. u64 case is simple and
+	 * straightforward. Let's see how s64 range maps onto the same range
+	 * of values, annotated below the line for comparison:
+	 *
+	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
+	 * |-------------------------------|--------------------------------|
+	 * 0                        S64_MAX S64_MIN                        -1
+	 *
+	 * So s64 values basically start in the middle and they are logically
+	 * contiguous to the right of it, wrapping around from -1 to 0, and
+	 * then finishing as S64_MAX (0x7fffffffffffffff) right before
+	 * S64_MIN. We can try drawing the continuity of u64 vs s64 values
+	 * more visually as mapped to sign-agnostic range of hex values.
+	 *
+	 *  u64 start                                               u64 end
+	 *  _______________________________________________________________
+	 * /                                                               \
+	 * 0             0x7fffffffffffffff 0x8000000000000000        U64_MAX
+	 * |-------------------------------|--------------------------------|
+	 * 0                        S64_MAX S64_MIN                        -1
+	 *                                / \
+	 * >------------------------------   ------------------------------->
+	 * s64 continues...        s64 end   s64 start          s64 "midpoint"
+	 *
+	 * What this means is that, in general, we can't always derive
+	 * something new about u64 from any random s64 range, and vice versa.
+	 *
+	 * But we can do that in two particular cases. One is when entire
+	 * u64/s64 range is *entirely* contained within left half of the above
+	 * diagram or when it is *entirely* contained in the right half. I.e.:
+	 *
+	 * |-------------------------------|--------------------------------|
+	 *     ^                   ^            ^                 ^
+	 *     A                   B            C                 D
+	 *
+	 * [A, B] and [C, D] are contained entirely in their respective halves
+	 * and form valid contiguous ranges as both u64 and s64 values. [A, B]
+	 * will be non-negative both as u64 and s64 (and in fact it will be
+	 * identical ranges no matter the signedness). [C, D] treated as s64
+	 * will be a range of negative values, while in u64 it will be
+	 * non-negative range of values larger than 0x8000000000000000.
+	 *
+	 * Now, any other range here can't be represented in both u64 and s64
+	 * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid
+	 * contiguous u64 ranges, but they are discontinuous in s64. [B, C]
+	 * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX],
+	 * for example. Similarly, valid s64 range [D, A] (going from negative
+	 * to positive values), would be two separate [D, U64_MAX] and [0, A]
+	 * ranges as u64. Currently reg_state can't represent two segments per
+	 * numeric domain, so in such situations we can only derive maximal
+	 * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64).
+	 *
+	 * So we use these facts to derive umin/umax from smin/smax and vice
+	 * versa only if they stay within the same "half". This is equivalent
+	 * to checking sign bit: lower half will have sign bit as zero, upper
+	 * half have sign bit 1. Below in code we simplify this by just
+	 * casting umin/umax as smin/smax and checking if they form valid
+	 * range, and vice versa. Those are equivalent checks.
+	 */
+	if ((s64)reg->umin_value <= (s64)reg->umax_value) {
+		reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
+		reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
+	}
 	/* Learn sign from signed bounds.
 	 * If we cannot cross the sign boundary, then signed and unsigned bounds
 	 * are the same, so combine.  This works even in the negative case, e.g.
-- 
cgit v1.2.3


From d540517990a9d105bf0312760665964916ac044f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:46 -0700
Subject: bpf: derive smin32/smax32 from umin32/umax32 bounds

All the logic that applies to u64 vs s64, equally applies for u32 vs s32
relationships (just taken in a smaller 32-bit numeric space). So do the
same deduction of smin32/smax32 from umin32/umax32, if we can.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1a5c389b951a..b53ee72a7d72 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2324,6 +2324,13 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
 /* Uses signed min/max values to inform unsigned, and vice-versa */
 static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 {
+	/* if u32 range forms a valid s32 range (due to matching sign bit),
+	 * try to learn from that
+	 */
+	if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) {
+		reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
+		reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
+	}
 	/* Learn sign from signed bounds.
 	 * If we cannot cross the sign boundary, then signed and unsigned bounds
 	 * are the same, so combine.  This works even in the negative case, e.g.
-- 
cgit v1.2.3


From c1efab6468fd5ef541d47d81dbb62cca27f8db3b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:47 -0700
Subject: bpf: derive subreg bounds from full bounds when upper 32 bits are
 constant

Comments in code try to explain the idea behind why this is correct.
Please check the code and comments.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b53ee72a7d72..9e39f12538f7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2324,6 +2324,51 @@ static void __update_reg_bounds(struct bpf_reg_state *reg)
 /* Uses signed min/max values to inform unsigned, and vice-versa */
 static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 {
+	/* If upper 32 bits of u64/s64 range don't change, we can use lower 32
+	 * bits to improve our u32/s32 boundaries.
+	 *
+	 * E.g., the case where we have upper 32 bits as zero ([10, 20] in
+	 * u64) is pretty trivial, it's obvious that in u32 we'll also have
+	 * [10, 20] range. But this property holds for any 64-bit range as
+	 * long as upper 32 bits in that entire range of values stay the same.
+	 *
+	 * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311]
+	 * in decimal) has the same upper 32 bits throughout all the values in
+	 * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15])
+	 * range.
+	 *
+	 * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32,
+	 * following the rules outlined below about u64/s64 correspondence
+	 * (which equally applies to u32 vs s32 correspondence). In general it
+	 * depends on actual hexadecimal values of 32-bit range. They can form
+	 * only valid u32, or only valid s32 ranges in some cases.
+	 *
+	 * So we use all these insights to derive bounds for subregisters here.
+	 */
+	if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) {
+		/* u64 to u32 casting preserves validity of low 32 bits as
+		 * a range, if upper 32 bits are the same
+		 */
+		reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value);
+		reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value);
+
+		if ((s32)reg->umin_value <= (s32)reg->umax_value) {
+			reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+		}
+	}
+	if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) {
+		/* low 32 bits should form a proper u32 range */
+		if ((u32)reg->smin_value <= (u32)reg->smax_value) {
+			reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value);
+			reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value);
+		}
+		/* low 32 bits should form a proper s32 range */
+		if ((s32)reg->smin_value <= (s32)reg->smax_value) {
+			reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+		}
+	}
 	/* if u32 range forms a valid s32 range (due to matching sign bit),
 	 * try to learn from that
 	 */
-- 
cgit v1.2.3


From 6593f2e6741f03b49bffc9d55ddd4c1c47853c39 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:48 -0700
Subject: bpf: add special smin32/smax32 derivation from 64-bit bounds

Add a special case where we can derive valid s32 bounds from umin/umax
or smin/smax by stitching together negative s32 subrange and
non-negative s32 subrange. That requires upper 32 bits to form a [N, N+1]
range in u32 domain (taking into account wrap around, so 0xffffffff
to 0x00000000 is a valid [N, N+1] range in this sense). See code comment
for concrete examples.

Eduard Zingerman also provided an alternative explanation ([0]) for more
mathematically inclined readers:

Suppose:
. there are numbers a, b, c
. 2**31 <= b < 2**32
. 0 <= c < 2**31
. umin = 2**32 * a + b
. umax = 2**32 * (a + 1) + c

The number of values in the range represented by [umin; umax] is:
. N = umax - umin + 1 = 2**32 + c - b + 1
. min(N) = 2**32 + 0 - (2**32-1) + 1 = 2, with b = 2**32-1, c = 0
. max(N) = 2**32 + (2**31 - 1) - 2**31 + 1 = 2**32, with b = 2**31, c = 2**31-1

Hence [(s32)b; (s32)c] forms a valid range.

  [0] https://lore.kernel.org/bpf/d7af631802f0cfae20df77fe70068702d24bbd31.camel@gmail.com/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e39f12538f7..0fffbf01328e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2369,6 +2369,29 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 			reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
 		}
 	}
+	/* Special case where upper bits form a small sequence of two
+	 * sequential numbers (in 32-bit unsigned space, so 0xffffffff to
+	 * 0x00000000 is also valid), while lower bits form a proper s32 range
+	 * going from negative numbers to positive numbers. E.g., let's say we
+	 * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]).
+	 * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff,
+	 * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits,
+	 * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]).
+	 * Note that it doesn't have to be 0xffffffff going to 0x00000000 in
+	 * upper 32 bits. As a random example, s64 range
+	 * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range
+	 * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister.
+	 */
+	if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) &&
+	    (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) {
+		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value);
+		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value);
+	}
+	if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) &&
+	    (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) {
+		reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value);
+		reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value);
+	}
 	/* if u32 range forms a valid s32 range (due to matching sign bit),
 	 * try to learn from that
 	 */
-- 
cgit v1.2.3


From c51d5ad6543cc36334ef1fcd762d0df767a0bf7e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:49 -0700
Subject: bpf: improve deduction of 64-bit bounds from 32-bit bounds

Add a few interesting cases in which we can tighten 64-bit bounds based
on newly learnt information about 32-bit bounds. E.g., when full u64/s64
registers are used in BPF program, and then eventually compared as
u32/s32. The latter comparison doesn't change the value of full
register, but it does impose new restrictions on possible lower 32 bits
of such full registers. And we can use that to derive additional full
register bounds information.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231102033759.2541186-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0fffbf01328e..969f1ecfe310 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2536,10 +2536,54 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
 	}
 }
 
+static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
+{
+	/* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit
+	 * values on both sides of 64-bit range in hope to have tigher range.
+	 * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from
+	 * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff].
+	 * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound
+	 * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of
+	 * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a
+	 * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff].
+	 * We just need to make sure that derived bounds we are intersecting
+	 * with are well-formed ranges in respecitve s64 or u64 domain, just
+	 * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
+	 */
+	__u64 new_umin, new_umax;
+	__s64 new_smin, new_smax;
+
+	/* u32 -> u64 tightening, it's always well-formed */
+	new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value;
+	new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value;
+	reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+	reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+	/* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */
+	new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value;
+	new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value;
+	reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+	reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+
+	/* if s32 can be treated as valid u32 range, we can use it as well */
+	if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+		/* s32 -> u64 tightening */
+		new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+		new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+		reg->umin_value = max_t(u64, reg->umin_value, new_umin);
+		reg->umax_value = min_t(u64, reg->umax_value, new_umax);
+		/* s32 -> s64 tightening */
+		new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
+		new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
+		reg->smin_value = max_t(s64, reg->smin_value, new_smin);
+		reg->smax_value = min_t(s64, reg->smax_value, new_smax);
+	}
+}
+
 static void __reg_deduce_bounds(struct bpf_reg_state *reg)
 {
 	__reg32_deduce_bounds(reg);
 	__reg64_deduce_bounds(reg);
+	__reg_deduce_mixed_bounds(reg);
 }
 
 /* Attempts to improve var_off based on unsigned min/max information */
-- 
cgit v1.2.3


From d7f00873817129e62f8c70891cb13c8eafe9feef Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:50 -0700
Subject: bpf: try harder to deduce register bounds from different numeric
 domains

There are cases (caught by subsequent reg_bounds tests in selftests/bpf)
where performing one round of __reg_deduce_bounds() doesn't propagate
all the information from, say, s32 to u32 bounds and than from newly
learned u32 bounds back to u64 and s64. So perform __reg_deduce_bounds()
twice to make sure such derivations are propagated fully after
reg_bounds_sync().

One such example is test `(s64)[0xffffffff00000001; 0] (u64)<
0xffffffff00000000` from selftest patch from this patch set. It demonstrates an
intricate dance of u64 -> s64 -> u64 -> u32 bounds adjustments, which requires
two rounds of __reg_deduce_bounds(). Here are corresponding refinement log from
selftest, showing evolution of knowledge.

REFINING (FALSE R1) (u64)SRC=[0xffffffff00000000; U64_MAX] (u64)DST_OLD=[0; U64_MAX] (u64)DST_NEW=[0xffffffff00000000; U64_MAX]
REFINING (FALSE R1) (u64)SRC=[0xffffffff00000000; U64_MAX] (s64)DST_OLD=[0xffffffff00000001; 0] (s64)DST_NEW=[0xffffffff00000001; -1]
REFINING (FALSE R1) (s64)SRC=[0xffffffff00000001; -1] (u64)DST_OLD=[0xffffffff00000000; U64_MAX] (u64)DST_NEW=[0xffffffff00000001; U64_MAX]
REFINING (FALSE R1) (u64)SRC=[0xffffffff00000001; U64_MAX] (u32)DST_OLD=[0; U32_MAX] (u32)DST_NEW=[1; U32_MAX]

R1 initially has smin/smax set to [0xffffffff00000001; -1], while umin/umax is
unknown. After (u64)< comparison, in FALSE branch we gain knowledge that
umin/umax is [0xffffffff00000000; U64_MAX]. That causes smin/smax to learn that
zero can't happen and upper bound is -1. Then smin/smax is adjusted from
umin/umax improving lower bound from 0xffffffff00000000 to 0xffffffff00000001.
And then eventually umin32/umax32 bounds are drived from umin/umax and become
[1; U32_MAX].

Selftest in the last patch is actually implementing a multi-round fixed-point
convergence logic, but so far all the tests are handled by two rounds of
reg_bounds_sync() on the verifier state, so we keep it simple for now.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 969f1ecfe310..61f17b63ba00 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2605,6 +2605,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
 	__update_reg_bounds(reg);
 	/* We might have learned something about the sign bit. */
 	__reg_deduce_bounds(reg);
+	__reg_deduce_bounds(reg);
 	/* We might have learned some bits from the bounds. */
 	__reg_bound_offset(reg);
 	/* Intersecting with the old var_off might have improved our bounds
-- 
cgit v1.2.3


From 9e314f5d8682e1fe6ac214fb34580a238b6fd3c4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:51 -0700
Subject: bpf: drop knowledge-losing __reg_combine_{32,64}_into_{64,32} logic

When performing 32-bit conditional operation operating on lower 32 bits
of a full 64-bit register, register full value isn't changed. We just
potentially gain new knowledge about that register's lower 32 bits.

Unfortunately, __reg_combine_{32,64}_into_{64,32} logic that
reg_set_min_max() performs as a last step, can lose information in some
cases due to __mark_reg64_unbounded() and __reg_assign_32_into_64().
That's bad and completely unnecessary. Especially __reg_assign_32_into_64()
looks completely out of place here, because we are not performing
zero-extending subregister assignment during conditional jump.

So this patch replaced __reg_combine_* with just a normal
reg_bounds_sync() which will do a proper job of deriving u64/s64 bounds
from u32/s32, and vice versa (among all other combinations).

__reg_combine_64_into_32() is also used in one more place,
coerce_reg_to_size(), while handling 1- and 2-byte register loads.
Looking into this, it seems like besides marking subregister as
unbounded before performing reg_bounds_sync(), we were also performing
deduction of smin32/smax32 and umin32/umax32 bounds from respective
smin/smax and umin/umax bounds. It's now redundant as reg_bounds_sync()
performs all the same logic more generically (e.g., without unnecessary
assumption that upper 32 bits of full register should be zero).

Long story short, we remove __reg_combine_64_into_32() completely, and
coerce_reg_to_size() now only does resetting subreg to unbounded and then
performing reg_bounds_sync() to recover as much information as possible
from 64-bit umin/umax and smin/smax bounds, set explicitly in
coerce_reg_to_size() earlier.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231102033759.2541186-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 60 +++++++--------------------------------------------
 1 file changed, 8 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 61f17b63ba00..b4d6b5a032ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2639,51 +2639,6 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
 	}
 }
 
-static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
-{
-	/* special case when 64-bit register has upper 32-bit register
-	 * zeroed. Typically happens after zext or <<32, >>32 sequence
-	 * allowing us to use 32-bit bounds directly,
-	 */
-	if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) {
-		__reg_assign_32_into_64(reg);
-	} else {
-		/* Otherwise the best we can do is push lower 32bit known and
-		 * unknown bits into register (var_off set from jmp logic)
-		 * then learn as much as possible from the 64-bit tnum
-		 * known and unknown bits. The previous smin/smax bounds are
-		 * invalid here because of jmp32 compare so mark them unknown
-		 * so they do not impact tnum bounds calculation.
-		 */
-		__mark_reg64_unbounded(reg);
-	}
-	reg_bounds_sync(reg);
-}
-
-static bool __reg64_bound_s32(s64 a)
-{
-	return a >= S32_MIN && a <= S32_MAX;
-}
-
-static bool __reg64_bound_u32(u64 a)
-{
-	return a >= U32_MIN && a <= U32_MAX;
-}
-
-static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
-{
-	__mark_reg32_unbounded(reg);
-	if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
-		reg->s32_min_value = (s32)reg->smin_value;
-		reg->s32_max_value = (s32)reg->smax_value;
-	}
-	if (__reg64_bound_u32(reg->umin_value) && __reg64_bound_u32(reg->umax_value)) {
-		reg->u32_min_value = (u32)reg->umin_value;
-		reg->u32_max_value = (u32)reg->umax_value;
-	}
-	reg_bounds_sync(reg);
-}
-
 /* Mark a register as having a completely unknown (scalar) value. */
 static void __mark_reg_unknown(const struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg)
@@ -6387,9 +6342,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 	 * values are also truncated so we push 64-bit bounds into
 	 * 32-bit bounds. Above were truncated < 32-bits already.
 	 */
-	if (size >= 4)
-		return;
-	__reg_combine_64_into_32(reg);
+	if (size < 4) {
+		__mark_reg32_unbounded(reg);
+		reg_bounds_sync(reg);
+	}
 }
 
 static void set_sext64_default_val(struct bpf_reg_state *reg, int size)
@@ -14642,13 +14598,13 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 					     tnum_subreg(false_32off));
 		true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
 					    tnum_subreg(true_32off));
-		__reg_combine_32_into_64(false_reg);
-		__reg_combine_32_into_64(true_reg);
+		reg_bounds_sync(false_reg);
+		reg_bounds_sync(true_reg);
 	} else {
 		false_reg->var_off = false_64off;
 		true_reg->var_off = true_64off;
-		__reg_combine_64_into_32(false_reg);
-		__reg_combine_64_into_32(true_reg);
+		reg_bounds_sync(false_reg);
+		reg_bounds_sync(true_reg);
 	}
 }
 
-- 
cgit v1.2.3


From c2a3ab094683ddc154879a1364fc7cb0228f96a6 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:53 -0700
Subject: bpf: rename is_branch_taken reg arguments to prepare for the second
 one

Just taking mundane refactoring bits out into a separate patch. No
functional changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231102033759.2541186-12-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b4d6b5a032ce..770d5bd54ff3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14188,26 +14188,26 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	}));
 }
 
-static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
+static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode)
 {
-	struct tnum subreg = tnum_subreg(reg->var_off);
+	struct tnum subreg = tnum_subreg(reg1->var_off);
 	s32 sval = (s32)val;
 
 	switch (opcode) {
 	case BPF_JEQ:
 		if (tnum_is_const(subreg))
 			return !!tnum_equals_const(subreg, val);
-		else if (val < reg->u32_min_value || val > reg->u32_max_value)
+		else if (val < reg1->u32_min_value || val > reg1->u32_max_value)
 			return 0;
-		else if (sval < reg->s32_min_value || sval > reg->s32_max_value)
+		else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value)
 			return 0;
 		break;
 	case BPF_JNE:
 		if (tnum_is_const(subreg))
 			return !tnum_equals_const(subreg, val);
-		else if (val < reg->u32_min_value || val > reg->u32_max_value)
+		else if (val < reg1->u32_min_value || val > reg1->u32_max_value)
 			return 1;
-		else if (sval < reg->s32_min_value || sval > reg->s32_max_value)
+		else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value)
 			return 1;
 		break;
 	case BPF_JSET:
@@ -14217,51 +14217,51 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
 			return 0;
 		break;
 	case BPF_JGT:
-		if (reg->u32_min_value > val)
+		if (reg1->u32_min_value > val)
 			return 1;
-		else if (reg->u32_max_value <= val)
+		else if (reg1->u32_max_value <= val)
 			return 0;
 		break;
 	case BPF_JSGT:
-		if (reg->s32_min_value > sval)
+		if (reg1->s32_min_value > sval)
 			return 1;
-		else if (reg->s32_max_value <= sval)
+		else if (reg1->s32_max_value <= sval)
 			return 0;
 		break;
 	case BPF_JLT:
-		if (reg->u32_max_value < val)
+		if (reg1->u32_max_value < val)
 			return 1;
-		else if (reg->u32_min_value >= val)
+		else if (reg1->u32_min_value >= val)
 			return 0;
 		break;
 	case BPF_JSLT:
-		if (reg->s32_max_value < sval)
+		if (reg1->s32_max_value < sval)
 			return 1;
-		else if (reg->s32_min_value >= sval)
+		else if (reg1->s32_min_value >= sval)
 			return 0;
 		break;
 	case BPF_JGE:
-		if (reg->u32_min_value >= val)
+		if (reg1->u32_min_value >= val)
 			return 1;
-		else if (reg->u32_max_value < val)
+		else if (reg1->u32_max_value < val)
 			return 0;
 		break;
 	case BPF_JSGE:
-		if (reg->s32_min_value >= sval)
+		if (reg1->s32_min_value >= sval)
 			return 1;
-		else if (reg->s32_max_value < sval)
+		else if (reg1->s32_max_value < sval)
 			return 0;
 		break;
 	case BPF_JLE:
-		if (reg->u32_max_value <= val)
+		if (reg1->u32_max_value <= val)
 			return 1;
-		else if (reg->u32_min_value > val)
+		else if (reg1->u32_min_value > val)
 			return 0;
 		break;
 	case BPF_JSLE:
-		if (reg->s32_max_value <= sval)
+		if (reg1->s32_max_value <= sval)
 			return 1;
-		else if (reg->s32_min_value > sval)
+		else if (reg1->s32_min_value > sval)
 			return 0;
 		break;
 	}
@@ -14270,79 +14270,79 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
 }
 
 
-static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode)
 {
 	s64 sval = (s64)val;
 
 	switch (opcode) {
 	case BPF_JEQ:
-		if (tnum_is_const(reg->var_off))
-			return !!tnum_equals_const(reg->var_off, val);
-		else if (val < reg->umin_value || val > reg->umax_value)
+		if (tnum_is_const(reg1->var_off))
+			return !!tnum_equals_const(reg1->var_off, val);
+		else if (val < reg1->umin_value || val > reg1->umax_value)
 			return 0;
-		else if (sval < reg->smin_value || sval > reg->smax_value)
+		else if (sval < reg1->smin_value || sval > reg1->smax_value)
 			return 0;
 		break;
 	case BPF_JNE:
-		if (tnum_is_const(reg->var_off))
-			return !tnum_equals_const(reg->var_off, val);
-		else if (val < reg->umin_value || val > reg->umax_value)
+		if (tnum_is_const(reg1->var_off))
+			return !tnum_equals_const(reg1->var_off, val);
+		else if (val < reg1->umin_value || val > reg1->umax_value)
 			return 1;
-		else if (sval < reg->smin_value || sval > reg->smax_value)
+		else if (sval < reg1->smin_value || sval > reg1->smax_value)
 			return 1;
 		break;
 	case BPF_JSET:
-		if ((~reg->var_off.mask & reg->var_off.value) & val)
+		if ((~reg1->var_off.mask & reg1->var_off.value) & val)
 			return 1;
-		if (!((reg->var_off.mask | reg->var_off.value) & val))
+		if (!((reg1->var_off.mask | reg1->var_off.value) & val))
 			return 0;
 		break;
 	case BPF_JGT:
-		if (reg->umin_value > val)
+		if (reg1->umin_value > val)
 			return 1;
-		else if (reg->umax_value <= val)
+		else if (reg1->umax_value <= val)
 			return 0;
 		break;
 	case BPF_JSGT:
-		if (reg->smin_value > sval)
+		if (reg1->smin_value > sval)
 			return 1;
-		else if (reg->smax_value <= sval)
+		else if (reg1->smax_value <= sval)
 			return 0;
 		break;
 	case BPF_JLT:
-		if (reg->umax_value < val)
+		if (reg1->umax_value < val)
 			return 1;
-		else if (reg->umin_value >= val)
+		else if (reg1->umin_value >= val)
 			return 0;
 		break;
 	case BPF_JSLT:
-		if (reg->smax_value < sval)
+		if (reg1->smax_value < sval)
 			return 1;
-		else if (reg->smin_value >= sval)
+		else if (reg1->smin_value >= sval)
 			return 0;
 		break;
 	case BPF_JGE:
-		if (reg->umin_value >= val)
+		if (reg1->umin_value >= val)
 			return 1;
-		else if (reg->umax_value < val)
+		else if (reg1->umax_value < val)
 			return 0;
 		break;
 	case BPF_JSGE:
-		if (reg->smin_value >= sval)
+		if (reg1->smin_value >= sval)
 			return 1;
-		else if (reg->smax_value < sval)
+		else if (reg1->smax_value < sval)
 			return 0;
 		break;
 	case BPF_JLE:
-		if (reg->umax_value <= val)
+		if (reg1->umax_value <= val)
 			return 1;
-		else if (reg->umin_value > val)
+		else if (reg1->umin_value > val)
 			return 0;
 		break;
 	case BPF_JSLE:
-		if (reg->smax_value <= sval)
+		if (reg1->smax_value <= sval)
 			return 1;
-		else if (reg->smin_value > sval)
+		else if (reg1->smin_value > sval)
 			return 0;
 		break;
 	}
@@ -14357,11 +14357,11 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
  * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
  *      range [0,10]
  */
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
+static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode,
 			   bool is_jmp32)
 {
-	if (__is_pointer_value(false, reg)) {
-		if (!reg_not_null(reg))
+	if (__is_pointer_value(false, reg1)) {
+		if (!reg_not_null(reg1))
 			return -1;
 
 		/* If pointer is valid tests against zero will fail so we can
@@ -14381,8 +14381,8 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
 	}
 
 	if (is_jmp32)
-		return is_branch32_taken(reg, val, opcode);
-	return is_branch64_taken(reg, val, opcode);
+		return is_branch32_taken(reg1, val, opcode);
+	return is_branch64_taken(reg1, val, opcode);
 }
 
 static int flip_opcode(u32 opcode)
-- 
cgit v1.2.3


From c31534267c180f7ed00288d239a501b554885300 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:54 -0700
Subject: bpf: generalize is_branch_taken() to work with two registers

While still assuming that second register is a constant, generalize
is_branch_taken-related code to accept two registers instead of register
plus explicit constant value. This also, as a side effect, allows to
simplify check_cond_jmp_op() by unifying BPF_K case with BPF_X case, for
which we use a fake register to represent BPF_K's imm constant as
a register.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231102033759.2541186-13-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 770d5bd54ff3..79d01445093b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14188,9 +14188,13 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	}));
 }
 
-static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode)
+/*
+ * <reg1> <op> <reg2>, currently assuming reg2 is a constant
+ */
+static int is_branch32_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode)
 {
 	struct tnum subreg = tnum_subreg(reg1->var_off);
+	u32 val = (u32)tnum_subreg(reg2->var_off).value;
 	s32 sval = (s32)val;
 
 	switch (opcode) {
@@ -14270,8 +14274,12 @@ static int is_branch32_taken(struct bpf_reg_state *reg1, u32 val, u8 opcode)
 }
 
 
-static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode)
+/*
+ * <reg1> <op> <reg2>, currently assuming reg2 is a constant
+ */
+static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode)
 {
+	u64 val = reg2->var_off.value;
 	s64 sval = (s64)val;
 
 	switch (opcode) {
@@ -14350,16 +14358,23 @@ static int is_branch64_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode)
 	return -1;
 }
 
-/* compute branch direction of the expression "if (reg opcode val) goto target;"
+/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
  * and return:
  *  1 - branch will be taken and "goto target" will be executed
  *  0 - branch will not be taken and fall-through to next insn
- * -1 - unknown. Example: "if (reg < 5)" is unknown when register value
+ * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
  *      range [0,10]
  */
-static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode,
-			   bool is_jmp32)
+static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+			   u8 opcode, bool is_jmp32)
 {
+	struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
+	u64 val;
+
+	if (!tnum_is_const(reg2_tnum))
+		return -1;
+	val = reg2_tnum.value;
+
 	if (__is_pointer_value(false, reg1)) {
 		if (!reg_not_null(reg1))
 			return -1;
@@ -14381,8 +14396,8 @@ static int is_branch_taken(struct bpf_reg_state *reg1, u64 val, u8 opcode,
 	}
 
 	if (is_jmp32)
-		return is_branch32_taken(reg1, val, opcode);
-	return is_branch64_taken(reg1, val, opcode);
+		return is_branch32_taken(reg1, reg2, opcode);
+	return is_branch64_taken(reg1, reg2, opcode);
 }
 
 static int flip_opcode(u32 opcode)
@@ -14853,6 +14868,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
 	struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
 	struct bpf_reg_state *eq_branch_regs;
+	struct bpf_reg_state fake_reg = {};
 	u8 opcode = BPF_OP(insn->code);
 	bool is_jmp32;
 	int pred = -1;
@@ -14893,36 +14909,27 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
 			return -EINVAL;
 		}
+		src_reg = &fake_reg;
+		src_reg->type = SCALAR_VALUE;
+		__mark_reg_known(src_reg, insn->imm);
 	}
 
 	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
 
 	if (BPF_SRC(insn->code) == BPF_K) {
-		pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32);
+		pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
 	} else if (src_reg->type == SCALAR_VALUE &&
 		   is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
-		pred = is_branch_taken(dst_reg,
-				       tnum_subreg(src_reg->var_off).value,
-				       opcode,
-				       is_jmp32);
+		pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
 	} else if (src_reg->type == SCALAR_VALUE &&
 		   !is_jmp32 && tnum_is_const(src_reg->var_off)) {
-		pred = is_branch_taken(dst_reg,
-				       src_reg->var_off.value,
-				       opcode,
-				       is_jmp32);
+		pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
 	} else if (dst_reg->type == SCALAR_VALUE &&
 		   is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
-		pred = is_branch_taken(src_reg,
-				       tnum_subreg(dst_reg->var_off).value,
-				       flip_opcode(opcode),
-				       is_jmp32);
+		pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32);
 	} else if (dst_reg->type == SCALAR_VALUE &&
 		   !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
-		pred = is_branch_taken(src_reg,
-				       dst_reg->var_off.value,
-				       flip_opcode(opcode),
-				       is_jmp32);
+		pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32);
 	} else if (reg_is_pkt_pointer_any(dst_reg) &&
 		   reg_is_pkt_pointer_any(src_reg) &&
 		   !is_jmp32) {
-- 
cgit v1.2.3


From c697289efe4ef38bc5c62f119cb74433f784b826 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:55 -0700
Subject: bpf: move is_branch_taken() down

Move is_branch_taken() slightly down. In subsequent patched we'll need
both flip_opcode() and is_pkt_ptr_branch_taken() for is_branch_taken(),
but instead of sprinkling forward declarations around, it makes more
sense to move is_branch_taken() lower below is_pkt_ptr_branch_taken(),
and also keep it closer to very tightly related reg_set_min_max(), as
they are two critical parts of the same SCALAR range tracking logic.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-14-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 84 +++++++++++++++++++++++++--------------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 79d01445093b..414a7c58b4a4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14358,48 +14358,6 @@ static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *r
 	return -1;
 }
 
-/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
- * and return:
- *  1 - branch will be taken and "goto target" will be executed
- *  0 - branch will not be taken and fall-through to next insn
- * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
- *      range [0,10]
- */
-static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
-			   u8 opcode, bool is_jmp32)
-{
-	struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
-	u64 val;
-
-	if (!tnum_is_const(reg2_tnum))
-		return -1;
-	val = reg2_tnum.value;
-
-	if (__is_pointer_value(false, reg1)) {
-		if (!reg_not_null(reg1))
-			return -1;
-
-		/* If pointer is valid tests against zero will fail so we can
-		 * use this to direct branch taken.
-		 */
-		if (val != 0)
-			return -1;
-
-		switch (opcode) {
-		case BPF_JEQ:
-			return 0;
-		case BPF_JNE:
-			return 1;
-		default:
-			return -1;
-		}
-	}
-
-	if (is_jmp32)
-		return is_branch32_taken(reg1, reg2, opcode);
-	return is_branch64_taken(reg1, reg2, opcode);
-}
-
 static int flip_opcode(u32 opcode)
 {
 	/* How can we transform "a <op> b" into "b <op> a"? */
@@ -14461,6 +14419,48 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
 	return -1;
 }
 
+/* compute branch direction of the expression "if (<reg1> opcode <reg2>) goto target;"
+ * and return:
+ *  1 - branch will be taken and "goto target" will be executed
+ *  0 - branch will not be taken and fall-through to next insn
+ * -1 - unknown. Example: "if (reg1 < 5)" is unknown when register value
+ *      range [0,10]
+ */
+static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+			   u8 opcode, bool is_jmp32)
+{
+	struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
+	u64 val;
+
+	if (!tnum_is_const(reg2_tnum))
+		return -1;
+	val = reg2_tnum.value;
+
+	if (__is_pointer_value(false, reg1)) {
+		if (!reg_not_null(reg1))
+			return -1;
+
+		/* If pointer is valid tests against zero will fail so we can
+		 * use this to direct branch taken.
+		 */
+		if (val != 0)
+			return -1;
+
+		switch (opcode) {
+		case BPF_JEQ:
+			return 0;
+		case BPF_JNE:
+			return 1;
+		default:
+			return -1;
+		}
+	}
+
+	if (is_jmp32)
+		return is_branch32_taken(reg1, reg2, opcode);
+	return is_branch64_taken(reg1, reg2, opcode);
+}
+
 /* Adjusts the register min/max values in the case that the dst_reg is the
  * variable register that we are working on, and src_reg is a constant or we're
  * simply doing a BPF_K check.
-- 
cgit v1.2.3


From b74c2a842bba941945279027083fcee1e9aaa73f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:56 -0700
Subject: bpf: generalize is_branch_taken to handle all conditional jumps in
 one place

Make is_branch_taken() a single entry point for branch pruning decision
making, handling both pointer vs pointer, pointer vs scalar, and scalar
vs scalar cases in one place. This also nicely cleans up check_cond_jmp_op().

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-15-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 49 +++++++++++++++++++++++++------------------------
 1 file changed, 25 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 414a7c58b4a4..17bbff33e0e8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14188,6 +14188,19 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	}));
 }
 
+/* check if register is a constant scalar value */
+static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
+{
+	return reg->type == SCALAR_VALUE &&
+	       tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
+}
+
+/* assuming is_reg_const() is true, return constant value of a register */
+static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
+{
+	return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
+}
+
 /*
  * <reg1> <op> <reg2>, currently assuming reg2 is a constant
  */
@@ -14429,12 +14442,20 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
 static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
 			   u8 opcode, bool is_jmp32)
 {
-	struct tnum reg2_tnum = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
 	u64 val;
 
-	if (!tnum_is_const(reg2_tnum))
+	if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
+		return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
+
+	/* try to make sure reg2 is a constant SCALAR_VALUE */
+	if (!is_reg_const(reg2, is_jmp32)) {
+		opcode = flip_opcode(opcode);
+		swap(reg1, reg2);
+	}
+	/* for now we expect reg2 to be a constant to make any useful decisions */
+	if (!is_reg_const(reg2, is_jmp32))
 		return -1;
-	val = reg2_tnum.value;
+	val = reg_const_value(reg2, is_jmp32);
 
 	if (__is_pointer_value(false, reg1)) {
 		if (!reg_not_null(reg1))
@@ -14915,27 +14936,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	}
 
 	is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
-
-	if (BPF_SRC(insn->code) == BPF_K) {
-		pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
-	} else if (src_reg->type == SCALAR_VALUE &&
-		   is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) {
-		pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
-	} else if (src_reg->type == SCALAR_VALUE &&
-		   !is_jmp32 && tnum_is_const(src_reg->var_off)) {
-		pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
-	} else if (dst_reg->type == SCALAR_VALUE &&
-		   is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
-		pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32);
-	} else if (dst_reg->type == SCALAR_VALUE &&
-		   !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
-		pred = is_branch_taken(src_reg, dst_reg, flip_opcode(opcode), is_jmp32);
-	} else if (reg_is_pkt_pointer_any(dst_reg) &&
-		   reg_is_pkt_pointer_any(src_reg) &&
-		   !is_jmp32) {
-		pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode);
-	}
-
+	pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32);
 	if (pred >= 0) {
 		/* If we get here with a dst_reg pointer type it is because
 		 * above is_branch_taken() special cased the 0 comparison.
-- 
cgit v1.2.3


From 4d345887d2e5a1915600cb5d37b16c4088c6ee1c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:57 -0700
Subject: bpf: unify 32-bit and 64-bit is_branch_taken logic

Combine 32-bit and 64-bit is_branch_taken logic for SCALAR_VALUE
registers. It makes it easier to see parallels between two domains
(32-bit and 64-bit), and makes subsequent refactoring more
straightforward.

No functional changes.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-16-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 200 +++++++++++++++-----------------------------------
 1 file changed, 59 insertions(+), 141 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 17bbff33e0e8..c77cca5c4461 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14204,166 +14204,86 @@ static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
 /*
  * <reg1> <op> <reg2>, currently assuming reg2 is a constant
  */
-static int is_branch32_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode)
+static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+				  u8 opcode, bool is_jmp32)
 {
-	struct tnum subreg = tnum_subreg(reg1->var_off);
-	u32 val = (u32)tnum_subreg(reg2->var_off).value;
-	s32 sval = (s32)val;
+	struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
+	u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
+	u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
+	s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
+	s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
+	u64 uval = is_jmp32 ? (u32)tnum_subreg(reg2->var_off).value : reg2->var_off.value;
+	s64 sval = is_jmp32 ? (s32)uval : (s64)uval;
 
 	switch (opcode) {
 	case BPF_JEQ:
-		if (tnum_is_const(subreg))
-			return !!tnum_equals_const(subreg, val);
-		else if (val < reg1->u32_min_value || val > reg1->u32_max_value)
+		if (tnum_is_const(t1))
+			return !!tnum_equals_const(t1, uval);
+		else if (uval < umin1 || uval > umax1)
 			return 0;
-		else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value)
+		else if (sval < smin1 || sval > smax1)
 			return 0;
 		break;
 	case BPF_JNE:
-		if (tnum_is_const(subreg))
-			return !tnum_equals_const(subreg, val);
-		else if (val < reg1->u32_min_value || val > reg1->u32_max_value)
+		if (tnum_is_const(t1))
+			return !tnum_equals_const(t1, uval);
+		else if (uval < umin1 || uval > umax1)
 			return 1;
-		else if (sval < reg1->s32_min_value || sval > reg1->s32_max_value)
+		else if (sval < smin1 || sval > smax1)
 			return 1;
 		break;
 	case BPF_JSET:
-		if ((~subreg.mask & subreg.value) & val)
+		if ((~t1.mask & t1.value) & uval)
 			return 1;
-		if (!((subreg.mask | subreg.value) & val))
+		if (!((t1.mask | t1.value) & uval))
 			return 0;
 		break;
 	case BPF_JGT:
-		if (reg1->u32_min_value > val)
+		if (umin1 > uval )
 			return 1;
-		else if (reg1->u32_max_value <= val)
+		else if (umax1 <= uval)
 			return 0;
 		break;
 	case BPF_JSGT:
-		if (reg1->s32_min_value > sval)
+		if (smin1 > sval)
 			return 1;
-		else if (reg1->s32_max_value <= sval)
+		else if (smax1 <= sval)
 			return 0;
 		break;
 	case BPF_JLT:
-		if (reg1->u32_max_value < val)
+		if (umax1 < uval)
 			return 1;
-		else if (reg1->u32_min_value >= val)
+		else if (umin1 >= uval)
 			return 0;
 		break;
 	case BPF_JSLT:
-		if (reg1->s32_max_value < sval)
+		if (smax1 < sval)
 			return 1;
-		else if (reg1->s32_min_value >= sval)
+		else if (smin1 >= sval)
 			return 0;
 		break;
 	case BPF_JGE:
-		if (reg1->u32_min_value >= val)
+		if (umin1 >= uval)
 			return 1;
-		else if (reg1->u32_max_value < val)
+		else if (umax1 < uval)
 			return 0;
 		break;
 	case BPF_JSGE:
-		if (reg1->s32_min_value >= sval)
+		if (smin1 >= sval)
 			return 1;
-		else if (reg1->s32_max_value < sval)
+		else if (smax1 < sval)
 			return 0;
 		break;
 	case BPF_JLE:
-		if (reg1->u32_max_value <= val)
+		if (umax1 <= uval)
 			return 1;
-		else if (reg1->u32_min_value > val)
+		else if (umin1 > uval)
 			return 0;
 		break;
 	case BPF_JSLE:
-		if (reg1->s32_max_value <= sval)
+		if (smax1 <= sval)
 			return 1;
-		else if (reg1->s32_min_value > sval)
-			return 0;
-		break;
-	}
-
-	return -1;
-}
-
-
-/*
- * <reg1> <op> <reg2>, currently assuming reg2 is a constant
- */
-static int is_branch64_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, u8 opcode)
-{
-	u64 val = reg2->var_off.value;
-	s64 sval = (s64)val;
-
-	switch (opcode) {
-	case BPF_JEQ:
-		if (tnum_is_const(reg1->var_off))
-			return !!tnum_equals_const(reg1->var_off, val);
-		else if (val < reg1->umin_value || val > reg1->umax_value)
-			return 0;
-		else if (sval < reg1->smin_value || sval > reg1->smax_value)
-			return 0;
-		break;
-	case BPF_JNE:
-		if (tnum_is_const(reg1->var_off))
-			return !tnum_equals_const(reg1->var_off, val);
-		else if (val < reg1->umin_value || val > reg1->umax_value)
-			return 1;
-		else if (sval < reg1->smin_value || sval > reg1->smax_value)
-			return 1;
-		break;
-	case BPF_JSET:
-		if ((~reg1->var_off.mask & reg1->var_off.value) & val)
-			return 1;
-		if (!((reg1->var_off.mask | reg1->var_off.value) & val))
-			return 0;
-		break;
-	case BPF_JGT:
-		if (reg1->umin_value > val)
-			return 1;
-		else if (reg1->umax_value <= val)
-			return 0;
-		break;
-	case BPF_JSGT:
-		if (reg1->smin_value > sval)
-			return 1;
-		else if (reg1->smax_value <= sval)
-			return 0;
-		break;
-	case BPF_JLT:
-		if (reg1->umax_value < val)
-			return 1;
-		else if (reg1->umin_value >= val)
-			return 0;
-		break;
-	case BPF_JSLT:
-		if (reg1->smax_value < sval)
-			return 1;
-		else if (reg1->smin_value >= sval)
-			return 0;
-		break;
-	case BPF_JGE:
-		if (reg1->umin_value >= val)
-			return 1;
-		else if (reg1->umax_value < val)
-			return 0;
-		break;
-	case BPF_JSGE:
-		if (reg1->smin_value >= sval)
-			return 1;
-		else if (reg1->smax_value < sval)
-			return 0;
-		break;
-	case BPF_JLE:
-		if (reg1->umax_value <= val)
-			return 1;
-		else if (reg1->umin_value > val)
-			return 0;
-		break;
-	case BPF_JSLE:
-		if (reg1->smax_value <= sval)
-			return 1;
-		else if (reg1->smin_value > sval)
+		else if (smin1 > sval)
 			return 0;
 		break;
 	}
@@ -14477,9 +14397,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
 		}
 	}
 
-	if (is_jmp32)
-		return is_branch32_taken(reg1, reg2, opcode);
-	return is_branch64_taken(reg1, reg2, opcode);
+	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
 }
 
 /* Adjusts the register min/max values in the case that the dst_reg is the
@@ -14489,15 +14407,15 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
  */
 static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg,
-			    u64 val, u32 val32,
+			    u64 uval, u32 uval32,
 			    u8 opcode, bool is_jmp32)
 {
 	struct tnum false_32off = tnum_subreg(false_reg->var_off);
 	struct tnum false_64off = false_reg->var_off;
 	struct tnum true_32off = tnum_subreg(true_reg->var_off);
 	struct tnum true_64off = true_reg->var_off;
-	s64 sval = (s64)val;
-	s32 sval32 = (s32)val32;
+	s64 sval = (s64)uval;
+	s32 sval32 = (s32)uval32;
 
 	/* If the dst_reg is a pointer, we can't learn anything about its
 	 * variable offset from the compare (unless src_reg were a pointer into
@@ -14520,49 +14438,49 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	 */
 	case BPF_JEQ:
 		if (is_jmp32) {
-			__mark_reg32_known(true_reg, val32);
+			__mark_reg32_known(true_reg, uval32);
 			true_32off = tnum_subreg(true_reg->var_off);
 		} else {
-			___mark_reg_known(true_reg, val);
+			___mark_reg_known(true_reg, uval);
 			true_64off = true_reg->var_off;
 		}
 		break;
 	case BPF_JNE:
 		if (is_jmp32) {
-			__mark_reg32_known(false_reg, val32);
+			__mark_reg32_known(false_reg, uval32);
 			false_32off = tnum_subreg(false_reg->var_off);
 		} else {
-			___mark_reg_known(false_reg, val);
+			___mark_reg_known(false_reg, uval);
 			false_64off = false_reg->var_off;
 		}
 		break;
 	case BPF_JSET:
 		if (is_jmp32) {
-			false_32off = tnum_and(false_32off, tnum_const(~val32));
-			if (is_power_of_2(val32))
+			false_32off = tnum_and(false_32off, tnum_const(~uval32));
+			if (is_power_of_2(uval32))
 				true_32off = tnum_or(true_32off,
-						     tnum_const(val32));
+						     tnum_const(uval32));
 		} else {
-			false_64off = tnum_and(false_64off, tnum_const(~val));
-			if (is_power_of_2(val))
+			false_64off = tnum_and(false_64off, tnum_const(~uval));
+			if (is_power_of_2(uval))
 				true_64off = tnum_or(true_64off,
-						     tnum_const(val));
+						     tnum_const(uval));
 		}
 		break;
 	case BPF_JGE:
 	case BPF_JGT:
 	{
 		if (is_jmp32) {
-			u32 false_umax = opcode == BPF_JGT ? val32  : val32 - 1;
-			u32 true_umin = opcode == BPF_JGT ? val32 + 1 : val32;
+			u32 false_umax = opcode == BPF_JGT ? uval32  : uval32 - 1;
+			u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32;
 
 			false_reg->u32_max_value = min(false_reg->u32_max_value,
 						       false_umax);
 			true_reg->u32_min_value = max(true_reg->u32_min_value,
 						      true_umin);
 		} else {
-			u64 false_umax = opcode == BPF_JGT ? val    : val - 1;
-			u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
+			u64 false_umax = opcode == BPF_JGT ? uval    : uval - 1;
+			u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval;
 
 			false_reg->umax_value = min(false_reg->umax_value, false_umax);
 			true_reg->umin_value = max(true_reg->umin_value, true_umin);
@@ -14591,16 +14509,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	case BPF_JLT:
 	{
 		if (is_jmp32) {
-			u32 false_umin = opcode == BPF_JLT ? val32  : val32 + 1;
-			u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32;
+			u32 false_umin = opcode == BPF_JLT ? uval32  : uval32 + 1;
+			u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32;
 
 			false_reg->u32_min_value = max(false_reg->u32_min_value,
 						       false_umin);
 			true_reg->u32_max_value = min(true_reg->u32_max_value,
 						      true_umax);
 		} else {
-			u64 false_umin = opcode == BPF_JLT ? val    : val + 1;
-			u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
+			u64 false_umin = opcode == BPF_JLT ? uval    : uval + 1;
+			u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval;
 
 			false_reg->umin_value = max(false_reg->umin_value, false_umin);
 			true_reg->umax_value = min(true_reg->umax_value, true_umax);
@@ -14649,7 +14567,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
  */
 static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg,
-				u64 val, u32 val32,
+				u64 uval, u32 uval32,
 				u8 opcode, bool is_jmp32)
 {
 	opcode = flip_opcode(opcode);
@@ -14657,7 +14575,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 	 * BPF_JA, can't get here.
 	 */
 	if (opcode)
-		reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32);
+		reg_set_min_max(true_reg, false_reg, uval, uval32, opcode, is_jmp32);
 }
 
 /* Regs are known to be equal, so intersect their min/max/var_off */
-- 
cgit v1.2.3


From 811476e9cc578cb6c776627ac069dc45a8431791 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:58 -0700
Subject: bpf: prepare reg_set_min_max for second set of registers

Similarly to is_branch_taken()-related refactorings, start preparing
reg_set_min_max() to handle more generic case of two non-const
registers. Start with renaming arguments to accommodate later addition
of second register as an input argument.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-17-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 80 +++++++++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c77cca5c4461..40ed261d3489 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14405,25 +14405,25 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
  * simply doing a BPF_K check.
  * In JEQ/JNE cases we also adjust the var_off values.
  */
-static void reg_set_min_max(struct bpf_reg_state *true_reg,
-			    struct bpf_reg_state *false_reg,
+static void reg_set_min_max(struct bpf_reg_state *true_reg1,
+			    struct bpf_reg_state *false_reg1,
 			    u64 uval, u32 uval32,
 			    u8 opcode, bool is_jmp32)
 {
-	struct tnum false_32off = tnum_subreg(false_reg->var_off);
-	struct tnum false_64off = false_reg->var_off;
-	struct tnum true_32off = tnum_subreg(true_reg->var_off);
-	struct tnum true_64off = true_reg->var_off;
+	struct tnum false_32off = tnum_subreg(false_reg1->var_off);
+	struct tnum false_64off = false_reg1->var_off;
+	struct tnum true_32off = tnum_subreg(true_reg1->var_off);
+	struct tnum true_64off = true_reg1->var_off;
 	s64 sval = (s64)uval;
 	s32 sval32 = (s32)uval32;
 
 	/* If the dst_reg is a pointer, we can't learn anything about its
 	 * variable offset from the compare (unless src_reg were a pointer into
 	 * the same object, but we don't bother with that.
-	 * Since false_reg and true_reg have the same type by construction, we
+	 * Since false_reg1 and true_reg1 have the same type by construction, we
 	 * only need to check one of them for pointerness.
 	 */
-	if (__is_pointer_value(false, false_reg))
+	if (__is_pointer_value(false, false_reg1))
 		return;
 
 	switch (opcode) {
@@ -14438,20 +14438,20 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	 */
 	case BPF_JEQ:
 		if (is_jmp32) {
-			__mark_reg32_known(true_reg, uval32);
-			true_32off = tnum_subreg(true_reg->var_off);
+			__mark_reg32_known(true_reg1, uval32);
+			true_32off = tnum_subreg(true_reg1->var_off);
 		} else {
-			___mark_reg_known(true_reg, uval);
-			true_64off = true_reg->var_off;
+			___mark_reg_known(true_reg1, uval);
+			true_64off = true_reg1->var_off;
 		}
 		break;
 	case BPF_JNE:
 		if (is_jmp32) {
-			__mark_reg32_known(false_reg, uval32);
-			false_32off = tnum_subreg(false_reg->var_off);
+			__mark_reg32_known(false_reg1, uval32);
+			false_32off = tnum_subreg(false_reg1->var_off);
 		} else {
-			___mark_reg_known(false_reg, uval);
-			false_64off = false_reg->var_off;
+			___mark_reg_known(false_reg1, uval);
+			false_64off = false_reg1->var_off;
 		}
 		break;
 	case BPF_JSET:
@@ -14474,16 +14474,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			u32 false_umax = opcode == BPF_JGT ? uval32  : uval32 - 1;
 			u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32;
 
-			false_reg->u32_max_value = min(false_reg->u32_max_value,
+			false_reg1->u32_max_value = min(false_reg1->u32_max_value,
 						       false_umax);
-			true_reg->u32_min_value = max(true_reg->u32_min_value,
+			true_reg1->u32_min_value = max(true_reg1->u32_min_value,
 						      true_umin);
 		} else {
 			u64 false_umax = opcode == BPF_JGT ? uval    : uval - 1;
 			u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval;
 
-			false_reg->umax_value = min(false_reg->umax_value, false_umax);
-			true_reg->umin_value = max(true_reg->umin_value, true_umin);
+			false_reg1->umax_value = min(false_reg1->umax_value, false_umax);
+			true_reg1->umin_value = max(true_reg1->umin_value, true_umin);
 		}
 		break;
 	}
@@ -14494,14 +14494,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			s32 false_smax = opcode == BPF_JSGT ? sval32    : sval32 - 1;
 			s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
 
-			false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax);
-			true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin);
+			false_reg1->s32_max_value = min(false_reg1->s32_max_value, false_smax);
+			true_reg1->s32_min_value = max(true_reg1->s32_min_value, true_smin);
 		} else {
 			s64 false_smax = opcode == BPF_JSGT ? sval    : sval - 1;
 			s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
 
-			false_reg->smax_value = min(false_reg->smax_value, false_smax);
-			true_reg->smin_value = max(true_reg->smin_value, true_smin);
+			false_reg1->smax_value = min(false_reg1->smax_value, false_smax);
+			true_reg1->smin_value = max(true_reg1->smin_value, true_smin);
 		}
 		break;
 	}
@@ -14512,16 +14512,16 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			u32 false_umin = opcode == BPF_JLT ? uval32  : uval32 + 1;
 			u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32;
 
-			false_reg->u32_min_value = max(false_reg->u32_min_value,
+			false_reg1->u32_min_value = max(false_reg1->u32_min_value,
 						       false_umin);
-			true_reg->u32_max_value = min(true_reg->u32_max_value,
+			true_reg1->u32_max_value = min(true_reg1->u32_max_value,
 						      true_umax);
 		} else {
 			u64 false_umin = opcode == BPF_JLT ? uval    : uval + 1;
 			u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval;
 
-			false_reg->umin_value = max(false_reg->umin_value, false_umin);
-			true_reg->umax_value = min(true_reg->umax_value, true_umax);
+			false_reg1->umin_value = max(false_reg1->umin_value, false_umin);
+			true_reg1->umax_value = min(true_reg1->umax_value, true_umax);
 		}
 		break;
 	}
@@ -14532,14 +14532,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			s32 false_smin = opcode == BPF_JSLT ? sval32    : sval32 + 1;
 			s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
 
-			false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin);
-			true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax);
+			false_reg1->s32_min_value = max(false_reg1->s32_min_value, false_smin);
+			true_reg1->s32_max_value = min(true_reg1->s32_max_value, true_smax);
 		} else {
 			s64 false_smin = opcode == BPF_JSLT ? sval    : sval + 1;
 			s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
 
-			false_reg->smin_value = max(false_reg->smin_value, false_smin);
-			true_reg->smax_value = min(true_reg->smax_value, true_smax);
+			false_reg1->smin_value = max(false_reg1->smin_value, false_smin);
+			true_reg1->smax_value = min(true_reg1->smax_value, true_smax);
 		}
 		break;
 	}
@@ -14548,17 +14548,17 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	}
 
 	if (is_jmp32) {
-		false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off),
+		false_reg1->var_off = tnum_or(tnum_clear_subreg(false_64off),
 					     tnum_subreg(false_32off));
-		true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off),
+		true_reg1->var_off = tnum_or(tnum_clear_subreg(true_64off),
 					    tnum_subreg(true_32off));
-		reg_bounds_sync(false_reg);
-		reg_bounds_sync(true_reg);
+		reg_bounds_sync(false_reg1);
+		reg_bounds_sync(true_reg1);
 	} else {
-		false_reg->var_off = false_64off;
-		true_reg->var_off = true_64off;
-		reg_bounds_sync(false_reg);
-		reg_bounds_sync(true_reg);
+		false_reg1->var_off = false_64off;
+		true_reg1->var_off = true_64off;
+		reg_bounds_sync(false_reg1);
+		reg_bounds_sync(true_reg1);
 	}
 }
 
-- 
cgit v1.2.3


From 4621202adc5bc0d1006af37fe8b9aca131387d3c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 1 Nov 2023 20:37:59 -0700
Subject: bpf: generalize reg_set_min_max() to handle two sets of two registers

Change reg_set_min_max() to take FALSE/TRUE sets of two registers each,
instead of assuming that we are always comparing to a constant. For now
we still assume that right-hand side registers are constants (and make
sure that's the case by swapping src/dst regs, if necessary), but
subsequent patches will remove this limitation.

reg_set_min_max() is now called unconditionally for any register
comparison, so that might include pointer vs pointer. This makes it
consistent with is_branch_taken() generality. But we currently only
support adjustments based on SCALAR vs SCALAR comparisons, so
reg_set_min_max() has to guard itself againts pointers.

Taking two by two registers allows to further unify and simplify
check_cond_jmp_op() logic. We utilize fake register for BPF_K
conditional jump case, just like with is_branch_taken() part.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231102033759.2541186-18-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 131 +++++++++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 40ed261d3489..e801c50d3857 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14400,32 +14400,50 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
 	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
 }
 
-/* Adjusts the register min/max values in the case that the dst_reg is the
- * variable register that we are working on, and src_reg is a constant or we're
- * simply doing a BPF_K check.
- * In JEQ/JNE cases we also adjust the var_off values.
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
  */
 static void reg_set_min_max(struct bpf_reg_state *true_reg1,
+			    struct bpf_reg_state *true_reg2,
 			    struct bpf_reg_state *false_reg1,
-			    u64 uval, u32 uval32,
+			    struct bpf_reg_state *false_reg2,
 			    u8 opcode, bool is_jmp32)
 {
-	struct tnum false_32off = tnum_subreg(false_reg1->var_off);
-	struct tnum false_64off = false_reg1->var_off;
-	struct tnum true_32off = tnum_subreg(true_reg1->var_off);
-	struct tnum true_64off = true_reg1->var_off;
-	s64 sval = (s64)uval;
-	s32 sval32 = (s32)uval32;
-
-	/* If the dst_reg is a pointer, we can't learn anything about its
-	 * variable offset from the compare (unless src_reg were a pointer into
-	 * the same object, but we don't bother with that.
-	 * Since false_reg1 and true_reg1 have the same type by construction, we
-	 * only need to check one of them for pointerness.
+	struct tnum false_32off, false_64off;
+	struct tnum true_32off, true_64off;
+	u64 uval;
+	u32 uval32;
+	s64 sval;
+	s32 sval32;
+
+	/* If either register is a pointer, we can't learn anything about its
+	 * variable offset from the compare (unless they were a pointer into
+	 * the same object, but we don't bother with that).
 	 */
-	if (__is_pointer_value(false, false_reg1))
+	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+		return;
+
+	/* we expect right-hand registers (src ones) to be constants, for now */
+	if (!is_reg_const(false_reg2, is_jmp32)) {
+		opcode = flip_opcode(opcode);
+		swap(true_reg1, true_reg2);
+		swap(false_reg1, false_reg2);
+	}
+	if (!is_reg_const(false_reg2, is_jmp32))
 		return;
 
+	false_32off = tnum_subreg(false_reg1->var_off);
+	false_64off = false_reg1->var_off;
+	true_32off = tnum_subreg(true_reg1->var_off);
+	true_64off = true_reg1->var_off;
+	uval = false_reg2->var_off.value;
+	uval32 = (u32)tnum_subreg(false_reg2->var_off).value;
+	sval = (s64)uval;
+	sval32 = (s32)uval32;
+
 	switch (opcode) {
 	/* JEQ/JNE comparison doesn't change the register equivalence.
 	 *
@@ -14562,22 +14580,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1,
 	}
 }
 
-/* Same as above, but for the case that dst_reg holds a constant and src_reg is
- * the variable reg.
- */
-static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
-				struct bpf_reg_state *false_reg,
-				u64 uval, u32 uval32,
-				u8 opcode, bool is_jmp32)
-{
-	opcode = flip_opcode(opcode);
-	/* This uses zero as "not present in table"; luckily the zero opcode,
-	 * BPF_JA, can't get here.
-	 */
-	if (opcode)
-		reg_set_min_max(true_reg, false_reg, uval, uval32, opcode, is_jmp32);
-}
-
 /* Regs are known to be equal, so intersect their min/max/var_off */
 static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
 				  struct bpf_reg_state *dst_reg)
@@ -14902,53 +14904,32 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		return -EFAULT;
 	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
 
-	/* detect if we are comparing against a constant value so we can adjust
-	 * our min/max values for our dst register.
-	 * this is only legit if both are scalars (or pointers to the same
-	 * object, I suppose, see the PTR_MAYBE_NULL related if block below),
-	 * because otherwise the different base pointers mean the offsets aren't
-	 * comparable.
-	 */
 	if (BPF_SRC(insn->code) == BPF_X) {
-		struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+		reg_set_min_max(&other_branch_regs[insn->dst_reg],
+				&other_branch_regs[insn->src_reg],
+				dst_reg, src_reg, opcode, is_jmp32);
 
 		if (dst_reg->type == SCALAR_VALUE &&
-		    src_reg->type == SCALAR_VALUE) {
-			if (tnum_is_const(src_reg->var_off) ||
-			    (is_jmp32 &&
-			     tnum_is_const(tnum_subreg(src_reg->var_off))))
-				reg_set_min_max(&other_branch_regs[insn->dst_reg],
-						dst_reg,
-						src_reg->var_off.value,
-						tnum_subreg(src_reg->var_off).value,
-						opcode, is_jmp32);
-			else if (tnum_is_const(dst_reg->var_off) ||
-				 (is_jmp32 &&
-				  tnum_is_const(tnum_subreg(dst_reg->var_off))))
-				reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
-						    src_reg,
-						    dst_reg->var_off.value,
-						    tnum_subreg(dst_reg->var_off).value,
-						    opcode, is_jmp32);
-			else if (!is_jmp32 &&
-				 (opcode == BPF_JEQ || opcode == BPF_JNE))
-				/* Comparing for equality, we can combine knowledge */
-				reg_combine_min_max(&other_branch_regs[insn->src_reg],
-						    &other_branch_regs[insn->dst_reg],
-						    src_reg, dst_reg, opcode);
-			if (src_reg->id &&
-			    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
-				find_equal_scalars(this_branch, src_reg);
-				find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
-			}
-
-		}
-	} else if (dst_reg->type == SCALAR_VALUE) {
+		    src_reg->type == SCALAR_VALUE &&
+		    !is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+			/* Comparing for equality, we can combine knowledge */
+			reg_combine_min_max(&other_branch_regs[insn->src_reg],
+					    &other_branch_regs[insn->dst_reg],
+					    src_reg, dst_reg, opcode);
+		}
+	} else /* BPF_SRC(insn->code) == BPF_K */ {
 		reg_set_min_max(&other_branch_regs[insn->dst_reg],
-					dst_reg, insn->imm, (u32)insn->imm,
-					opcode, is_jmp32);
+				src_reg /* fake one */,
+				dst_reg, src_reg /* same fake one */,
+				opcode, is_jmp32);
 	}
 
+	if (BPF_SRC(insn->code) == BPF_X &&
+	    src_reg->type == SCALAR_VALUE && src_reg->id &&
+	    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
+		find_equal_scalars(this_branch, src_reg);
+		find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+	}
 	if (dst_reg->type == SCALAR_VALUE && dst_reg->id &&
 	    !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) {
 		find_equal_scalars(this_branch, dst_reg);
-- 
cgit v1.2.3


From 9b75dbeb36fcd9fc7ed51d370310d0518a387769 Mon Sep 17 00:00:00 2001
From: Florian Lehner <dev@der-flo.net>
Date: Sun, 5 Nov 2023 09:58:01 +0100
Subject: bpf, lpm: Fix check prefixlen before walking trie

When looking up an element in LPM trie, the condition 'matchlen ==
trie->max_prefixlen' will never return true, if key->prefixlen is larger
than trie->max_prefixlen. Consequently all elements in the LPM trie will
be visited and no element is returned in the end.

To resolve this, check key->prefixlen first before walking the LPM trie.

Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation")
Signed-off-by: Florian Lehner <dev@der-flo.net>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231105085801.3742-1-dev@der-flo.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/lpm_trie.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 17c7e7782a1f..b32be680da6c 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -231,6 +231,9 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 	struct lpm_trie_node *node, *found = NULL;
 	struct bpf_lpm_trie_key *key = _key;
 
+	if (key->prefixlen > trie->max_prefixlen)
+		return NULL;
+
 	/* Start walking the trie from the root node ... */
 
 	for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
-- 
cgit v1.2.3


From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 6 Nov 2023 20:57:23 -0800
Subject: bpf: Add __bpf_dynptr_data* for in kernel use

Different types of bpf dynptr have different internal data storage.
Specifically, SKB and XDP type of dynptr may have non-continuous data.
Therefore, it is not always safe to directly access dynptr->data.

Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to
dynptr->data.

Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of
dynptr->data.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/helpers.c     | 19 +++++++++++++++++++
 kernel/trace/bpf_trace.c | 12 ++++++++----
 3 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..eb84caf133df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1222,6 +1222,8 @@ enum bpf_dynptr_type {
 
 int bpf_dynptr_check_size(u32 size);
 u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
 
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 56b0c1f678ee..174f02a9e703 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2618,3 +2618,22 @@ static int __init kfunc_init(void)
 }
 
 late_initcall(kfunc_init);
+
+/* Get a pointer to dynptr data up to len bytes for read only access. If
+ * the dynptr doesn't have continuous data up to len bytes, return NULL.
+ */
+const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+	return bpf_dynptr_slice(ptr, 0, NULL, len);
+}
+
+/* Get a pointer to dynptr data up to len bytes for read write access. If
+ * the dynptr doesn't have continuous data up to len bytes, or the dynptr
+ * is read only, return NULL.
+ */
+void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len)
+{
+	if (__bpf_dynptr_is_rdonly(ptr))
+		return NULL;
+	return (void *)__bpf_dynptr_data(ptr, len);
+}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 84e8a0f6e4e0..f0b8b7c29126 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1376,6 +1376,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 			       struct bpf_dynptr_kern *sig_ptr,
 			       struct bpf_key *trusted_keyring)
 {
+	const void *data, *sig;
+	u32 data_len, sig_len;
 	int ret;
 
 	if (trusted_keyring->has_ref) {
@@ -1392,10 +1394,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
 			return ret;
 	}
 
-	return verify_pkcs7_signature(data_ptr->data,
-				      __bpf_dynptr_size(data_ptr),
-				      sig_ptr->data,
-				      __bpf_dynptr_size(sig_ptr),
+	data_len = __bpf_dynptr_size(data_ptr);
+	data = __bpf_dynptr_data(data_ptr, data_len);
+	sig_len = __bpf_dynptr_size(sig_ptr);
+	sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+	return verify_pkcs7_signature(data, data_len, sig, sig_len,
 				      trusted_keyring->key,
 				      VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
 				      NULL);
-- 
cgit v1.2.3


From 0b51940729150e807fc4b7767164e6bb6cf4f7dd Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 6 Nov 2023 20:57:24 -0800
Subject: bpf: Factor out helper check_reg_const_str()

ARG_PTR_TO_CONST_STR is used to specify constant string args for BPF
helpers. The logic that verifies a reg is ARG_PTR_TO_CONST_STR is
implemented in check_func_arg().

As we introduce kfuncs with constant string args, it is necessary to
do the same check for kfuncs (in check_kfunc_args). Factor out the logic
for ARG_PTR_TO_CONST_STR to a new check_reg_const_str() so that it can be
reused.

check_func_arg() ensures check_reg_const_str() is only called with reg of
type PTR_TO_MAP_VALUE. Add a redundent type check in check_reg_const_str()
to avoid misuse in the future. Other than this redundent check, there is
no change in behavior.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://lore.kernel.org/bpf/20231107045725.2278852-3-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 85 +++++++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e801c50d3857..637f34231633 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8725,6 +8725,54 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
 	return state->stack[spi].spilled_ptr.dynptr.type;
 }
 
+static int check_reg_const_str(struct bpf_verifier_env *env,
+			       struct bpf_reg_state *reg, u32 regno)
+{
+	struct bpf_map *map = reg->map_ptr;
+	int err;
+	int map_off;
+	u64 map_addr;
+	char *str_ptr;
+
+	if (reg->type != PTR_TO_MAP_VALUE)
+		return -EINVAL;
+
+	if (!bpf_map_is_rdonly(map)) {
+		verbose(env, "R%d does not point to a readonly map'\n", regno);
+		return -EACCES;
+	}
+
+	if (!tnum_is_const(reg->var_off)) {
+		verbose(env, "R%d is not a constant address'\n", regno);
+		return -EACCES;
+	}
+
+	if (!map->ops->map_direct_value_addr) {
+		verbose(env, "no direct value access support for this map type\n");
+		return -EACCES;
+	}
+
+	err = check_map_access(env, regno, reg->off,
+			       map->value_size - reg->off, false,
+			       ACCESS_HELPER);
+	if (err)
+		return err;
+
+	map_off = reg->off + reg->var_off.value;
+	err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
+	if (err) {
+		verbose(env, "direct value access on string failed\n");
+		return err;
+	}
+
+	str_ptr = (char *)(long)(map_addr);
+	if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
+		verbose(env, "string is not zero-terminated\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
 			  const struct bpf_func_proto *fn,
@@ -8969,44 +9017,9 @@ skip_type_check:
 	}
 	case ARG_PTR_TO_CONST_STR:
 	{
-		struct bpf_map *map = reg->map_ptr;
-		int map_off;
-		u64 map_addr;
-		char *str_ptr;
-
-		if (!bpf_map_is_rdonly(map)) {
-			verbose(env, "R%d does not point to a readonly map'\n", regno);
-			return -EACCES;
-		}
-
-		if (!tnum_is_const(reg->var_off)) {
-			verbose(env, "R%d is not a constant address'\n", regno);
-			return -EACCES;
-		}
-
-		if (!map->ops->map_direct_value_addr) {
-			verbose(env, "no direct value access support for this map type\n");
-			return -EACCES;
-		}
-
-		err = check_map_access(env, regno, reg->off,
-				       map->value_size - reg->off, false,
-				       ACCESS_HELPER);
+		err = check_reg_const_str(env, reg, regno);
 		if (err)
 			return err;
-
-		map_off = reg->off + reg->var_off.value;
-		err = map->ops->map_direct_value_addr(map, &map_addr, map_off);
-		if (err) {
-			verbose(env, "direct value access on string failed\n");
-			return err;
-		}
-
-		str_ptr = (char *)(long)(map_addr);
-		if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) {
-			verbose(env, "string is not zero-terminated\n");
-			return -EINVAL;
-		}
 		break;
 	}
 	case ARG_PTR_TO_KPTR:
-- 
cgit v1.2.3


From 045edee19d591e59ed53772bf6dfc9b1ed9577eb Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 6 Nov 2023 20:57:25 -0800
Subject: bpf: Introduce KF_ARG_PTR_TO_CONST_STR

Similar to ARG_PTR_TO_CONST_STR for BPF helpers, KF_ARG_PTR_TO_CONST_STR
specifies kfunc args that point to const strings. Annotation "__str" is
used to specify kfunc arg of type KF_ARG_PTR_TO_CONST_STR. Also, add
documentation for the "__str" annotation.

bpf_get_file_xattr() will be the first kfunc that uses this type.

Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://lore.kernel.org/bpf/20231107045725.2278852-4-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst | 24 ++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 19 +++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 723408e399ab..7985c6615f3c 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -135,6 +135,30 @@ Either way, the returned buffer is either NULL, or of size buffer_szk. Without t
 annotation, the verifier will reject the program if a null pointer is passed in with
 a nonzero size.
 
+2.2.5 __str Annotation
+----------------------------
+This annotation is used to indicate that the argument is a constant string.
+
+An example is given below::
+
+        __bpf_kfunc bpf_get_file_xattr(..., const char *name__str, ...)
+        {
+        ...
+        }
+
+In this case, ``bpf_get_file_xattr()`` can be called as::
+
+        bpf_get_file_xattr(..., "xattr_name", ...);
+
+Or::
+
+        const char name[] = "xattr_name";  /* This need to be global */
+        int BPF_PROG(...)
+        {
+                ...
+                bpf_get_file_xattr(..., name, ...);
+                ...
+        }
 
 .. _BPF_kfunc_nodef:
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 637f34231633..9276e0abcb4b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -10810,6 +10810,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param
 	return __kfunc_param_match_suffix(btf, arg, "__nullable");
 }
 
+static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg)
+{
+	return __kfunc_param_match_suffix(btf, arg, "__str");
+}
+
 static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
 					  const struct btf_param *arg,
 					  const char *name)
@@ -10953,6 +10958,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_RB_ROOT,
 	KF_ARG_PTR_TO_RB_NODE,
 	KF_ARG_PTR_TO_NULL,
+	KF_ARG_PTR_TO_CONST_STR,
 };
 
 enum special_kfunc_type {
@@ -11103,6 +11109,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_RB_NODE;
 
+	if (is_kfunc_arg_const_str(meta->btf, &args[argno]))
+		return KF_ARG_PTR_TO_CONST_STR;
+
 	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
 		if (!btf_type_is_struct(ref_t)) {
 			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11734,6 +11743,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 		case KF_ARG_PTR_TO_MEM_SIZE:
 		case KF_ARG_PTR_TO_CALLBACK:
 		case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+		case KF_ARG_PTR_TO_CONST_STR:
 			/* Trusted by default */
 			break;
 		default:
@@ -12005,6 +12015,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			meta->arg_btf = reg->btf;
 			meta->arg_btf_id = reg->btf_id;
 			break;
+		case KF_ARG_PTR_TO_CONST_STR:
+			if (reg->type != PTR_TO_MAP_VALUE) {
+				verbose(env, "arg#%d doesn't point to a const string\n", i);
+				return -EINVAL;
+			}
+			ret = check_reg_const_str(env, reg, regno);
+			if (ret)
+				return ret;
+			break;
 		}
 	}
 
-- 
cgit v1.2.3


From 82ce364c6087e31ff9837380a4641a856284064c Mon Sep 17 00:00:00 2001
From: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Date: Wed, 8 Nov 2023 22:00:41 +0800
Subject: bpf: replace register_is_const() with is_reg_const()

The addition of is_reg_const() in commit 171de12646d2 ("bpf: generalize
is_branch_taken to handle all conditional jumps in one place") has made the
register_is_const() redundant. Give the former has more feature, plus the
fact the latter is only used in one place, replace register_is_const() with
is_reg_const(), and remove the definition of register_is_const.

This requires moving the definition of is_reg_const() further up. And since
the comment of reg_const_value() reference is_reg_const(), move it up as
well.

Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231108140043.12282-1-shung-hsi.yu@suse.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9276e0abcb4b..993e4677bbe9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4690,9 +4690,17 @@ static bool register_is_null(struct bpf_reg_state *reg)
 	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
 }
 
-static bool register_is_const(struct bpf_reg_state *reg)
+/* check if register is a constant scalar value */
+static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
+{
+	return reg->type == SCALAR_VALUE &&
+	       tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
+}
+
+/* assuming is_reg_const() is true, return constant value of a register */
+static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
 {
-	return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
+	return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
 }
 
 static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
@@ -10050,7 +10058,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	val = reg->var_off.value;
 	max = map->max_entries;
 
-	if (!(register_is_const(reg) && val < max)) {
+	if (!(is_reg_const(reg, false) && val < max)) {
 		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
 		return 0;
 	}
@@ -14220,19 +14228,6 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	}));
 }
 
-/* check if register is a constant scalar value */
-static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32)
-{
-	return reg->type == SCALAR_VALUE &&
-	       tnum_is_const(subreg32 ? tnum_subreg(reg->var_off) : reg->var_off);
-}
-
-/* assuming is_reg_const() is true, return constant value of a register */
-static u64 reg_const_value(struct bpf_reg_state *reg, bool subreg32)
-{
-	return subreg32 ? tnum_subreg(reg->var_off).value : reg->var_off.value;
-}
-
 /*
  * <reg1> <op> <reg2>, currently assuming reg2 is a constant
  */
-- 
cgit v1.2.3


From 1500a5d9f49cb66906d3ea1c9158df25cc41dd40 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 7 Nov 2023 00:56:34 -0800
Subject: bpf: Add KF_RCU flag to bpf_refcount_acquire_impl

Refcounted local kptrs are kptrs to user-defined types with a
bpf_refcount field. Recent commits ([0], [1]) modified the lifetime of
refcounted local kptrs such that the underlying memory is not reused
until RCU grace period has elapsed.

Separately, verification of bpf_refcount_acquire calls currently
succeeds for MAYBE_NULL non-owning reference input, which is a problem
as bpf_refcount_acquire_impl has no handling for this case.

This patch takes advantage of aforementioned lifetime changes to tag
bpf_refcount_acquire_impl kfunc KF_RCU, thereby preventing MAYBE_NULL
input to the kfunc. The KF_RCU flag applies to all kfunc params; it's
fine for it to apply to the void *meta__ign param as that's populated by
the verifier and is tagged __ign regardless.

  [0]: commit 7e26cd12ad1c ("bpf: Use bpf_mem_free_rcu when
       bpf_obj_dropping refcounted nodes") is the actual change to
       allocation behaivor
  [1]: commit 0816b8c6bf7f ("bpf: Consider non-owning refs to refcounted
       nodes RCU protected") modified verifier understanding of
       refcounted local kptrs to match [0]'s changes

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Fixes: 7c50b1cb76ac ("bpf: Add bpf_refcount_acquire kfunc")
Link: https://lore.kernel.org/r/20231107085639.3016113-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 174f02a9e703..b84d8cb8d239 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2520,7 +2520,7 @@ BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
 BTF_ID_FLAGS(func, bpf_list_push_front_impl)
 BTF_ID_FLAGS(func, bpf_list_push_back_impl)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
-- 
cgit v1.2.3


From 649924b76ab151a96bdd22a97a993fb0421f134c Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 7 Nov 2023 00:56:36 -0800
Subject: bpf: Use bpf_mem_free_rcu when bpf_obj_dropping non-refcounted nodes

The use of bpf_mem_free_rcu to free refcounted local kptrs was added
in commit 7e26cd12ad1c ("bpf: Use bpf_mem_free_rcu when
bpf_obj_dropping refcounted nodes"). In the cover letter for the
series containing that patch [0] I commented:

    Perhaps it makes sense to move to mem_free_rcu for _all_
    non-owning refs in the future, not just refcounted. This might
    allow custom non-owning ref lifetime + invalidation logic to be
    entirely subsumed by MEM_RCU handling. IMO this needs a bit more
    thought and should be tackled outside of a fix series, so it's not
    attempted here.

It's time to start moving in the "non-owning refs have MEM_RCU
lifetime" direction. As mentioned in that comment, using
bpf_mem_free_rcu for all local kptrs - not just refcounted - is
necessarily the first step towards that goal. This patch does so.

After this patch the memory pointed to by all local kptrs will not be
reused until RCU grace period elapses. The verifier's understanding of
non-owning ref validity and the clobbering logic it uses to enforce
that understanding are not changed here, that'll happen gradually in
future work, including further patches in the series.

  [0]: https://lore.kernel.org/all/20230821193311.3290257-1-davemarchevsky@fb.com/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20231107085639.3016113-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b84d8cb8d239..03517db5cfb3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1937,10 +1937,7 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu)
 		ma = &bpf_global_percpu_ma;
 	else
 		ma = &bpf_global_ma;
-	if (rec && rec->refcount_off >= 0)
-		bpf_mem_free_rcu(ma, p);
-	else
-		bpf_mem_free(ma, p);
+	bpf_mem_free_rcu(ma, p);
 }
 
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
-- 
cgit v1.2.3


From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 7 Nov 2023 00:56:37 -0800
Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into btf_field_type enum

This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT
btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the
btf_field_type enum. Further patches in the series will use
BPF_GRAPH_NODE, so let's move this useful definition out of btf.c.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  4 ++--
 kernel/bpf/btf.c    | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eb84caf133df..4001d11be151 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -186,8 +186,8 @@ enum btf_field_type {
 	BPF_LIST_NODE  = (1 << 6),
 	BPF_RB_ROOT    = (1 << 7),
 	BPF_RB_NODE    = (1 << 8),
-	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
-				 BPF_RB_NODE | BPF_RB_ROOT,
+	BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
+	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
 	BPF_REFCOUNT   = (1 << 9),
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 15d71d2986d3..63cf4128fc05 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3840,9 +3840,6 @@ end:
 	return ERR_PTR(ret);
 }
 
-#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT)
-#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE)
-
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 {
 	int i;
@@ -3855,13 +3852,13 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 	 * Hence we only need to ensure that bpf_{list_head,rb_root} ownership
 	 * does not form cycles.
 	 */
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
 		return 0;
 	for (i = 0; i < rec->cnt; i++) {
 		struct btf_struct_meta *meta;
 		u32 btf_id;
 
-		if (!(rec->fields[i].type & GRAPH_ROOT_MASK))
+		if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
 			continue;
 		btf_id = rec->fields[i].graph_root.value_btf_id;
 		meta = btf_find_struct_meta(btf, btf_id);
@@ -3873,7 +3870,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 		 * to check ownership cycle for a type unless it's also a
 		 * node type.
 		 */
-		if (!(rec->field_mask & GRAPH_NODE_MASK))
+		if (!(rec->field_mask & BPF_GRAPH_NODE))
 			continue;
 
 		/* We need to ensure ownership acyclicity among all types. The
@@ -3909,7 +3906,7 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 		 * - A is both an root and node.
 		 * - B is only an node.
 		 */
-		if (meta->record->field_mask & GRAPH_ROOT_MASK)
+		if (meta->record->field_mask & BPF_GRAPH_ROOT)
 			return -ELOOP;
 	}
 	return 0;
-- 
cgit v1.2.3


From 1b12171533a9bb23cf6fba7262b479028b65e1e8 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Tue, 7 Nov 2023 00:56:38 -0800
Subject: bpf: Mark direct ld of stashed bpf_{rb,list}_node as non-owning ref

This patch enables the following pattern:

  /* mapval contains a __kptr pointing to refcounted local kptr */
  mapval = bpf_map_lookup_elem(&map, &idx);
  if (!mapval || !mapval->some_kptr) { /* omitted */ }

  p = bpf_refcount_acquire(&mapval->some_kptr);

Currently this doesn't work because bpf_refcount_acquire expects an
owning or non-owning ref. The verifier defines non-owning ref as a type:

  PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF

while mapval->some_kptr is PTR_TO_BTF_ID | PTR_UNTRUSTED. It's possible
to do the refcount_acquire by first bpf_kptr_xchg'ing mapval->some_kptr
into a temp kptr, refcount_acquiring that, and xchg'ing back into
mapval, but this is unwieldy and shouldn't be necessary.

This patch modifies btf_ld_kptr_type such that user-allocated types are
marked MEM_ALLOC and if those types have a bpf_{rb,list}_node they're
marked NON_OWN_REF as well. Additionally, due to changes to
bpf_obj_drop_impl earlier in this series, rcu_protected_object now
returns true for all user-allocated types, resulting in
mapval->some_kptr being marked MEM_RCU.

After this patch's changes, mapval->some_kptr is now:

  PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU

which results in it passing the non-owning ref test, and the motivating
example passing verification.

Future work will likely get rid of special non-owning ref lifetime logic
in the verifier, at which point we'll be able to delete the NON_OWN_REF
flag entirely.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20231107085639.3016113-6-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 993e4677bbe9..9ae6eae13471 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5557,10 +5557,23 @@ BTF_SET_END(rcu_protected_types)
 static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
 {
 	if (!btf_is_kernel(btf))
-		return false;
+		return true;
 	return btf_id_set_contains(&rcu_protected_types, btf_id);
 }
 
+static struct btf_record *kptr_pointee_btf_record(struct btf_field *kptr_field)
+{
+	struct btf_struct_meta *meta;
+
+	if (btf_is_kernel(kptr_field->kptr.btf))
+		return NULL;
+
+	meta = btf_find_struct_meta(kptr_field->kptr.btf,
+				    kptr_field->kptr.btf_id);
+
+	return meta ? meta->record : NULL;
+}
+
 static bool rcu_safe_kptr(const struct btf_field *field)
 {
 	const struct btf_field_kptr *kptr = &field->kptr;
@@ -5571,12 +5584,25 @@ static bool rcu_safe_kptr(const struct btf_field *field)
 
 static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
 {
+	struct btf_record *rec;
+	u32 ret;
+
+	ret = PTR_MAYBE_NULL;
 	if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
-		if (kptr_field->type != BPF_KPTR_PERCPU)
-			return PTR_MAYBE_NULL | MEM_RCU;
-		return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
+		ret |= MEM_RCU;
+		if (kptr_field->type == BPF_KPTR_PERCPU)
+			ret |= MEM_PERCPU;
+		else if (!btf_is_kernel(kptr_field->kptr.btf))
+			ret |= MEM_ALLOC;
+
+		rec = kptr_pointee_btf_record(kptr_field);
+		if (rec && btf_record_has_field(rec, BPF_GRAPH_NODE))
+			ret |= NON_OWN_REF;
+	} else {
+		ret |= PTR_UNTRUSTED;
 	}
-	return PTR_MAYBE_NULL | PTR_UNTRUSTED;
+
+	return ret;
 }
 
 static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
-- 
cgit v1.2.3


From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Nov 2023 16:26:36 -0800
Subject: bpf: handle ldimm64 properly in check_cfg()

ldimm64 instructions are 16-byte long, and so have to be handled
appropriately in check_cfg(), just like the rest of BPF verifier does.

This has implications in three places:
  - when determining next instruction for non-jump instructions;
  - when determining next instruction for callback address ldimm64
    instructions (in visit_func_call_insn());
  - when checking for unreachable instructions, where second half of
    ldimm64 is expected to be unreachable;

We take this also as an opportunity to report jump into the middle of
ldimm64. And adjust few test_verifier tests accordingly.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reported-by: Hao Sun <sunhao.th@gmail.com>
Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                             |  8 ++++++--
 kernel/bpf/verifier.c                           | 27 ++++++++++++++++++-------
 tools/testing/selftests/bpf/verifier/ld_imm64.c |  8 ++++----
 3 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..35bff17396c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+static bool bpf_is_ldimm64(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
+}
+
 static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
 {
-	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
-	       insn->src_reg == BPF_PSEUDO_FUNC;
+	return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 }
 
 struct bpf_prog_ops {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bd1c42eb540f..b87715b364fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15439,15 +15439,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 				struct bpf_verifier_env *env,
 				bool visit_callee)
 {
-	int ret;
+	int ret, insn_sz;
 
-	ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
+	insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
+	ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
 	if (ret)
 		return ret;
 
-	mark_prune_point(env, t + 1);
+	mark_prune_point(env, t + insn_sz);
 	/* when we exit from subprog, we need to record non-linear history */
-	mark_jmp_point(env, t + 1);
+	mark_jmp_point(env, t + insn_sz);
 
 	if (visit_callee) {
 		mark_prune_point(env, t);
@@ -15469,15 +15470,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 static int visit_insn(int t, struct bpf_verifier_env *env)
 {
 	struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
-	int ret, off;
+	int ret, off, insn_sz;
 
 	if (bpf_pseudo_func(insn))
 		return visit_func_call_insn(t, insns, env, true);
 
 	/* All non-branch instructions have a single fall-through edge. */
 	if (BPF_CLASS(insn->code) != BPF_JMP &&
-	    BPF_CLASS(insn->code) != BPF_JMP32)
-		return push_insn(t, t + 1, FALLTHROUGH, env, false);
+	    BPF_CLASS(insn->code) != BPF_JMP32) {
+		insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+		return push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
+	}
 
 	switch (BPF_OP(insn->code)) {
 	case BPF_EXIT:
@@ -15607,11 +15610,21 @@ walk_cfg:
 	}
 
 	for (i = 0; i < insn_cnt; i++) {
+		struct bpf_insn *insn = &env->prog->insnsi[i];
+
 		if (insn_state[i] != EXPLORED) {
 			verbose(env, "unreachable insn %d\n", i);
 			ret = -EINVAL;
 			goto err_free;
 		}
+		if (bpf_is_ldimm64(insn)) {
+			if (insn_state[i + 1] != 0) {
+				verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
+				ret = -EINVAL;
+				goto err_free;
+			}
+			i++; /* skip second half of ldimm64 */
+		}
 	}
 	ret = 0; /* cfg looks good */
 
diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c
index f9297900cea6..78f19c255f20 100644
--- a/tools/testing/selftests/bpf/verifier/ld_imm64.c
+++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c
@@ -9,8 +9,8 @@
 	BPF_MOV64_IMM(BPF_REG_0, 2),
 	BPF_EXIT_INSN(),
 	},
-	.errstr = "invalid BPF_LD_IMM insn",
-	.errstr_unpriv = "R1 pointer comparison",
+	.errstr = "jump into the middle of ldimm64 insn 1",
+	.errstr_unpriv = "jump into the middle of ldimm64 insn 1",
 	.result = REJECT,
 },
 {
@@ -23,8 +23,8 @@
 	BPF_LD_IMM64(BPF_REG_0, 1),
 	BPF_EXIT_INSN(),
 	},
-	.errstr = "invalid BPF_LD_IMM insn",
-	.errstr_unpriv = "R1 pointer comparison",
+	.errstr = "jump into the middle of ldimm64 insn 1",
+	.errstr_unpriv = "jump into the middle of ldimm64 insn 1",
 	.result = REJECT,
 },
 {
-- 
cgit v1.2.3


From 4bb7ea946a370707315ab774432963ce47291946 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Nov 2023 16:26:37 -0800
Subject: bpf: fix precision backtracking instruction iteration

Fix an edge case in __mark_chain_precision() which prematurely stops
backtracking instructions in a state if it happens that state's first
and last instruction indexes are the same. This situations doesn't
necessarily mean that there were no instructions simulated in a state,
but rather that we starting from the instruction, jumped around a bit,
and then ended up at the same instruction before checkpointing or
marking precision.

To distinguish between these two possible situations, we need to consult
jump history. If it's empty or contain a single record "bridging" parent
state and first instruction of processed state, then we indeed
backtracked all instructions in this state. But if history is not empty,
we are definitely not done yet.

Move this logic inside get_prev_insn_idx() to contain it more nicely.
Use -ENOENT return code to denote "we are out of instructions"
situation.

This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once
the next fix in this patch set is applied.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b87715b364fd..484c742f733e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3516,12 +3516,29 @@ static int push_jmp_history(struct bpf_verifier_env *env,
 
 /* Backtrack one insn at a time. If idx is not at the top of recorded
  * history then previous instruction came from straight line execution.
+ * Return -ENOENT if we exhausted all instructions within given state.
+ *
+ * It's legal to have a bit of a looping with the same starting and ending
+ * insn index within the same state, e.g.: 3->4->5->3, so just because current
+ * instruction index is the same as state's first_idx doesn't mean we are
+ * done. If there is still some jump history left, we should keep going. We
+ * need to take into account that we might have a jump history between given
+ * state's parent and itself, due to checkpointing. In this case, we'll have
+ * history entry recording a jump from last instruction of parent state and
+ * first instruction of given state.
  */
 static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
 			     u32 *history)
 {
 	u32 cnt = *history;
 
+	if (i == st->first_insn_idx) {
+		if (cnt == 0)
+			return -ENOENT;
+		if (cnt == 1 && st->jmp_history[0].idx == i)
+			return -ENOENT;
+	}
+
 	if (cnt && st->jmp_history[cnt - 1].idx == i) {
 		i = st->jmp_history[cnt - 1].prev_idx;
 		(*history)--;
@@ -4401,10 +4418,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				 * Nothing to be tracked further in the parent state.
 				 */
 				return 0;
-			if (i == first_idx)
-				break;
 			subseq_idx = i;
 			i = get_prev_insn_idx(st, i, &history);
+			if (i == -ENOENT)
+				break;
 			if (i >= env->prog->len) {
 				/* This can happen if backtracking reached insn 0
 				 * and there are still reg_mask or stack_mask
-- 
cgit v1.2.3


From 10e14e9652bf9e8104151bfd9200433083deae3d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Nov 2023 22:14:10 -0800
Subject: bpf: fix control-flow graph checking in privileged mode

When BPF program is verified in privileged mode, BPF verifier allows
bounded loops. This means that from CFG point of view there are
definitely some back-edges. Original commit adjusted check_cfg() logic
to not detect back-edges in control flow graph if they are resulting
from conditional jumps, which the idea that subsequent full BPF
verification process will determine whether such loops are bounded or
not, and either accept or reject the BPF program. At least that's my
reading of the intent.

Unfortunately, the implementation of this idea doesn't work correctly in
all possible situations. Conditional jump might not result in immediate
back-edge, but just a few unconditional instructions later we can arrive
at back-edge. In such situations check_cfg() would reject BPF program
even in privileged mode, despite it might be bounded loop. Next patch
adds one simple program demonstrating such scenario.

To keep things simple, instead of trying to detect back edges in
privileged mode, just assume every back edge is valid and let subsequent
BPF verification prove or reject bounded loops.

Note a few test changes. For unknown reason, we have a few tests that
are specified to detect a back-edge in a privileged mode, but looking at
their code it seems like the right outcome is passing check_cfg() and
letting subsequent verification to make a decision about bounded or not
bounded looping.

Bounded recursion case is also interesting. The example should pass, as
recursion is limited to just a few levels and so we never reach maximum
number of nested frames and never exhaust maximum stack depth. But the
way that max stack depth logic works today it falsely detects this as
exceeding max nested frame count. This patch series doesn't attempt to
fix this orthogonal problem, so we just adjust expected verifier failure.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 2589726d12a1 ("bpf: introduce bounded loops")
Reported-by: Hao Sun <sunhao.th@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231110061412.2995786-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 23 ++++++++--------------
 .../testing/selftests/bpf/progs/verifier_loops1.c  |  9 ++++++---
 tools/testing/selftests/bpf/verifier/calls.c       |  6 +++---
 3 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 484c742f733e..a2267d5ed14e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15403,8 +15403,7 @@ enum {
  * w - next instruction
  * e - edge
  */
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
-		     bool loop_ok)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 {
 	int *insn_stack = env->cfg.insn_stack;
 	int *insn_state = env->cfg.insn_state;
@@ -15436,7 +15435,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
 		insn_stack[env->cfg.cur_stack++] = w;
 		return KEEP_EXPLORING;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-		if (loop_ok && env->bpf_capable)
+		if (env->bpf_capable)
 			return DONE_EXPLORING;
 		verbose_linfo(env, t, "%d: ", t);
 		verbose_linfo(env, w, "%d: ", w);
@@ -15459,7 +15458,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 	int ret, insn_sz;
 
 	insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
-	ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
+	ret = push_insn(t, t + insn_sz, FALLTHROUGH, env);
 	if (ret)
 		return ret;
 
@@ -15469,12 +15468,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
 
 	if (visit_callee) {
 		mark_prune_point(env, t);
-		ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
-				/* It's ok to allow recursion from CFG point of
-				 * view. __check_func_call() will do the actual
-				 * check.
-				 */
-				bpf_pseudo_func(insns + t));
+		ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
 	}
 	return ret;
 }
@@ -15496,7 +15490,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 	if (BPF_CLASS(insn->code) != BPF_JMP &&
 	    BPF_CLASS(insn->code) != BPF_JMP32) {
 		insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
-		return push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
+		return push_insn(t, t + insn_sz, FALLTHROUGH, env);
 	}
 
 	switch (BPF_OP(insn->code)) {
@@ -15543,8 +15537,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			off = insn->imm;
 
 		/* unconditional jump with single edge */
-		ret = push_insn(t, t + off + 1, FALLTHROUGH, env,
-				true);
+		ret = push_insn(t, t + off + 1, FALLTHROUGH, env);
 		if (ret)
 			return ret;
 
@@ -15557,11 +15550,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 		/* conditional jump with two edges */
 		mark_prune_point(env, t);
 
-		ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
+		ret = push_insn(t, t + 1, FALLTHROUGH, env);
 		if (ret)
 			return ret;
 
-		return push_insn(t, t + insn->off + 1, BRANCH, env, true);
+		return push_insn(t, t + insn->off + 1, BRANCH, env);
 	}
 }
 
diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c
index 5bc86af80a9a..71735dbf33d4 100644
--- a/tools/testing/selftests/bpf/progs/verifier_loops1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c
@@ -75,9 +75,10 @@ l0_%=:	r0 += 1;					\
 "	::: __clobber_all);
 }
 
-SEC("tracepoint")
+SEC("socket")
 __description("bounded loop, start in the middle")
-__failure __msg("back-edge")
+__success
+__failure_unpriv __msg_unpriv("back-edge")
 __naked void loop_start_in_the_middle(void)
 {
 	asm volatile ("					\
@@ -136,7 +137,9 @@ l0_%=:	exit;						\
 
 SEC("tracepoint")
 __description("bounded recursion")
-__failure __msg("back-edge")
+__failure
+/* verifier limitation in detecting max stack depth */
+__msg("the call stack of 8 frames is too deep !")
 __naked void bounded_recursion(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 1bdf2b43e49e..3d5cd51071f0 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -442,7 +442,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-	.errstr = "back-edge from insn 0 to 0",
+	.errstr = "the call stack of 9 frames is too deep",
 	.result = REJECT,
 },
 {
@@ -799,7 +799,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-	.errstr = "back-edge",
+	.errstr = "the call stack of 9 frames is too deep",
 	.result = REJECT,
 },
 {
@@ -811,7 +811,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_TRACEPOINT,
-	.errstr = "back-edge",
+	.errstr = "the call stack of 9 frames is too deep",
 	.result = REJECT,
 },
 {
-- 
cgit v1.2.3


From ce51e6153f7781bcde0f8bb4c81d6fd85ee422e6 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 8 Nov 2023 21:12:39 +0900
Subject: tracing: fprobe-event: Fix to check tracepoint event and return

Fix to check the tracepoint event is not valid with $retval.
The commit 08c9306fc2e3 ("tracing/fprobe-event: Assume fprobe is
a return event by $retval") introduced automatic return probe
conversion with $retval. But since tracepoint event does not
support return probe, $retval is not acceptable.

Without this fix, ftracetest, tprobe_syntax_errors.tc fails;

[22] Tracepoint probe event parser error log check      [FAIL]
 ----
 # tail 22-tprobe_syntax_errors.tc-log.mRKroL
 + ftrace_errlog_check trace_fprobe t kfree ^$retval dynamic_events
 + printf %s t kfree
 + wc -c
 + pos=8
 + printf %s t kfree ^$retval
 + tr -d ^
 + command=t kfree $retval
 + echo Test command: t kfree $retval
 Test command: t kfree $retval
 + echo
 ----

So 't kfree $retval' should fail (tracepoint doesn't support
return probe) but passed it.

Link: https://lore.kernel.org/all/169944555933.45057.12831706585287704173.stgit@devnote2/

Fixes: 08c9306fc2e3 ("tracing/fprobe-event: Assume fprobe is a return event by $retval")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_fprobe.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 8bfe23af9c73..7d2ddbcfa377 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -927,11 +927,12 @@ static int parse_symbol_and_return(int argc, const char *argv[],
 	for (i = 2; i < argc; i++) {
 		tmp = strstr(argv[i], "$retval");
 		if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') {
+			if (is_tracepoint) {
+				trace_probe_log_set_index(i);
+				trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE);
+				return -EINVAL;
+			}
 			*is_return = true;
-			/*
-			 * NOTE: Don't check is_tracepoint here, because it will
-			 * be checked when the argument is parsed.
-			 */
 			break;
 		}
 	}
-- 
cgit v1.2.3


From b8e3a87a627b575896e448021e5c2f8a3bc19931 Mon Sep 17 00:00:00 2001
From: Jordan Rome <jordalgo@meta.com>
Date: Wed, 8 Nov 2023 03:23:34 -0800
Subject: bpf: Add crosstask check to __bpf_get_stack

Currently get_perf_callchain only supports user stack walking for
the current task. Passing the correct *crosstask* param will return
0 frames if the task passed to __bpf_get_stack isn't the current
one instead of a single incorrect frame/address. This change
passes the correct *crosstask* param but also does a preemptive
check in __bpf_get_stack if the task is current and returns
-EOPNOTSUPP if it is not.

This issue was found using bpf_get_task_stack inside a BPF
iterator ("iter/task"), which iterates over all tasks.
bpf_get_task_stack works fine for fetching kernel stacks
but because get_perf_callchain relies on the caller to know
if the requested *task* is the current one (via *crosstask*)
it was failing in a confusing way.

It might be possible to get user stacks for all tasks utilizing
something like access_process_vm but that requires the bpf
program calling bpf_get_task_stack to be sleepable and would
therefore be a breaking change.

Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()")
Signed-off-by: Jordan Rome <jordalgo@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231108112334.3433136-1-jordalgo@meta.com
---
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/stackmap.c          | 11 ++++++++++-
 tools/include/uapi/linux/bpf.h |  3 +++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 095ca7238ac2..7cf8bcf9f6a2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4517,6 +4517,8 @@ union bpf_attr {
  * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
  *	Description
  *		Return a user or a kernel stack in bpf program provided buffer.
+ *		Note: the user stack will only be populated if the *task* is
+ *		the current task; all other tasks will return -EOPNOTSUPP.
  *		To achieve this, the helper needs *task*, which is a valid
  *		pointer to **struct task_struct**. To store the stacktrace, the
  *		bpf program provides *buf* with a nonnegative *size*.
@@ -4528,6 +4530,7 @@ union bpf_attr {
  *
  *		**BPF_F_USER_STACK**
  *			Collect a user space stack instead of a kernel stack.
+ *			The *task* must be the current task.
  *		**BPF_F_USER_BUILD_ID**
  *			Collect buildid+offset instead of ips for user stack,
  *			only valid if **BPF_F_USER_STACK** is also specified.
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d6b277482085..dff7ba539701 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -388,6 +388,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 {
 	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
 	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	bool crosstask = task && task != current;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	bool user = flags & BPF_F_USER_STACK;
 	struct perf_callchain_entry *trace;
@@ -410,6 +411,14 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	if (task && user && !user_mode(regs))
 		goto err_fault;
 
+	/* get_perf_callchain does not support crosstask user stack walking
+	 * but returns an empty stack instead of NULL.
+	 */
+	if (crosstask && user) {
+		err = -EOPNOTSUPP;
+		goto clear;
+	}
+
 	num_elem = size / elem_size;
 	max_depth = num_elem + skip;
 	if (sysctl_perf_event_max_stack < max_depth)
@@ -421,7 +430,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
 		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
-					   false, false);
+					   crosstask, false);
 	if (unlikely(!trace))
 		goto err_fault;
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 095ca7238ac2..7cf8bcf9f6a2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4517,6 +4517,8 @@ union bpf_attr {
  * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
  *	Description
  *		Return a user or a kernel stack in bpf program provided buffer.
+ *		Note: the user stack will only be populated if the *task* is
+ *		the current task; all other tasks will return -EOPNOTSUPP.
  *		To achieve this, the helper needs *task*, which is a valid
  *		pointer to **struct task_struct**. To store the stacktrace, the
  *		bpf program provides *buf* with a nonnegative *size*.
@@ -4528,6 +4530,7 @@ union bpf_attr {
  *
  *		**BPF_F_USER_STACK**
  *			Collect a user space stack instead of a kernel stack.
+ *			The *task* must be the current task.
  *		**BPF_F_USER_BUILD_ID**
  *			Collect buildid+offset instead of ips for user stack,
  *			only valid if **BPF_F_USER_STACK** is also specified.
-- 
cgit v1.2.3


From f032c53bea6d2057c14553832d846be2f151cfb2 Mon Sep 17 00:00:00 2001
From: Yujie Liu <yujie.liu@intel.com>
Date: Tue, 31 Oct 2023 12:13:05 +0800
Subject: tracing/kprobes: Fix the order of argument descriptions

The order of descriptions should be consistent with the argument list of
the function, so "kretprobe" should be the second one.

int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe,
                                 const char *name, const char *loc, ...)

Link: https://lore.kernel.org/all/20231031041305.3363712-1-yujie.liu@intel.com/

Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions")
Suggested-by: Mukesh Ojha <quic_mojha@quicinc.com>
Signed-off-by: Yujie Liu <yujie.liu@intel.com>
Reviewed-by: Mukesh Ojha <quic_mojha@quicinc.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a3442db35670..52f8b537dd0a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1020,9 +1020,9 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init);
 /**
  * __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list
  * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @kretprobe: Is this a return probe?
  * @name: The name of the kprobe event
  * @loc: The location of the kprobe event
- * @kretprobe: Is this a return probe?
  * @...: Variable number of arg (pairs), one pair for each field
  *
  * NOTE: Users normally won't want to call this function directly, but
-- 
cgit v1.2.3


From 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Nov 2023 15:57:13 +0100
Subject: hrtimers: Push pending hrtimers away from outgoing CPU earlier

2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug")
solved the straight forward CPU hotplug deadlock vs. the scheduler
bandwidth timer. Yu discovered a more involved variant where a task which
has a bandwidth timer started on the outgoing CPU holds a lock and then
gets throttled. If the lock required by one of the CPU hotplug callbacks
the hotplug operation deadlocks because the unthrottling timer event is not
handled on the dying CPU and can only be recovered once the control CPU
reaches the hotplug state which pulls the pending hrtimers from the dead
CPU.

Solve this by pushing the hrtimers away from the dying CPU in the dying
callbacks. Nothing can queue a hrtimer on the dying CPU at that point because
all other CPUs spin in stop_machine() with interrupts disabled and once the
operation is finished the CPU is marked offline.

Reported-by: Yu Liao <liaoyu15@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Liu Tie <liutie4@huawei.com>
Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx
---
 include/linux/cpuhotplug.h |  1 +
 include/linux/hrtimer.h    |  4 ++--
 kernel/cpu.c               |  8 +++++++-
 kernel/time/hrtimer.c      | 33 ++++++++++++---------------------
 4 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..448f5f995adc 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -193,6 +193,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
 	CPUHP_AP_SMPCFD_DYING,
+	CPUHP_AP_HRTIMERS_DYING,
 	CPUHP_AP_X86_TBOOT_DYING,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DYING,
 	CPUHP_AP_ONLINE,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..f2044d5a652b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-int hrtimers_dead_cpu(unsigned int cpu);
+int hrtimers_cpu_dying(unsigned int cpu);
 #else
-#define hrtimers_dead_cpu	NULL
+#define hrtimers_cpu_dying	NULL
 #endif
 
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6de7c6bb74ee..2e69a1deaa31 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2098,7 +2098,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 	[CPUHP_HRTIMERS_PREPARE] = {
 		.name			= "hrtimers:prepare",
 		.startup.single		= hrtimers_prepare_cpu,
-		.teardown.single	= hrtimers_dead_cpu,
+		.teardown.single	= NULL,
 	},
 	[CPUHP_SMPCFD_PREPARE] = {
 		.name			= "smpcfd:prepare",
@@ -2190,6 +2190,12 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= NULL,
 		.teardown.single	= smpcfd_dying_cpu,
 	},
+	[CPUHP_AP_HRTIMERS_DYING] = {
+		.name			= "hrtimers:dying",
+		.startup.single		= NULL,
+		.teardown.single	= hrtimers_cpu_dying,
+	},
+
 	/* Entry state on starting. Interrupts enabled from here on. Transient
 	 * state for synchronsization */
 	[CPUHP_AP_ONLINE] = {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 238262e4aba7..760793998cdd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, ncpu = cpumask_first(cpu_active_mask);
 
-	BUG_ON(cpu_online(scpu));
-	tick_cancel_sched_timer(scpu);
+	tick_cancel_sched_timer(dying_cpu);
+
+	old_base = this_cpu_ptr(&hrtimer_bases);
+	new_base = &per_cpu(hrtimer_bases, ncpu);
 
-	/*
-	 * this BH disable ensures that raise_softirq_irqoff() does
-	 * not wakeup ksoftirqd (and acquire the pi-lock) while
-	 * holding the cpu_base lock
-	 */
-	local_bh_disable();
-	local_irq_disable();
-	old_base = &per_cpu(hrtimer_bases, scpu);
-	new_base = this_cpu_ptr(&hrtimer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
 	 */
-	raw_spin_lock(&new_base->lock);
-	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+	raw_spin_lock(&old_base->lock);
+	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	 * The migration might have changed the first expiring softirq
 	 * timer on this CPU. Update it.
 	 */
-	hrtimer_update_softirq_timer(new_base, false);
+	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+	/* Tell the other CPU to retrigger the next event */
+	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
-	raw_spin_unlock(&old_base->lock);
 	raw_spin_unlock(&new_base->lock);
+	raw_spin_unlock(&old_base->lock);
 
-	/* Check, if we got expired work to do */
-	__hrtimer_peek_ahead_timers();
-	local_irq_enable();
-	local_bh_enable();
 	return 0;
 }
 
-- 
cgit v1.2.3


From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 25 Oct 2023 14:25:52 -0400
Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs
 from wq_unbound_cpumask

When the "isolcpus" boot command line option is used to add a set
of isolated CPUs, those CPUs will be excluded automatically from
wq_unbound_cpumask to avoid running work functions from unbound
workqueues.

Recently cpuset has been extended to allow the creation of partitions
of isolated CPUs dynamically. To make it closer to the "isolcpus"
in functionality, the CPUs in those isolated cpuset partitions should be
excluded from wq_unbound_cpumask as well. This can be done currently by
explicitly writing to the workqueue's cpumask sysfs file after creating
the isolated partitions. However, this process can be error prone.

Ideally, the cpuset code should be allowed to request the workqueue code
to exclude those isolated CPUs from wq_unbound_cpumask so that this
operation can be done automatically and the isolated CPUs will be returned
back to wq_unbound_cpumask after the destructions of the isolated
cpuset partitions.

This patch adds a new workqueue_unbound_exclude_cpumask() function to
enable that. This new function will exclude the specified isolated
CPUs from wq_unbound_cpumask. To be able to restore those isolated
CPUs back after the destruction of isolated cpuset partitions, a new
wq_requested_unbound_cpumask is added to store the user provided unbound
cpumask either from the boot command line options or from writing to
the cpumask sysfs file. This new cpumask provides the basis for CPU
exclusion.

To enable users to understand how the wq_unbound_cpumask is being
modified internally, this patch also exposes the newly introduced
wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to
store the cpumask to be excluded from wq_unbound_cpumask as read-only
sysfs files.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  2 +-
 kernel/workqueue.c        | 91 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 84 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 24b1e5070f4d..b0b9604b76b8 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..bd9d34eacd78 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -381,6 +381,12 @@ static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 /* PL&A: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
 /* for further constrain wq_unbound_cpumask by cmdline parameter*/
 static struct cpumask wq_cmdline_cpumask __initdata;
 
@@ -5839,7 +5845,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
  *  		-EINVAL	- Invalid @cpumask
  *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
  */
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 {
 	int ret = -EINVAL;
 
@@ -5850,6 +5856,7 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
 	if (!cpumask_empty(cpumask)) {
 		apply_wqattrs_lock();
+		cpumask_copy(wq_requested_unbound_cpumask, cpumask);
 		if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
 			ret = 0;
 			goto out_unlock;
@@ -5864,6 +5871,44 @@ out_unlock:
 	return ret;
 }
 
+/**
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
+ *
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
+ */
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
+{
+	cpumask_var_t cpumask;
+	int ret = 0;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	lockdep_assert_cpus_held();
+	mutex_lock(&wq_pool_mutex);
+
+	/* Save the current isolated cpumask & export it via sysfs */
+	cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
+	/*
+	 * If the operation fails, it will fall back to
+	 * wq_requested_unbound_cpumask which is initially set to
+	 * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
+	 * by any subsequent write to workqueue/cpumask sysfs file.
+	 */
+	if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+		cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+	if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+		ret = workqueue_apply_unbound_cpumask(cpumask);
+
+	mutex_unlock(&wq_pool_mutex);
+	free_cpumask_var(cpumask);
+	return ret;
+}
+
 static int parse_affn_scope(const char *val)
 {
 	int i;
@@ -6158,19 +6203,36 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
-static ssize_t wq_unbound_cpumask_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
+static ssize_t __wq_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf, cpumask_var_t mask)
 {
 	int written;
 
 	mutex_lock(&wq_pool_mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			    cpumask_pr_args(wq_unbound_cpumask));
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
 	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
 static ssize_t wq_unbound_cpumask_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
 	return ret ? ret : count;
 }
 
-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
 	__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
-	       wq_unbound_cpumask_store);
+	       wq_unbound_cpumask_store),
+	__ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+	__ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+	__ATTR_NULL,
+};
 
 static int __init wq_sysfs_init(void)
 {
@@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void)
 
 	dev_root = bus_get_dev_root(&wq_subsys);
 	if (dev_root) {
-		err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+		struct device_attribute *attr;
+
+		for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+			err = device_create_file(dev_root, attr);
+			if (err)
+				break;
+		}
 		put_device(dev_root);
 	}
 	return err;
@@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void)
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+	BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
 	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
 	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
 
 	if (!cpumask_empty(&wq_cmdline_cpumask))
 		cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+	cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From 11e5f407b64a8fa09d1a4b336d15bd285a434c1f Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 25 Oct 2023 14:25:54 -0400
Subject: cgroup/cpuset: Keep track of CPUs in isolated partitions

Add a new internal isolated_cpus mask to keep track of the CPUs that are in
isolated partitions. Expose that new cpumask as a new root-only control file
".cpuset.cpus.isolated".

tj: Updated patch description to reflect dropping __DEBUG__ prefix.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset.c | 190 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 127 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 615daaf87f1f..19c8779798fd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -204,6 +204,11 @@ struct cpuset {
  */
 static cpumask_var_t	subpartitions_cpus;
 
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t	isolated_cpus;
+
 /* List of remote partition root children */
 static struct list_head remote_children;
 
@@ -1317,6 +1322,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
  */
 enum partition_cmd {
 	partcmd_enable,		/* Enable partition root	  */
+	partcmd_enablei,	/* Enable isolated partition root */
 	partcmd_disable,	/* Disable partition root	  */
 	partcmd_update,		/* Update parent's effective_cpus */
 	partcmd_invalidate,	/* Make partition invalid	  */
@@ -1418,6 +1424,74 @@ static void reset_partition_data(struct cpuset *cs)
 	}
 }
 
+/*
+ * partition_xcpus_newstate - Exclusive CPUs state change
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+	WARN_ON_ONCE(old_prs == new_prs);
+	if (new_prs == PRS_ISOLATED)
+		cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+	else
+		cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ *
+ * Remote partition if parent == NULL
+ */
+static void partition_xcpus_add(int new_prs, struct cpuset *parent,
+				struct cpumask *xcpus)
+{
+	WARN_ON_ONCE(new_prs < 0);
+	lockdep_assert_held(&callback_lock);
+	if (!parent)
+		parent = &top_cpuset;
+
+	if (parent == &top_cpuset)
+		cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+	if (new_prs != parent->partition_root_state)
+		partition_xcpus_newstate(parent->partition_root_state, new_prs,
+					 xcpus);
+
+	cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ *
+ * Remote partition if parent == NULL
+ */
+static void partition_xcpus_del(int old_prs, struct cpuset *parent,
+				struct cpumask *xcpus)
+{
+	WARN_ON_ONCE(old_prs < 0);
+	lockdep_assert_held(&callback_lock);
+	if (!parent)
+		parent = &top_cpuset;
+
+	if (parent == &top_cpuset)
+		cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+	if (old_prs != parent->partition_root_state)
+		partition_xcpus_newstate(old_prs, parent->partition_root_state,
+					 xcpus);
+
+	cpumask_and(xcpus, xcpus, cpu_active_mask);
+	cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+}
+
 /*
  * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
  * @cs: cpuset
@@ -1456,13 +1530,15 @@ static inline bool is_local_partition(struct cpuset *cs)
 /*
  * remote_partition_enable - Enable current cpuset as a remote partition root
  * @cs: the cpuset to update
+ * @new_prs: new partition_root_state
  * @tmp: temparary masks
  * Return: 1 if successful, 0 if error
  *
  * Enable the current cpuset to become a remote partition root taking CPUs
  * directly from the top cpuset. cpuset_mutex must be held by the caller.
  */
-static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+				   struct tmpmasks *tmp)
 {
 	/*
 	 * The user must have sysadmin privilege.
@@ -1485,18 +1561,14 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
 		return 0;
 
 	spin_lock_irq(&callback_lock);
-	cpumask_andnot(top_cpuset.effective_cpus,
-		       top_cpuset.effective_cpus, tmp->new_cpus);
-	cpumask_or(subpartitions_cpus,
-		   subpartitions_cpus, tmp->new_cpus);
-
+	partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+	list_add(&cs->remote_sibling, &remote_children);
 	if (cs->use_parent_ecpus) {
 		struct cpuset *parent = parent_cs(cs);
 
 		cs->use_parent_ecpus = false;
 		parent->child_ecpus_count--;
 	}
-	list_add(&cs->remote_sibling, &remote_children);
 	spin_unlock_irq(&callback_lock);
 
 	/*
@@ -1524,13 +1596,8 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
 	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
 
 	spin_lock_irq(&callback_lock);
-	cpumask_andnot(subpartitions_cpus,
-		       subpartitions_cpus, tmp->new_cpus);
-	cpumask_and(tmp->new_cpus,
-		    tmp->new_cpus, cpu_active_mask);
-	cpumask_or(top_cpuset.effective_cpus,
-		   top_cpuset.effective_cpus, tmp->new_cpus);
 	list_del_init(&cs->remote_sibling);
+	partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus);
 	cs->partition_root_state = -cs->partition_root_state;
 	if (!cs->prs_err)
 		cs->prs_err = PERR_INVCPUS;
@@ -1557,6 +1624,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
 			       struct tmpmasks *tmp)
 {
 	bool adding, deleting;
+	int prs = cs->partition_root_state;
 
 	if (WARN_ON_ONCE(!is_remote_partition(cs)))
 		return;
@@ -1580,20 +1648,10 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
 		goto invalidate;
 
 	spin_lock_irq(&callback_lock);
-	if (adding) {
-		cpumask_or(subpartitions_cpus,
-			   subpartitions_cpus, tmp->addmask);
-		cpumask_andnot(top_cpuset.effective_cpus,
-			       top_cpuset.effective_cpus, tmp->addmask);
-	}
-	if (deleting) {
-		cpumask_andnot(subpartitions_cpus,
-			       subpartitions_cpus, tmp->delmask);
-		cpumask_and(tmp->delmask,
-			    tmp->delmask, cpu_active_mask);
-		cpumask_or(top_cpuset.effective_cpus,
-			   top_cpuset.effective_cpus, tmp->delmask);
-	}
+	if (adding)
+		partition_xcpus_add(prs, NULL, tmp->addmask);
+	if (deleting)
+		partition_xcpus_del(prs, NULL, tmp->delmask);
 	spin_unlock_irq(&callback_lock);
 
 	/*
@@ -1676,11 +1734,11 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
  * @tmp:     Temporary addmask and delmask
  * Return:   0 or a partition root state error code
  *
- * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus
- * not set) mask of the given cpuset will be taken away from parent's
- * effective_cpus. The function will return 0 if all the CPUs listed in
- * effective_xcpus can be granted or an error code will be returned.
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
  *
  * For partcmd_disable, the cpuset is being transformed from a partition
  * root back to a non-partition root. Any CPUs in effective_xcpus will be
@@ -1695,7 +1753,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
  *
  * For partcmd_invalidate, the current partition will be made invalid.
  *
- * The partcmd_enable and partcmd_disable commands are used by
+ * The partcmd_enable* and partcmd_disable commands are used by
  * update_prstate(). An error code may be returned and the caller will check
  * for error.
  *
@@ -1760,7 +1818,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 
 	nocpu = tasks_nocpu_error(parent, cs, xcpus);
 
-	if (cmd == partcmd_enable) {
+	if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
 		/*
 		 * Enabling partition root is not allowed if its
 		 * effective_xcpus is empty or doesn't overlap with
@@ -1783,6 +1841,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		cpumask_copy(tmp->delmask, xcpus);
 		deleting = true;
 		subparts_delta++;
+		new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
 	} else if (cmd == partcmd_disable) {
 		/*
 		 * May need to add cpus to parent's effective_cpus for
@@ -1792,6 +1851,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 			  cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
 		if (adding)
 			subparts_delta--;
+		new_prs = PRS_MEMBER;
 	} else if (newmask) {
 		/*
 		 * Empty cpumask is not allowed
@@ -1940,37 +2000,24 @@ write_error:
 	 * newly deleted ones will be added back to effective_cpus.
 	 */
 	spin_lock_irq(&callback_lock);
-	if (adding) {
-		if (parent == &top_cpuset)
-			cpumask_andnot(subpartitions_cpus,
-				       subpartitions_cpus, tmp->addmask);
-		/*
-		 * Some of the CPUs in effective_xcpus might have been offlined.
-		 */
-		cpumask_or(parent->effective_cpus,
-			   parent->effective_cpus, tmp->addmask);
-		cpumask_and(parent->effective_cpus,
-			    parent->effective_cpus, cpu_active_mask);
-	}
-	if (deleting) {
-		if (parent == &top_cpuset)
-			cpumask_or(subpartitions_cpus,
-				   subpartitions_cpus, tmp->delmask);
-		cpumask_andnot(parent->effective_cpus,
-			       parent->effective_cpus, tmp->delmask);
-	}
-
-	if (is_partition_valid(parent)) {
-		parent->nr_subparts += subparts_delta;
-		WARN_ON_ONCE(parent->nr_subparts < 0);
-	}
-
 	if (old_prs != new_prs) {
 		cs->partition_root_state = new_prs;
 		if (new_prs <= 0)
 			cs->nr_subparts = 0;
 	}
+	/*
+	 * Adding to parent's effective_cpus means deletion CPUs from cs
+	 * and vice versa.
+	 */
+	if (adding)
+		partition_xcpus_del(old_prs, parent, tmp->addmask);
+	if (deleting)
+		partition_xcpus_add(new_prs, parent, tmp->delmask);
 
+	if (is_partition_valid(parent)) {
+		parent->nr_subparts += subparts_delta;
+		WARN_ON_ONCE(parent->nr_subparts < 0);
+	}
 	spin_unlock_irq(&callback_lock);
 
 	if ((old_prs != new_prs) && (cmd == partcmd_update))
@@ -2948,6 +2995,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	int err = PERR_NONE, old_prs = cs->partition_root_state;
 	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
+	bool new_xcpus_state = false;
 
 	if (old_prs == new_prs)
 		return 0;
@@ -2977,6 +3025,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		goto out;
 
 	if (!old_prs) {
+		enum partition_cmd cmd = (new_prs == PRS_ROOT)
+				       ? partcmd_enable : partcmd_enablei;
+
 		/*
 		 * cpus_allowed cannot be empty.
 		 */
@@ -2985,19 +3036,18 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 			goto out;
 		}
 
-		err = update_parent_effective_cpumask(cs, partcmd_enable,
-						      NULL, &tmpmask);
+		err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
 		/*
 		 * If an attempt to become local partition root fails,
 		 * try to become a remote partition root instead.
 		 */
-		if (err && remote_partition_enable(cs, &tmpmask))
+		if (err && remote_partition_enable(cs, new_prs, &tmpmask))
 			err = 0;
 	} else if (old_prs && new_prs) {
 		/*
 		 * A change in load balance state only, no change in cpumasks.
 		 */
-		;
+		new_xcpus_state = true;
 	} else {
 		/*
 		 * Switching back to member is always allowed even if it
@@ -3029,6 +3079,8 @@ out:
 	WRITE_ONCE(cs->prs_err, err);
 	if (!is_partition_valid(cs))
 		reset_partition_data(cs);
+	else if (new_xcpus_state)
+		partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
 	spin_unlock_irq(&callback_lock);
 
 	/* Force update if switching back to member */
@@ -3386,6 +3438,7 @@ typedef enum {
 	FILE_SUBPARTS_CPULIST,
 	FILE_EXCLUSIVE_CPULIST,
 	FILE_EFFECTIVE_XCPULIST,
+	FILE_ISOLATED_CPULIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_MEM_HARDWALL,
@@ -3582,6 +3635,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	case FILE_SUBPARTS_CPULIST:
 		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
 		break;
+	case FILE_ISOLATED_CPULIST:
+		seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
+		break;
 	default:
 		ret = -EINVAL;
 	}
@@ -3875,6 +3931,13 @@ static struct cftype dfl_files[] = {
 		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
 	},
 
+	{
+		.name = "cpus.isolated",
+		.seq_show = cpuset_common_seq_show,
+		.private = FILE_ISOLATED_CPULIST,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
+	},
+
 	{ }	/* terminate */
 };
 
@@ -4194,6 +4257,7 @@ int __init cpuset_init(void)
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
 	BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
 	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+	BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
-- 
cgit v1.2.3


From 72c6303acfa1008c542e093bc9f9916fb99e0323 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 25 Oct 2023 14:25:55 -0400
Subject: cgroup/cpuset: Take isolated CPUs out of workqueue unbound cpumask

To make CPUs in isolated cpuset partition closer in isolation to
the boot time isolated CPUs specified in the "isolcpus" boot command
line option, we need to take those CPUs out of the workqueue unbound
cpumask so that work functions from the unbound workqueues won't run
on those CPUs.  Otherwise, they will interfere the user tasks running
on those isolated CPUs.

With the introduction of the workqueue_unbound_exclude_cpumask() helper
function in an earlier commit, those isolated CPUs can now be taken
out from the workqueue unbound cpumask.

This patch also updates cgroup-v2.rst to mention that isolated
CPUs will be excluded from unbound workqueue cpumask as well as
updating test_cpuset_prs.sh to verify the correctness of the new
*cpuset.cpus.isolated file, if available via cgroup_debug option.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst           |  10 +-
 kernel/cgroup/cpuset.c                            | 116 ++++++++++++++++++----
 tools/testing/selftests/cgroup/test_cpuset_prs.sh |  74 ++++++++++++--
 3 files changed, 166 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 3f85254f3cef..cf5651a11df8 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2358,11 +2358,11 @@ Cpuset Interface Files
 	partition or scheduling domain.  The set of exclusive CPUs is
 	determined by the value of its "cpuset.cpus.exclusive.effective".
 
-	When set to "isolated", the CPUs in that partition will
-	be in an isolated state without any load balancing from the
-	scheduler.  Tasks placed in such a partition with multiple
-	CPUs should be carefully distributed and bound to each of the
-	individual CPUs for optimal performance.
+	When set to "isolated", the CPUs in that partition will be in
+	an isolated state without any load balancing from the scheduler
+	and excluded from the unbound workqueues.  Tasks placed in such
+	a partition with multiple CPUs should be carefully distributed
+	and bound to each of the individual CPUs for optimal performance.
 
 	A partition root ("root" or "isolated") can be in one of the
 	two possible states - valid or invalid.  An invalid partition
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 19c8779798fd..1bad4007ff4b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -25,6 +25,7 @@
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -43,6 +44,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/cgroup.h>
 #include <linux/wait.h>
+#include <linux/workqueue.h>
 
 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -1444,25 +1446,31 @@ static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *x
  * @new_prs: new partition_root_state
  * @parent: parent cpuset
  * @xcpus: exclusive CPUs to be added
+ * Return: true if isolated_cpus modified, false otherwise
  *
  * Remote partition if parent == NULL
  */
-static void partition_xcpus_add(int new_prs, struct cpuset *parent,
+static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
 				struct cpumask *xcpus)
 {
+	bool isolcpus_updated;
+
 	WARN_ON_ONCE(new_prs < 0);
 	lockdep_assert_held(&callback_lock);
 	if (!parent)
 		parent = &top_cpuset;
 
+
 	if (parent == &top_cpuset)
 		cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
 
-	if (new_prs != parent->partition_root_state)
+	isolcpus_updated = (new_prs != parent->partition_root_state);
+	if (isolcpus_updated)
 		partition_xcpus_newstate(parent->partition_root_state, new_prs,
 					 xcpus);
 
 	cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+	return isolcpus_updated;
 }
 
 /*
@@ -1470,12 +1478,15 @@ static void partition_xcpus_add(int new_prs, struct cpuset *parent,
  * @old_prs: old partition_root_state
  * @parent: parent cpuset
  * @xcpus: exclusive CPUs to be removed
+ * Return: true if isolated_cpus modified, false otherwise
  *
  * Remote partition if parent == NULL
  */
-static void partition_xcpus_del(int old_prs, struct cpuset *parent,
+static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
 				struct cpumask *xcpus)
 {
+	bool isolcpus_updated;
+
 	WARN_ON_ONCE(old_prs < 0);
 	lockdep_assert_held(&callback_lock);
 	if (!parent)
@@ -1484,12 +1495,27 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
 	if (parent == &top_cpuset)
 		cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
 
-	if (old_prs != parent->partition_root_state)
+	isolcpus_updated = (old_prs != parent->partition_root_state);
+	if (isolcpus_updated)
 		partition_xcpus_newstate(old_prs, parent->partition_root_state,
 					 xcpus);
 
 	cpumask_and(xcpus, xcpus, cpu_active_mask);
 	cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+	return isolcpus_updated;
+}
+
+static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+{
+	int ret;
+
+	lockdep_assert_cpus_held();
+
+	if (!isolcpus_updated)
+		return;
+
+	ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+	WARN_ON_ONCE(ret < 0);
 }
 
 /*
@@ -1540,6 +1566,8 @@ static inline bool is_local_partition(struct cpuset *cs)
 static int remote_partition_enable(struct cpuset *cs, int new_prs,
 				   struct tmpmasks *tmp)
 {
+	bool isolcpus_updated;
+
 	/*
 	 * The user must have sysadmin privilege.
 	 */
@@ -1561,7 +1589,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 		return 0;
 
 	spin_lock_irq(&callback_lock);
-	partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
 	list_add(&cs->remote_sibling, &remote_children);
 	if (cs->use_parent_ecpus) {
 		struct cpuset *parent = parent_cs(cs);
@@ -1570,13 +1598,13 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 		parent->child_ecpus_count--;
 	}
 	spin_unlock_irq(&callback_lock);
+	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	/*
 	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
 	 */
 	update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
-
 	return 1;
 }
 
@@ -1591,18 +1619,22 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
  */
 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
 {
+	bool isolcpus_updated;
+
 	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
 	WARN_ON_ONCE(!is_remote_partition(cs));
 	WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
 
 	spin_lock_irq(&callback_lock);
 	list_del_init(&cs->remote_sibling);
-	partition_xcpus_del(cs->partition_root_state, NULL, tmp->new_cpus);
+	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
+					       NULL, tmp->new_cpus);
 	cs->partition_root_state = -cs->partition_root_state;
 	if (!cs->prs_err)
 		cs->prs_err = PERR_INVCPUS;
 	reset_partition_data(cs);
 	spin_unlock_irq(&callback_lock);
+	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	/*
 	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1625,6 +1657,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
 {
 	bool adding, deleting;
 	int prs = cs->partition_root_state;
+	int isolcpus_updated = 0;
 
 	if (WARN_ON_ONCE(!is_remote_partition(cs)))
 		return;
@@ -1649,10 +1682,11 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
 
 	spin_lock_irq(&callback_lock);
 	if (adding)
-		partition_xcpus_add(prs, NULL, tmp->addmask);
+		isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
 	if (deleting)
-		partition_xcpus_del(prs, NULL, tmp->delmask);
+		isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
 	spin_unlock_irq(&callback_lock);
+	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	/*
 	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1774,6 +1808,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 	int part_error = PERR_NONE;	/* Partition error? */
 	int subparts_delta = 0;
 	struct cpumask *xcpus;		/* cs effective_xcpus */
+	int isolcpus_updated = 0;
 	bool nocpu;
 
 	lockdep_assert_held(&cpuset_mutex);
@@ -2010,15 +2045,18 @@ write_error:
 	 * and vice versa.
 	 */
 	if (adding)
-		partition_xcpus_del(old_prs, parent, tmp->addmask);
+		isolcpus_updated += partition_xcpus_del(old_prs, parent,
+							tmp->addmask);
 	if (deleting)
-		partition_xcpus_add(new_prs, parent, tmp->delmask);
+		isolcpus_updated += partition_xcpus_add(new_prs, parent,
+							tmp->delmask);
 
 	if (is_partition_valid(parent)) {
 		parent->nr_subparts += subparts_delta;
 		WARN_ON_ONCE(parent->nr_subparts < 0);
 	}
 	spin_unlock_irq(&callback_lock);
+	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	if ((old_prs != new_prs) && (cmd == partcmd_update))
 		update_partition_exclusive(cs, new_prs);
@@ -3082,6 +3120,7 @@ out:
 	else if (new_xcpus_state)
 		partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
 	spin_unlock_irq(&callback_lock);
+	update_unbound_workqueue_cpumask(new_xcpus_state);
 
 	/* Force update if switching back to member */
 	update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
@@ -4370,6 +4409,30 @@ void cpuset_force_rebuild(void)
 	force_rebuild = true;
 }
 
+/*
+ * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
+ * progress.
+ * Return: true if successful, false otherwise
+ *
+ * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+static bool cpuset_hotplug_cpus_read_trylock(void)
+{
+	int retries = 0;
+
+	while (!cpus_read_trylock()) {
+		/*
+		 * CPU hotplug still in progress. Retry 5 times
+		 * with a 10ms wait before bailing out.
+		 */
+		if (++retries > 5)
+			return false;
+		msleep(10);
+	}
+	return true;
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -4386,6 +4449,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 	bool cpus_updated;
 	bool mems_updated;
 	bool remote;
+	int partcmd = -1;
 	struct cpuset *parent;
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -4417,11 +4481,13 @@ retry:
 		compute_partition_effective_cpumask(cs, &new_cpus);
 
 	if (remote && cpumask_empty(&new_cpus) &&
-	    partition_is_populated(cs, NULL)) {
+	    partition_is_populated(cs, NULL) &&
+	    cpuset_hotplug_cpus_read_trylock()) {
 		remote_partition_disable(cs, tmp);
 		compute_effective_cpumask(&new_cpus, cs, parent);
 		remote = false;
 		cpuset_force_rebuild();
+		cpus_read_unlock();
 	}
 
 	/*
@@ -4432,18 +4498,28 @@ retry:
 	 *    partitions.
 	 */
 	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
-				tasks_nocpu_error(parent, cs, &new_cpus))) {
-		update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
-		compute_effective_cpumask(&new_cpus, cs, parent);
-		cpuset_force_rebuild();
-	}
+				tasks_nocpu_error(parent, cs, &new_cpus)))
+		partcmd = partcmd_invalidate;
 	/*
 	 * On the other hand, an invalid partition root may be transitioned
 	 * back to a regular one.
 	 */
-	else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
-		update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
-		if (is_partition_valid(cs)) {
+	else if (is_partition_valid(parent) && is_partition_invalid(cs))
+		partcmd = partcmd_update;
+
+	/*
+	 * cpus_read_lock needs to be held before calling
+	 * update_parent_effective_cpumask(). To avoid circular lock
+	 * dependency between cpuset_mutex and cpus_read_lock,
+	 * cpus_read_trylock() is used here to acquire the lock.
+	 */
+	if (partcmd >= 0) {
+		if (!cpuset_hotplug_cpus_read_trylock())
+			goto update_tasks;
+
+		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+		cpus_read_unlock();
+		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
 			compute_partition_effective_cpumask(cs, &new_cpus);
 			cpuset_force_rebuild();
 		}
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index 2b825019f806..7b7c4c2b6d85 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -232,11 +232,11 @@ TEST_MATRIX=(
 	" C0-3:S+ C1-3:S+ C2-3   C4-5   X2-3  X2-3:P1   P2     P1    0 A1:0-1,A2:,A3:2-3,B1:4-5 \
 								       A1:P0,A2:P1,A3:P2,B1:P1 2-3"
 	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3  X2-3:P1   P2     P1    0 A1:0-1,A2:,A3:2-3,B1:4 \
-								       A1:P0,A2:P1,A3:P2,B1:P1 2-4"
+								       A1:P0,A2:P1,A3:P2,B1:P1 2-4,2-3"
 	" C0-3:S+ C1-3:S+  C3     C4    X2-3  X2-3:P1   P2     P1    0 A1:0-1,A2:2,A3:3,B1:4 \
-								       A1:P0,A2:P1,A3:P2,B1:P1 2-4"
+								       A1:P0,A2:P1,A3:P2,B1:P1 2-4,3"
 	" C0-4:S+ C1-4:S+ C2-4     .    X2-4  X2-4:P2  X4:P1    .    0 A1:0-1,A2:2-3,A3:4 \
-								       A1:P0,A2:P2,A3:P1 2-4"
+								       A1:P0,A2:P2,A3:P1 2-4,2-3"
 	" C0-4:X2-4:S+ C1-4:X2-4:S+:P2 C2-4:X4:P1 \
 				   .      .      X5      .      .    0 A1:0-4,A2:1-4,A3:2-4 \
 								       A1:P0,A2:P-2,A3:P-1"
@@ -248,7 +248,7 @@ TEST_MATRIX=(
 	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:O2=0 .   0 A1:0-1,A2:1,A3:3 A1:P0,A3:P2 2-3"
 	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:O2=0 O2=1 0 A1:0-1,A2:1,A3:2-3 A1:P0,A3:P2 2-3"
 	" C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3    P2:O3=0   .   0 A1:0-2,A2:1-2,A3: A1:P0,A3:P2 3"
-	" C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3   T:P2:O3=0  .   0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3"
+	" C0-3:S+ C1-3:S+  C3      .    X2-3   X2-3   T:P2:O3=0  .   0 A1:0-2,A2:1-2,A3:1-2 A1:P0,A3:P-2 3,"
 
 	# An invalidated remote partition cannot self-recover from hotplug
 	" C0-3:S+ C1-3:S+  C2      .    X2-3   X2-3   T:P2:O2=0 O2=1 0 A1:0-3,A2:1-3,A3:2 A1:P0,A3:P-2"
@@ -376,7 +376,7 @@ write_cpu_online()
 		}
 	fi
 	echo $VAL > $CPUFILE
-	pause 0.01
+	pause 0.05
 }
 
 #
@@ -508,12 +508,14 @@ dump_states()
 		XECPUS=$DIR/cpuset.cpus.exclusive.effective
 		PRS=$DIR/cpuset.cpus.partition
 		PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions
+		ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated
 		[[ -e $CPUS   ]] && echo "$CPUS: $(cat $CPUS)"
 		[[ -e $XCPUS  ]] && echo "$XCPUS: $(cat $XCPUS)"
 		[[ -e $ECPUS  ]] && echo "$ECPUS: $(cat $ECPUS)"
 		[[ -e $XECPUS ]] && echo "$XECPUS: $(cat $XECPUS)"
 		[[ -e $PRS    ]] && echo "$PRS: $(cat $PRS)"
 		[[ -e $PCPUS  ]] && echo "$PCPUS: $(cat $PCPUS)"
+		[[ -e $ISCPUS ]] && echo "$ISCPUS: $(cat $ISCPUS)"
 	done
 }
 
@@ -591,11 +593,17 @@ check_cgroup_states()
 
 #
 # Get isolated (including offline) CPUs by looking at
-# /sys/kernel/debug/sched/domains and compare that with the expected value.
+# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file,
+# if available, and compare that with the expected value.
 #
-# Note that a sched domain of just 1 CPU will be considered isolated.
+# Note that isolated CPUs from the sched/domains context include offline
+# CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may
+# not be included in the *cpuset.cpus.isolated control file which contains
+# only CPUs in isolated partitions.
 #
-# $1 - expected isolated cpu list
+# $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>}
+# <isolcpus1> - expected sched/domains value
+# <isolcpus2> - *cpuset.cpus.isolated value = <isolcpus1> if not defined
 #
 check_isolcpus()
 {
@@ -603,8 +611,38 @@ check_isolcpus()
 	ISOLCPUS=
 	LASTISOLCPU=
 	SCHED_DOMAINS=/sys/kernel/debug/sched/domains
+	ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated
+	if [[ $EXPECT_VAL = . ]]
+	then
+		EXPECT_VAL=
+		EXPECT_VAL2=
+	elif [[ $(expr $EXPECT_VAL : ".*,.*") > 0 ]]
+	then
+		set -- $(echo $EXPECT_VAL | sed -e "s/,/ /g")
+		EXPECT_VAL=$1
+		EXPECT_VAL2=$2
+	else
+		EXPECT_VAL2=$EXPECT_VAL
+	fi
+
+	#
+	# Check the debug isolated cpumask, if present
+	#
+	[[ -f $ISCPUS ]] && {
+		ISOLCPUS=$(cat $ISCPUS)
+		[[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && {
+			# Take a 50ms pause and try again
+			pause 0.05
+			ISOLCPUS=$(cat $ISCPUS)
+		}
+		[[ "$EXPECT_VAL2" != "$ISOLCPUS" ]] && return 1
+		ISOLCPUS=
+	}
+
+	#
+	# Use the sched domain in debugfs to check isolated CPUs, if available
+	#
 	[[ -d $SCHED_DOMAINS ]] || return 0
-	[[ $EXPECT_VAL = . ]] && EXPECT_VAL=
 
 	for ((CPU=0; CPU < $NR_CPUS; CPU++))
 	do
@@ -648,6 +686,22 @@ test_fail()
 	exit 1
 }
 
+#
+# Check to see if there are unexpected isolated CPUs left
+#
+null_isolcpus_check()
+{
+	[[ $VERBOSE -gt 0 ]] || return 0
+	pause 0.02
+	check_isolcpus "."
+	if [[ $? -ne 0 ]]
+	then
+		echo "Unexpected isolated CPUs: $ISOLCPUS"
+		dump_states
+		exit 1
+	fi
+}
+
 #
 # Run cpuset state transition test
 #  $1 - test matrix name
@@ -733,6 +787,7 @@ run_state_test()
 			echo "Effective cpus changed to $NEWLIST after test $I!"
 			exit 1
 		}
+		null_isolcpus_check
 		[[ $VERBOSE -gt 0 ]] && echo "Test $I done."
 		((I++))
 	done
@@ -802,6 +857,7 @@ test_isolated()
 	console_msg "Cleaning up"
 	echo $$ > $CGROUP2/cgroup.procs
 	[[ -d A1 ]] && rmdir A1
+	null_isolcpus_check
 }
 
 #
-- 
cgit v1.2.3


From e76d28bdf9ba5388b8c4835a5199dc427b603188 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Fri, 3 Nov 2023 23:13:01 -0400
Subject: cgroup/rstat: Reduce cpu_lock hold time in
 cgroup_rstat_flush_locked()

When cgroup_rstat_updated() isn't being called concurrently with
cgroup_rstat_flush_locked(), its run time is pretty short. When
both are called concurrently, the cgroup_rstat_updated() run time
can spike to a pretty high value due to high cpu_lock hold time in
cgroup_rstat_flush_locked(). This can be problematic if the task calling
cgroup_rstat_updated() is a realtime task running on an isolated CPU
with a strict latency requirement. The cgroup_rstat_updated() call can
happen when there is a page fault even though the task is running in
user space most of the time.

The percpu cpu_lock is used to protect the update tree -
updated_next and updated_children. This protection is only needed when
cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing
operation which can take a much longer time does not need that protection
as it is already protected by cgroup_rstat_lock.

To reduce the cpu_lock hold time, we need to perform all the
cgroup_rstat_cpu_pop_updated() calls up front with the lock
released afterward before doing any flushing. This patch adds a new
cgroup_rstat_updated_list() function to return a singly linked list of
cgroups to be flushed.

Some instrumentation code are added to measure the cpu_lock hold time
right after lock acquisition to after releasing the lock. Parallel
kernel build on a 2-socket x86-64 server is used as the benchmarking
tool for measuring the lock hold time.

The maximum cpu_lock hold time before and after the patch are 100us and
29us respectively. So the worst case time is reduced to about 30% of
the original. However, there may be some OS or hardware noises like NMI
or SMI in the test system that can worsen the worst case value. Those
noises are usually tuned out in a real production environment to get
a better result.

OTOH, the lock hold time frequency distribution should give a better
idea of the performance benefit of the patch.  Below were the frequency
distribution before and after the patch:

     Hold time        Before patch       After patch
     ---------        ------------       -----------
       0-01 us           804,139         13,738,708
      01-05 us         9,772,767          1,177,194
      05-10 us         4,595,028              4,984
      10-15 us           303,481              3,562
      15-20 us            78,971              1,314
      20-25 us            24,583                 18
      25-30 us             6,908                 12
      30-40 us             8,015
      40-50 us             2,192
      50-60 us               316
      60-70 us                43
      70-80 us                 7
      80-90 us                 2
        >90 us                 3

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  7 +++++++
 kernel/cgroup/rstat.c       | 43 ++++++++++++++++++++++++++++---------------
 2 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4caab0c6b361..37518436cfe7 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -496,6 +496,13 @@ struct cgroup {
 	struct cgroup_rstat_cpu __percpu *rstat_cpu;
 	struct list_head rstat_css_list;
 
+	/*
+	 * A singly-linked list of cgroup structures to be rstat flushed.
+	 * This is a scratch field to be used exclusively by
+	 * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock.
+	 */
+	struct cgroup	*rstat_flush_next;
+
 	/* cgroup basic resource statistics */
 	struct cgroup_base_stat last_bstat;
 	struct cgroup_base_stat bstat;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d80d7a608141..1f300bf4dc40 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -145,6 +145,32 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
 	return pos;
 }
 
+/* Return a list of updated cgroups to be flushed */
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+	struct cgroup *head, *tail, *next;
+	unsigned long flags;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	raw_spin_lock_irqsave(cpu_lock, flags);
+	head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu);
+	while (tail) {
+		next = cgroup_rstat_cpu_pop_updated(tail, root, cpu);
+		tail->rstat_flush_next = next;
+		tail = next;
+	}
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	return head;
+}
+
 /*
  * A hook for bpf stat collectors to attach to and flush their stats.
  * Together with providing bpf kfuncs for cgroup_rstat_updated() and
@@ -179,21 +205,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_rstat_lock);
 
 	for_each_possible_cpu(cpu) {
-		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
-						       cpu);
-		struct cgroup *pos = NULL;
-		unsigned long flags;
+		struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
 
-		/*
-		 * The _irqsave() is needed because cgroup_rstat_lock is
-		 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-		 * this lock with the _irq() suffix only disables interrupts on
-		 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-		 * interrupts on both configurations. The _irqsave() ensures
-		 * that interrupts are always disabled and later restored.
-		 */
-		raw_spin_lock_irqsave(cpu_lock, flags);
-		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+		for (; pos; pos = pos->rstat_flush_next) {
 			struct cgroup_subsys_state *css;
 
 			cgroup_base_stat_flush(pos, cpu);
@@ -205,7 +219,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 				css->ss->css_rstat_flush(css, cpu);
 			rcu_read_unlock();
 		}
-		raw_spin_unlock_irqrestore(cpu_lock, flags);
 
 		/* play nice and yield if necessary */
 		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-- 
cgit v1.2.3


From 022732e3d846e197539712e51ecada90ded0572a Mon Sep 17 00:00:00 2001
From: Chris Riches <chris.riches@nutanix.com>
Date: Wed, 18 Oct 2023 09:23:51 +0000
Subject: audit: Send netlink ACK before setting connection in auditd_set

When auditd_set sets the auditd_conn pointer, audit messages can
immediately be put on the socket by other kernel threads. If the backlog
is large or the rate is high, this can immediately fill the socket
buffer. If the audit daemon requested an ACK for this operation, a full
socket buffer causes the ACK to get dropped, also setting ENOBUFS on the
socket.

To avoid this race and ensure ACKs get through, fast-track the ACK in
this specific case to ensure it is sent before auditd_conn is set.

Signed-off-by: Chris Riches <chris.riches@nutanix.com>
[PM: fix some tab vs space damage]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 16205dd29843..9c8e5f732c4c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -487,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu)
  * @pid: auditd PID
  * @portid: auditd netlink portid
  * @net: auditd network namespace pointer
+ * @skb: the netlink command from the audit daemon
+ * @ack: netlink ack flag, cleared if ack'd here
  *
  * Description:
  * This function will obtain and drop network namespace references as
  * necessary.  Returns zero on success, negative values on failure.
  */
-static int auditd_set(struct pid *pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net,
+		      struct sk_buff *skb, bool *ack)
 {
 	unsigned long flags;
 	struct auditd_connection *ac_old, *ac_new;
+	struct nlmsghdr *nlh;
 
 	if (!pid || !net)
 		return -EINVAL;
@@ -507,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
 	ac_new->portid = portid;
 	ac_new->net = get_net(net);
 
+	/* send the ack now to avoid a race with the queue backlog */
+	if (*ack) {
+		nlh = nlmsg_hdr(skb);
+		netlink_ack(skb, nlh, 0, NULL);
+		*ack = false;
+	}
+
 	spin_lock_irqsave(&auditd_conn_lock, flags);
 	ac_old = rcu_dereference_protected(auditd_conn,
 					   lockdep_is_held(&auditd_conn_lock));
@@ -1200,7 +1211,8 @@ static int audit_replace(struct pid *pid)
 	return auditd_send_unicast_skb(skb);
 }
 
-static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     bool *ack)
 {
 	u32			seq;
 	void			*data;
@@ -1293,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				/* register a new auditd connection */
 				err = auditd_set(req_pid,
 						 NETLINK_CB(skb).portid,
-						 sock_net(NETLINK_CB(skb).sk));
+						 sock_net(NETLINK_CB(skb).sk),
+						 skb, ack);
 				if (audit_enabled != AUDIT_OFF)
 					audit_log_config_change("audit_pid",
 								new_pid,
@@ -1538,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
  * Parse the provided skb and deal with any messages that may be present,
  * malformed skbs are discarded.
  */
-static void audit_receive(struct sk_buff  *skb)
+static void audit_receive(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh;
+	bool ack;
 	/*
 	 * len MUST be signed for nlmsg_next to be able to dec it below 0
 	 * if the nlmsg_len was not aligned
@@ -1553,9 +1567,12 @@ static void audit_receive(struct sk_buff  *skb)
 
 	audit_ctl_lock();
 	while (nlmsg_ok(nlh, len)) {
-		err = audit_receive_msg(skb, nlh);
-		/* if err or if this message says it wants a response */
-		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+		ack = nlh->nlmsg_flags & NLM_F_ACK;
+		err = audit_receive_msg(skb, nlh, &ack);
+
+		/* send an ack if the user asked for one and audit_receive_msg
+		 * didn't already do it, or if there was an error. */
+		if (ack || err)
 			netlink_ack(skb, nlh, err, NULL);
 
 		nlh = nlmsg_next(nlh, &len);
-- 
cgit v1.2.3


From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:49 -0700
Subject: LSM: syscalls for current process attributes

Create a system call lsm_get_self_attr() to provide the security
module maintained attributes of the current process.
Create a system call lsm_set_self_attr() to set a security
module maintained attribute of the current process.
Historically these attributes have been exposed to user space via
entries in procfs under /proc/self/attr.

The attribute value is provided in a lsm_ctx structure. The structure
identifies the size of the attribute, and the attribute value. The format
of the attribute value is defined by the security module. A flags field
is included for LSM specific information. It is currently unused and must
be 0. The total size of the data, including the lsm_ctx structure and any
padding, is maintained as well.

struct lsm_ctx {
        __u64 id;
        __u64 flags;
        __u64 len;
        __u64 ctx_len;
        __u8 ctx[];
};

Two new LSM hooks are used to interface with the LSMs.
security_getselfattr() collects the lsm_ctx values from the
LSMs that support the hook, accounting for space requirements.
security_setselfattr() identifies which LSM the attribute is
intended for and passes it along.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 Documentation/userspace-api/lsm.rst |  70 +++++++++++++++++
 include/linux/lsm_hook_defs.h       |   4 +
 include/linux/lsm_hooks.h           |   1 +
 include/linux/security.h            |  19 +++++
 include/linux/syscalls.h            |   5 ++
 include/uapi/linux/lsm.h            |  36 +++++++++
 kernel/sys_ni.c                     |   2 +
 security/Makefile                   |   1 +
 security/lsm_syscalls.c             |  57 ++++++++++++++
 security/security.c                 | 152 ++++++++++++++++++++++++++++++++++++
 10 files changed, 347 insertions(+)
 create mode 100644 Documentation/userspace-api/lsm.rst
 create mode 100644 security/lsm_syscalls.c

(limited to 'kernel')

diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst
new file mode 100644
index 000000000000..f8499f3e2826
--- /dev/null
+++ b/Documentation/userspace-api/lsm.rst
@@ -0,0 +1,70 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (C) 2022 Casey Schaufler <casey@schaufler-ca.com>
+.. Copyright (C) 2022 Intel Corporation
+
+=====================================
+Linux Security Modules
+=====================================
+
+:Author: Casey Schaufler
+:Date: July 2023
+
+Linux security modules (LSM) provide a mechanism to implement
+additional access controls to the Linux security policies.
+
+The various security modules may support any of these attributes:
+
+``LSM_ATTR_CURRENT`` is the current, active security context of the
+process.
+The proc filesystem provides this value in ``/proc/self/attr/current``.
+This is supported by the SELinux, Smack and AppArmor security modules.
+Smack also provides this value in ``/proc/self/attr/smack/current``.
+AppArmor also provides this value in ``/proc/self/attr/apparmor/current``.
+
+``LSM_ATTR_EXEC`` is the security context of the process at the time the
+current image was executed.
+The proc filesystem provides this value in ``/proc/self/attr/exec``.
+This is supported by the SELinux and AppArmor security modules.
+AppArmor also provides this value in ``/proc/self/attr/apparmor/exec``.
+
+``LSM_ATTR_FSCREATE`` is the security context of the process used when
+creating file system objects.
+The proc filesystem provides this value in ``/proc/self/attr/fscreate``.
+This is supported by the SELinux security module.
+
+``LSM_ATTR_KEYCREATE`` is the security context of the process used when
+creating key objects.
+The proc filesystem provides this value in ``/proc/self/attr/keycreate``.
+This is supported by the SELinux security module.
+
+``LSM_ATTR_PREV`` is the security context of the process at the time the
+current security context was set.
+The proc filesystem provides this value in ``/proc/self/attr/prev``.
+This is supported by the SELinux and AppArmor security modules.
+AppArmor also provides this value in ``/proc/self/attr/apparmor/prev``.
+
+``LSM_ATTR_SOCKCREATE`` is the security context of the process used when
+creating socket objects.
+The proc filesystem provides this value in ``/proc/self/attr/sockcreate``.
+This is supported by the SELinux security module.
+
+Kernel interface
+================
+
+Set a security attribute of the current process
+-----------------------------------------------
+
+.. kernel-doc:: security/lsm_syscalls.c
+    :identifiers: sys_lsm_set_self_attr
+
+Get the specified security attributes of the current process
+------------------------------------------------------------
+
+.. kernel-doc:: security/lsm_syscalls.c
+    :identifiers: sys_lsm_get_self_attr
+
+Additional documentation
+========================
+
+* Documentation/security/lsm.rst
+* Documentation/security/lsm-development.rst
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..c925a0d26edf 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -262,6 +262,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops,
 LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb)
 LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry,
 	 struct inode *inode)
+LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr,
+	 struct lsm_ctx __user *ctx, size_t *size, u32 flags)
+LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr,
+	 struct lsm_ctx *ctx, size_t size, u32 flags)
 LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name,
 	 char **value)
 LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 7f0adb33caaa..a2ade0ffe9e7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -25,6 +25,7 @@
 #ifndef __LINUX_LSM_HOOKS_H
 #define __LINUX_LSM_HOOKS_H
 
+#include <uapi/linux/lsm.h>
 #include <linux/security.h>
 #include <linux/init.h>
 #include <linux/rculist.h>
diff --git a/include/linux/security.h b/include/linux/security.h
index c81bca77f4f2..dd1fe487385d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -60,6 +60,7 @@ struct fs_parameter;
 enum fs_value_type;
 struct watch;
 struct watch_notification;
+struct lsm_ctx;
 
 /* Default (no) options for the capable function */
 #define CAP_OPT_NONE 0x0
@@ -472,6 +473,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd);
 int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
 			unsigned nsops, int alter);
 void security_d_instantiate(struct dentry *dentry, struct inode *inode);
+int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+			 size_t __user *size, u32 flags);
+int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+			 size_t size, u32 flags);
 int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
 			 char **value);
 int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
@@ -1338,6 +1343,20 @@ static inline void security_d_instantiate(struct dentry *dentry,
 					  struct inode *inode)
 { }
 
+static inline int security_getselfattr(unsigned int attr,
+				       struct lsm_ctx __user *ctx,
+				       size_t __user *size, u32 flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int security_setselfattr(unsigned int attr,
+				       struct lsm_ctx __user *ctx,
+				       size_t size, u32 flags)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int security_getprocattr(struct task_struct *p, int lsmid,
 				       const char *name, char **value)
 {
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fd9d12de7e92..4e1e56a24f1e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -71,6 +71,7 @@ struct clone_args;
 struct open_how;
 struct mount_attr;
 struct landlock_ruleset_attr;
+struct lsm_ctx;
 enum landlock_rule_type;
 struct cachestat_range;
 struct cachestat;
@@ -949,6 +950,10 @@ asmlinkage long sys_cachestat(unsigned int fd,
 		struct cachestat_range __user *cstat_range,
 		struct cachestat __user *cstat, unsigned int flags);
 asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags);
+asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
+				      size_t *size, __u32 flags);
+asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
+				      size_t size, __u32 flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/lsm.h b/include/uapi/linux/lsm.h
index f27c9a9cc376..eeda59a77c02 100644
--- a/include/uapi/linux/lsm.h
+++ b/include/uapi/linux/lsm.h
@@ -9,6 +9,36 @@
 #ifndef _UAPI_LINUX_LSM_H
 #define _UAPI_LINUX_LSM_H
 
+#include <linux/types.h>
+#include <linux/unistd.h>
+
+/**
+ * struct lsm_ctx - LSM context information
+ * @id: the LSM id number, see LSM_ID_XXX
+ * @flags: LSM specific flags
+ * @len: length of the lsm_ctx struct, @ctx and any other data or padding
+ * @ctx_len: the size of @ctx
+ * @ctx: the LSM context value
+ *
+ * The @len field MUST be equal to the size of the lsm_ctx struct
+ * plus any additional padding and/or data placed after @ctx.
+ *
+ * In all cases @ctx_len MUST be equal to the length of @ctx.
+ * If @ctx is a string value it should be nul terminated with
+ * @ctx_len equal to `strlen(@ctx) + 1`.  Binary values are
+ * supported.
+ *
+ * The @flags and @ctx fields SHOULD only be interpreted by the
+ * LSM specified by @id; they MUST be set to zero/0 when not used.
+ */
+struct lsm_ctx {
+	__u64 id;
+	__u64 flags;
+	__u64 len;
+	__u64 ctx_len;
+	__u8 ctx[];
+};
+
 /*
  * ID tokens to identify Linux Security Modules (LSMs)
  *
@@ -51,4 +81,10 @@
 #define LSM_ATTR_PREV		104
 #define LSM_ATTR_SOCKCREATE	105
 
+/*
+ * LSM_FLAG_XXX definitions identify special handling instructions
+ * for the API.
+ */
+#define LSM_FLAG_SINGLE	0x0001
+
 #endif /* _UAPI_LINUX_LSM_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e1a6e3c675c0..1f61b8452a6e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -171,6 +171,8 @@ COND_SYSCALL(landlock_add_rule);
 COND_SYSCALL(landlock_restrict_self);
 COND_SYSCALL(fadvise64_64);
 COND_SYSCALL_COMPAT(fadvise64_64);
+COND_SYSCALL(lsm_get_self_attr);
+COND_SYSCALL(lsm_set_self_attr);
 
 /* CONFIG_MMU only */
 COND_SYSCALL(swapon);
diff --git a/security/Makefile b/security/Makefile
index 18121f8f85cd..59f238490665 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_KEYS)			+= keys/
 
 # always enable default capabilities
 obj-y					+= commoncap.o
+obj-$(CONFIG_SECURITY) 			+= lsm_syscalls.o
 obj-$(CONFIG_MMU)			+= min_addr.o
 
 # Object file lists
diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c
new file mode 100644
index 000000000000..226ae80d9683
--- /dev/null
+++ b/security/lsm_syscalls.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * System calls implementing the Linux Security Module API.
+ *
+ *  Copyright (C) 2022 Casey Schaufler <casey@schaufler-ca.com>
+ *  Copyright (C) 2022 Intel Corporation
+ */
+
+#include <asm/current.h>
+#include <linux/compiler_types.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/security.h>
+#include <linux/stddef.h>
+#include <linux/syscalls.h>
+#include <linux/types.h>
+#include <linux/lsm_hooks.h>
+#include <uapi/linux/lsm.h>
+
+/**
+ * sys_lsm_set_self_attr - Set current task's security module attribute
+ * @attr: which attribute to set
+ * @ctx: the LSM contexts
+ * @size: size of @ctx
+ * @flags: reserved for future use
+ *
+ * Sets the calling task's LSM context. On success this function
+ * returns 0. If the attribute specified cannot be set a negative
+ * value indicating the reason for the error is returned.
+ */
+SYSCALL_DEFINE4(lsm_set_self_attr, unsigned int, attr, struct lsm_ctx __user *,
+		ctx, size_t, size, u32, flags)
+{
+	return security_setselfattr(attr, ctx, size, flags);
+}
+
+/**
+ * sys_lsm_get_self_attr - Return current task's security module attributes
+ * @attr: which attribute to return
+ * @ctx: the user-space destination for the information, or NULL
+ * @size: pointer to the size of space available to receive the data
+ * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
+ * attributes associated with the LSM identified in the passed @ctx be
+ * reported.
+ *
+ * Returns the calling task's LSM contexts. On success this
+ * function returns the number of @ctx array elements. This value
+ * may be zero if there are no LSM contexts assigned. If @size is
+ * insufficient to contain the return data -E2BIG is returned and
+ * @size is set to the minimum required size. In all other cases
+ * a negative value indicating the error is returned.
+ */
+SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *,
+		ctx, size_t __user *, size, u32, flags)
+{
+	return security_getselfattr(attr, ctx, size, flags);
+}
diff --git a/security/security.c b/security/security.c
index c66f9faefa40..9757d009113f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -3837,6 +3837,158 @@ void security_d_instantiate(struct dentry *dentry, struct inode *inode)
 }
 EXPORT_SYMBOL(security_d_instantiate);
 
+/*
+ * Please keep this in sync with it's counterpart in security/lsm_syscalls.c
+ */
+
+/**
+ * security_getselfattr - Read an LSM attribute of the current process.
+ * @attr: which attribute to return
+ * @uctx: the user-space destination for the information, or NULL
+ * @size: pointer to the size of space available to receive the data
+ * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
+ * attributes associated with the LSM identified in the passed @ctx be
+ * reported.
+ *
+ * A NULL value for @uctx can be used to get both the number of attributes
+ * and the size of the data.
+ *
+ * Returns the number of attributes found on success, negative value
+ * on error. @size is reset to the total size of the data.
+ * If @size is insufficient to contain the data -E2BIG is returned.
+ */
+int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
+			 size_t __user *size, u32 flags)
+{
+	struct security_hook_list *hp;
+	struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, };
+	u8 __user *base = (u8 __user *)uctx;
+	size_t total = 0;
+	size_t entrysize;
+	size_t left;
+	bool toobig = false;
+	bool single = false;
+	int count = 0;
+	int rc;
+
+	if (attr == LSM_ATTR_UNDEF)
+		return -EINVAL;
+	if (size == NULL)
+		return -EINVAL;
+	if (get_user(left, size))
+		return -EFAULT;
+
+	if (flags) {
+		/*
+		 * Only flag supported is LSM_FLAG_SINGLE
+		 */
+		if (flags != LSM_FLAG_SINGLE)
+			return -EINVAL;
+		if (uctx && copy_from_user(&lctx, uctx, sizeof(lctx)))
+			return -EFAULT;
+		/*
+		 * If the LSM ID isn't specified it is an error.
+		 */
+		if (lctx.id == LSM_ID_UNDEF)
+			return -EINVAL;
+		single = true;
+	}
+
+	/*
+	 * In the usual case gather all the data from the LSMs.
+	 * In the single case only get the data from the LSM specified.
+	 */
+	hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) {
+		if (single && lctx.id != hp->lsmid->id)
+			continue;
+		entrysize = left;
+		if (base)
+			uctx = (struct lsm_ctx __user *)(base + total);
+		rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags);
+		if (rc == -EOPNOTSUPP) {
+			rc = 0;
+			continue;
+		}
+		if (rc == -E2BIG) {
+			toobig = true;
+			left = 0;
+		} else if (rc < 0)
+			return rc;
+		else
+			left -= entrysize;
+
+		total += entrysize;
+		count += rc;
+		if (single)
+			break;
+	}
+	if (put_user(total, size))
+		return -EFAULT;
+	if (toobig)
+		return -E2BIG;
+	if (count == 0)
+		return LSM_RET_DEFAULT(getselfattr);
+	return count;
+}
+
+/*
+ * Please keep this in sync with it's counterpart in security/lsm_syscalls.c
+ */
+
+/**
+ * security_setselfattr - Set an LSM attribute on the current process.
+ * @attr: which attribute to set
+ * @uctx: the user-space source for the information
+ * @size: the size of the data
+ * @flags: reserved for future use, must be 0
+ *
+ * Set an LSM attribute for the current process. The LSM, attribute
+ * and new value are included in @uctx.
+ *
+ * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT
+ * if the user buffer is inaccessible, E2BIG if size is too big, or an
+ * LSM specific failure.
+ */
+int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
+			 size_t size, u32 flags)
+{
+	struct security_hook_list *hp;
+	struct lsm_ctx *lctx;
+	int rc = LSM_RET_DEFAULT(setselfattr);
+
+	if (flags)
+		return -EINVAL;
+	if (size < sizeof(*lctx))
+		return -EINVAL;
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	lctx = kmalloc(size, GFP_KERNEL);
+	if (lctx == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(lctx, uctx, size)) {
+		rc = -EFAULT;
+		goto free_out;
+	}
+
+	if (size < lctx->len || size < lctx->ctx_len + sizeof(*lctx) ||
+	    lctx->len < lctx->ctx_len + sizeof(*lctx)) {
+		rc = -EINVAL;
+		goto free_out;
+	}
+
+	hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list)
+		if ((hp->lsmid->id) == lctx->id) {
+			rc = hp->hook.setselfattr(attr, lctx, size, flags);
+			break;
+		}
+
+free_out:
+	kfree(lctx);
+	return rc;
+}
+
 /**
  * security_getprocattr() - Read an attribute for a task
  * @p: the task
-- 
cgit v1.2.3


From ad4aff9ec25f400608283c10d634cc4eeda83a02 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 12 Sep 2023 13:56:50 -0700
Subject: LSM: Create lsm_list_modules system call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a system call to report the list of Linux Security Modules
that are active on the system. The list is provided as an array
of LSM ID numbers.

The calling application can use this list determine what LSM
specific actions it might take. That might include choosing an
output format, determining required privilege or bypassing
security module specific behavior.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: John Johansen <john.johansen@canonical.com>
Reviewed-by: Mickaël Salaün <mic@digikod.net>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 Documentation/userspace-api/lsm.rst |  3 +++
 include/linux/syscalls.h            |  1 +
 kernel/sys_ni.c                     |  1 +
 security/lsm_syscalls.c             | 39 +++++++++++++++++++++++++++++++++++++
 4 files changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/userspace-api/lsm.rst b/Documentation/userspace-api/lsm.rst
index f8499f3e2826..a76da373841b 100644
--- a/Documentation/userspace-api/lsm.rst
+++ b/Documentation/userspace-api/lsm.rst
@@ -63,6 +63,9 @@ Get the specified security attributes of the current process
 .. kernel-doc:: security/lsm_syscalls.c
     :identifiers: sys_lsm_get_self_attr
 
+.. kernel-doc:: security/lsm_syscalls.c
+    :identifiers: sys_lsm_list_modules
+
 Additional documentation
 ========================
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4e1e56a24f1e..feec5719750b 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -954,6 +954,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 				      size_t *size, __u32 flags);
 asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx,
 				      size_t size, __u32 flags);
+asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags);
 
 /*
  * Architecture-specific system calls
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1f61b8452a6e..9fa5989bf2ce 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -173,6 +173,7 @@ COND_SYSCALL(fadvise64_64);
 COND_SYSCALL_COMPAT(fadvise64_64);
 COND_SYSCALL(lsm_get_self_attr);
 COND_SYSCALL(lsm_set_self_attr);
+COND_SYSCALL(lsm_list_modules);
 
 /* CONFIG_MMU only */
 COND_SYSCALL(swapon);
diff --git a/security/lsm_syscalls.c b/security/lsm_syscalls.c
index 226ae80d9683..329aaca5efc0 100644
--- a/security/lsm_syscalls.c
+++ b/security/lsm_syscalls.c
@@ -55,3 +55,42 @@ SYSCALL_DEFINE4(lsm_get_self_attr, unsigned int, attr, struct lsm_ctx __user *,
 {
 	return security_getselfattr(attr, ctx, size, flags);
 }
+
+/**
+ * sys_lsm_list_modules - Return a list of the active security modules
+ * @ids: the LSM module ids
+ * @size: pointer to size of @ids, updated on return
+ * @flags: reserved for future use, must be zero
+ *
+ * Returns a list of the active LSM ids. On success this function
+ * returns the number of @ids array elements. This value may be zero
+ * if there are no LSMs active. If @size is insufficient to contain
+ * the return data -E2BIG is returned and @size is set to the minimum
+ * required size. In all other cases a negative value indicating the
+ * error is returned.
+ */
+SYSCALL_DEFINE3(lsm_list_modules, u64 __user *, ids, size_t __user *, size,
+		u32, flags)
+{
+	size_t total_size = lsm_active_cnt * sizeof(*ids);
+	size_t usize;
+	int i;
+
+	if (flags)
+		return -EINVAL;
+
+	if (get_user(usize, size))
+		return -EFAULT;
+
+	if (put_user(total_size, size) != 0)
+		return -EFAULT;
+
+	if (usize < total_size)
+		return -E2BIG;
+
+	for (i = 0; i < lsm_active_cnt; i++)
+		if (put_user(lsm_idlist[i]->id, ids++))
+			return -EFAULT;
+
+	return lsm_active_cnt;
+}
-- 
cgit v1.2.3


From fe977716b40cb98cf9c91a66454adf3dc2f8c59a Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sat, 11 Nov 2023 09:00:29 +0000
Subject: bpf: Add a new kfunc for cgroup1 hierarchy

A new kfunc is added to acquire cgroup1 of a task:

- bpf_task_get_cgroup1
  Acquires the associated cgroup of a task whithin a specific cgroup1
  hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID.

This new kfunc enables the tracing of tasks within a designated
container or cgroup directory in BPF programs.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231111090034.4248-2-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 03517db5cfb3..b45a8381f9bd 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2228,6 +2228,25 @@ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task,
 	rcu_read_unlock();
 	return ret;
 }
+
+/**
+ * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @task: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returen. On failure, NULL is returned.
+ */
+__bpf_kfunc struct cgroup *
+bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
+{
+	struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id);
+
+	if (IS_ERR(cgrp))
+		return NULL;
+	return cgrp;
+}
 #endif /* CONFIG_CGROUPS */
 
 /**
@@ -2534,6 +2553,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
+BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
-- 
cgit v1.2.3


From eab03c23c2a162085b13200d7942fc5a00b5ccc8 Mon Sep 17 00:00:00 2001
From: Abel Wu <wuyun.abel@bytedance.com>
Date: Tue, 7 Nov 2023 17:05:07 +0800
Subject: sched/eevdf: Fix vruntime adjustment on reweight

vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
it gets re-weighted, and the calculations can be simplified based
on the fact that re-weight won't change the w-average of all the
entities. Please check the proofs in comments.

But adjusting vruntime can also cause position change in RB-tree
hence require re-queue to fix up which might be costly. This might
be avoided by deferring adjustment to the time the entity actually
leaves tree (dequeue/pick), but that will negatively affect task
selection and probably not good enough either.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com
---
 kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 128 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2048138ce54b..025d90925bf6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3666,41 +3666,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			   unsigned long weight)
+{
+	unsigned long old_weight = se->load.weight;
+	u64 avruntime = avg_vruntime(cfs_rq);
+	s64 vlag, vslice;
+
+	/*
+	 * VRUNTIME
+	 * ========
+	 *
+	 * COROLLARY #1: The virtual runtime of the entity needs to be
+	 * adjusted if re-weight at !0-lag point.
+	 *
+	 * Proof: For contradiction assume this is not true, so we can
+	 * re-weight without changing vruntime at !0-lag point.
+	 *
+	 *             Weight	VRuntime   Avg-VRuntime
+	 *     before    w          v            V
+	 *      after    w'         v'           V'
+	 *
+	 * Since lag needs to be preserved through re-weight:
+	 *
+	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
+	 *	==>	V' = (V - v)*w/w' + v		(1)
+	 *
+	 * Let W be the total weight of the entities before reweight,
+	 * since V' is the new weighted average of entities:
+	 *
+	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
+	 *
+	 * by using (1) & (2) we obtain:
+	 *
+	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+	 *
+	 * Since we are doing at !0-lag point which means V != v, we
+	 * can simplify (3):
+	 *
+	 *	==>	W / (W + w' - w) = w / w'
+	 *	==>	Ww' = Ww + ww' - ww
+	 *	==>	W * (w' - w) = w * (w' - w)
+	 *	==>	W = w	(re-weight indicates w' != w)
+	 *
+	 * So the cfs_rq contains only one entity, hence vruntime of
+	 * the entity @v should always equal to the cfs_rq's weighted
+	 * average vruntime @V, which means we will always re-weight
+	 * at 0-lag point, thus breach assumption. Proof completed.
+	 *
+	 *
+	 * COROLLARY #2: Re-weight does NOT affect weighted average
+	 * vruntime of all the entities.
+	 *
+	 * Proof: According to corollary #1, Eq. (1) should be:
+	 *
+	 *	(V - v)*w = (V' - v')*w'
+	 *	==>    v' = V' - (V - v)*w/w'		(4)
+	 *
+	 * According to the weighted average formula, we have:
+	 *
+	 *	V' = (WV - wv + w'v') / (W - w + w')
+	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+	 *	   = (WV + w'V' - Vw) / (W - w + w')
+	 *
+	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
+	 *	==>	V' * (W - w) = (W - w) * V	(5)
+	 *
+	 * If the entity is the only one in the cfs_rq, then reweight
+	 * always occurs at 0-lag point, so V won't change. Or else
+	 * there are other entities, hence W != w, then Eq. (5) turns
+	 * into V' = V. So V won't change in either case, proof done.
+	 *
+	 *
+	 * So according to corollary #1 & #2, the effect of re-weight
+	 * on vruntime should be:
+	 *
+	 *	v' = V' - (V - v) * w / w'		(4)
+	 *	   = V  - (V - v) * w / w'
+	 *	   = V  - vl * w / w'
+	 *	   = V  - vl'
+	 */
+	if (avruntime != se->vruntime) {
+		vlag = (s64)(avruntime - se->vruntime);
+		vlag = div_s64(vlag * old_weight, weight);
+		se->vruntime = avruntime - vlag;
+	}
+
+	/*
+	 * DEADLINE
+	 * ========
+	 *
+	 * When the weight changes, the virtual time slope changes and
+	 * we should adjust the relative virtual deadline accordingly.
+	 *
+	 *	d' = v' + (d - v)*w/w'
+	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  + (d - V)*w/w'
+	 */
+	vslice = (s64)(se->deadline - avruntime);
+	vslice = div_s64(vslice * old_weight, weight);
+	se->deadline = avruntime + vslice;
+}
+
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
-	unsigned long old_weight = se->load.weight;
+	bool curr = cfs_rq->curr == se;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
-		if (cfs_rq->curr == se)
+		if (curr)
 			update_curr(cfs_rq);
 		else
-			avg_vruntime_sub(cfs_rq, se);
+			__dequeue_entity(cfs_rq, se);
 		update_load_sub(&cfs_rq->load, se->load.weight);
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	update_load_set(&se->load, weight);
-
 	if (!se->on_rq) {
 		/*
 		 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
 		 * we need to scale se->vlag when w_i changes.
 		 */
-		se->vlag = div_s64(se->vlag * old_weight, weight);
+		se->vlag = div_s64(se->vlag * se->load.weight, weight);
 	} else {
-		s64 deadline = se->deadline - se->vruntime;
-		/*
-		 * When the weight changes, the virtual time slope changes and
-		 * we should adjust the relative virtual deadline accordingly.
-		 */
-		deadline = div_s64(deadline * old_weight, weight);
-		se->deadline = se->vruntime + deadline;
-		if (se != cfs_rq->curr)
-			min_deadline_cb_propagate(&se->run_node, NULL);
+		reweight_eevdf(cfs_rq, se, weight);
 	}
 
+	update_load_set(&se->load, weight);
+
 #ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-		if (cfs_rq->curr != se)
-			avg_vruntime_add(cfs_rq, se);
+		if (!curr) {
+			/*
+			 * The entity's vruntime has been adjusted, so let's check
+			 * whether the rq-wide min_vruntime needs updated too. Since
+			 * the calculations above require stable min_vruntime rather
+			 * than up-to-date one, we do the update at the end of the
+			 * reweight process.
+			 */
+			__enqueue_entity(cfs_rq, se);
+			update_min_vruntime(cfs_rq);
+		}
 	}
 }
 
@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
 
 #ifndef CONFIG_SMP
 	shares = READ_ONCE(gcfs_rq->tg->shares);
-
-	if (likely(se->load.weight == shares))
-		return;
 #else
-	shares   = calc_group_shares(gcfs_rq);
+	shares = calc_group_shares(gcfs_rq);
 #endif
-
-	reweight_entity(cfs_rq_of(se), se, shares);
+	if (unlikely(se->load.weight != shares))
+		reweight_entity(cfs_rq_of(se), se, shares);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-- 
cgit v1.2.3


From 8b39d20eceeda6c4eb23df1497f9ed2fffdc8f69 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 26 Oct 2023 12:41:14 -0400
Subject: sched: psi: fix unprivileged polling against cgroups

519fabc7aaba ("psi: remove 500ms min window size limitation for
triggers") breaks unprivileged psi polling on cgroups.

Historically, we had a privilege check for polling in the open() of a
pressure file in /proc, but were erroneously missing it for the open()
of cgroup pressure files.

When unprivileged polling was introduced in d82caa273565 ("sched/psi:
Allow unprivileged polling of N*2s period"), it needed to filter
privileges depending on the exact polling parameters, and as such
moved the CAP_SYS_RESOURCE check from the proc open() callback to
psi_trigger_create(). Both the proc files as well as cgroup files go
through this during write(). This implicitly added the missing check
for privileges required for HT polling for cgroups.

When 519fabc7aaba ("psi: remove 500ms min window size limitation for
triggers") followed right after to remove further restrictions on the
RT polling window, it incorrectly assumed the cgroup privilege check
was still missing and added it to the cgroup open(), mirroring what we
used to do for proc files in the past.

As a result, unprivileged poll requests that would be supported now
get rejected when opening the cgroup pressure file for writing.

Remove the cgroup open() check. psi_trigger_create() handles it.

Fixes: 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers")
Reported-by: Luca Boccassi <bluca@debian.org>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Luca Boccassi <bluca@debian.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: stable@vger.kernel.org # 6.5+
Link: https://lore.kernel.org/r/20231026164114.2488682-1-hannes@cmpxchg.org
---
 kernel/cgroup/cgroup.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1d5b9de3b1b9..4b9ff41ca603 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
 	return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
 }
 
-static int cgroup_pressure_open(struct kernfs_open_file *of)
-{
-	if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
-		return -EPERM;
-
-	return 0;
-}
-
 static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	struct cgroup_file_ctx *ctx = of->priv;
@@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "io.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
-		.open = cgroup_pressure_open,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "memory.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
-		.open = cgroup_pressure_open,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "cpu.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
-		.open = cgroup_pressure_open,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = {
 	{
 		.name = "irq.pressure",
 		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
-		.open = cgroup_pressure_open,
 		.seq_show = cgroup_irq_pressure_show,
 		.write = cgroup_irq_pressure_write,
 		.poll = cgroup_pressure_poll,
-- 
cgit v1.2.3


From 6d7e4782bcf549221b4ccfffec2cf4d1a473f1a3 Mon Sep 17 00:00:00 2001
From: Keisuke Nishimura <keisuke.nishimura@inria.fr>
Date: Tue, 31 Oct 2023 14:38:22 +0100
Subject: sched/fair: Fix the decision for load balance

should_we_balance is called for the decision to do load-balancing.
When sched ticks invoke this function, only one CPU should return
true. However, in the current code, two CPUs can return true. The
following situation, where b means busy and i means idle, is an
example, because CPU 0 and CPU 2 return true.

        [0, 1] [2, 3]
         b  b   i  b

This fix checks if there exists an idle CPU with busy sibling(s)
after looking for a CPU on an idle core. If some idle CPUs with busy
siblings are found, just the first one should do load-balancing.

Fixes: b1bfeab9b002 ("sched/fair: Consider the idle state of the whole core for load balance")
Signed-off-by: Keisuke Nishimura <keisuke.nishimura@inria.fr>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.vnet.ibm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/20231031133821.1570861-1-keisuke.nishimura@inria.fr
---
 kernel/sched/fair.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 025d90925bf6..d7a3c63a2171 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11184,12 +11184,16 @@ static int should_we_balance(struct lb_env *env)
 			continue;
 		}
 
-		/* Are we the first idle CPU? */
+		/*
+		 * Are we the first idle core in a non-SMT domain or higher,
+		 * or the first idle CPU in a SMT domain?
+		 */
 		return cpu == env->dst_cpu;
 	}
 
-	if (idle_smt == env->dst_cpu)
-		return true;
+	/* Are we the first idle CPU with busy siblings? */
+	if (idle_smt != -1)
+		return idle_smt == env->dst_cpu;
 
 	/* Are we the first CPU of this group ? */
 	return group_balance_cpu(sg) == env->dst_cpu;
-- 
cgit v1.2.3


From 969d90ec212bae4b45bf9d21d7daa30aa6cf055e Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 14 Nov 2023 17:25:48 -0500
Subject: audit: don't WARN_ON_ONCE(!current->mm) in audit_exe_compare()

eBPF can end up calling into the audit code from some odd places, and
some of these places don't have @current set properly so we end up
tripping the `WARN_ON_ONCE(!current->mm)` near the top of
`audit_exe_compare()`.  While the basic `!current->mm` check is good,
the `WARN_ON_ONCE()` results in some scary console messages so let's
drop that and just do the regular `!current->mm` check to avoid
problems.

Cc: <stable@vger.kernel.org>
Fixes: 47846d51348d ("audit: don't take task_lock() in audit_exe_compare() code path")
Reported-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_watch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 91e82e34b51e..7a98cd176a12 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	if (tsk != current)
 		return 0;
 
-	if (WARN_ON_ONCE(!current->mm))
+	if (!current->mm)
 		return 0;
 	exe_file = get_mm_exe_file(current->mm);
 	if (!exe_file)
-- 
cgit v1.2.3


From c9bd1568d5462f4108417518ce1af7b924acfb6f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 14 Nov 2023 21:36:13 +0100
Subject: futex: Fix hardcoded flags

Xi reported that commit 5694289ce183 ("futex: Flag conversion") broke
glibc's robust futex tests.

This was narrowed down to the change of FLAGS_SHARED from 0x01 to
0x10, at which point Florian noted that handle_futex_death() has a
hardcoded flags argument of 1.

Change this to: FLAGS_SIZE_32 | FLAGS_SHARED, matching how
futex_to_flags() unconditionally sets FLAGS_SIZE_32 for all legacy
futex ops.

Reported-by: Xi Ruoyao <xry111@xry111.site>
Reported-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20231114201402.GA25315@noisy.programming.kicks-ass.net
Fixes: 5694289ce183 ("futex: Flag conversion")
Cc: <stable@vger.kernel.org>
---
 kernel/futex/core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 52695c59d041..dad981a865b8 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -700,7 +700,8 @@ retry:
 	owner = uval & FUTEX_TID_MASK;
 
 	if (pending_op && !pi && !owner) {
-		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+			   FUTEX_BITSET_MATCH_ANY);
 		return 0;
 	}
 
@@ -752,8 +753,10 @@ retry:
 	 * Wake robust non-PI futexes here. The wakeup of
 	 * PI futexes happens in exit_pi_state():
 	 */
-	if (!pi && (uval & FUTEX_WAITERS))
-		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+	if (!pi && (uval & FUTEX_WAITERS)) {
+		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+			   FUTEX_BITSET_MATCH_ANY);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 889c58b3155ff4c8e8671c95daef63d6fabbb6b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Jun 2023 12:34:46 +0200
Subject: perf/core: Fix cpuctx refcounting

Audit of the refcounting turned up that perf_pmu_migrate_context()
fails to migrate the ctx refcount.

Fixes: bd2756811766 ("perf: Rewrite core context handling")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20230612093539.085862001@infradead.org
Cc: <stable@vger.kernel.org>
---
 include/linux/perf_event.h | 13 ++++++++-----
 kernel/events/core.c       | 17 +++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index afb028c54f33..5547ba68e6e4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -843,11 +843,11 @@ struct perf_event {
 };
 
 /*
- *           ,-----------------------[1:n]----------------------.
- *           V                                                  V
- * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
- *           ^                      ^     |                     |
- *           `--------[1:n]---------'     `-[n:1]-> pmu <-[1:n]-'
+ *           ,-----------------------[1:n]------------------------.
+ *           V                                                    V
+ * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
+ *                                        |                       |
+ *                                        `--[n:1]-> pmu <-[1:n]--'
  *
  *
  * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
@@ -865,6 +865,9 @@ struct perf_event {
  * ctx->mutex pinning the configuration. Since we hold a reference on
  * group_leader (through the filedesc) it can't go away, therefore it's
  * associated pmu_ctx must exist and cannot change due to ctx->mutex.
+ *
+ * perf_event holds a refcount on perf_event_context
+ * perf_event holds a refcount on perf_event_pmu_context
  */
 struct perf_event_pmu_context {
 	struct pmu			*pmu;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 683dc086ef10..b704d83a28b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
 	void *task_ctx_data = NULL;
 
 	if (!ctx->task) {
+		/*
+		 * perf_pmu_migrate_context() / __perf_pmu_install_event()
+		 * relies on the fact that find_get_pmu_context() cannot fail
+		 * for CPU contexts.
+		 */
 		struct perf_cpu_pmu_context *cpc;
 
 		cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
@@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu,
 				     int cpu, struct perf_event *event)
 {
 	struct perf_event_pmu_context *epc;
+	struct perf_event_context *old_ctx = event->ctx;
+
+	get_ctx(ctx); /* normally find_get_context() */
 
 	event->cpu = cpu;
 	epc = find_get_pmu_context(pmu, ctx, event);
@@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu,
 	if (event->state >= PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_INACTIVE;
 	perf_install_in_context(ctx, event, cpu);
+
+	/*
+	 * Now that event->ctx is updated and visible, put the old ctx.
+	 */
+	put_ctx(old_ctx);
 }
 
 static void __perf_pmu_install(struct perf_event_context *ctx,
@@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 	struct perf_event_context *src_ctx, *dst_ctx;
 	LIST_HEAD(events);
 
+	/*
+	 * Since per-cpu context is persistent, no need to grab an extra
+	 * reference.
+	 */
 	src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
 	dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
 
-- 
cgit v1.2.3


From d6111cf45c5787282b2e20d77bdb6b28881d516a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 31 Oct 2023 11:12:01 -0700
Subject: sched: Use WRITE_ONCE() for p->on_rq

Since RCU-tasks uses READ_ONCE(p->on_rq), ensure the write-side
matches with WRITE_ONCE().

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/e4896e0b-eacc-45a2-a7a8-de2280a51ecc@paulmck-laptop
---
 kernel/sched/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..9d5099d02dbc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2124,12 +2124,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_task(rq, p, flags);
 
-	p->on_rq = TASK_ON_RQ_QUEUED;
+	WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+	WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
 
 	dequeue_task(rq, p, flags);
 }
-- 
cgit v1.2.3


From 84db47ca7146d7bd00eb5cf2b93989a971c84650 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@amd.com>
Date: Fri, 20 Oct 2023 21:27:46 +0530
Subject: sched/numa: Fix mm numa_scan_seq based unconditional scan

Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic")

NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the
task had previously accessed VMA. However unconditional scan of VMAs are
allowed during initial phase of VMA creation until process's
mm numa_scan_seq reaches 2 even though current task had not accessed VMA.

Rationale:
 - Without initial scan subsequent PTE update may never happen.
 - Give fair opportunity to all the VMAs to be scanned and subsequently
understand the access pattern of all the VMAs.

But it has a corner case where, if a VMA is created after some time,
process's mm numa_scan_seq could be already greater than 2.

For e.g., values of mm numa_scan_seq when VMAs are created by running
mmtest autonuma benchmark briefly looks like:
start_seq=0 : 459
start_seq=2 : 138
start_seq=3 : 144
start_seq=4 : 8
start_seq=8 : 1
start_seq=9 : 1
This results in no unconditional PTE updates for those VMAs created after
some time.

Fix:
 - Note down the initial value of mm numa_scan_seq in per VMA start_seq.
 - Allow unconditional scan till start_seq + 2.

Result:
SUT: AMD EPYC Milan with 2 NUMA nodes 256 cpus.
base kernel: upstream 6.6-rc6 with Mels patches [1] applied.

kernbench
==========		base                  patched %gain
Amean    elsp-128      165.09 ( 0.00%)      164.78 *   0.19%*

Duration User       41404.28    41375.08
Duration System      9862.22     9768.48
Duration Elapsed      519.87      518.72

Ops NUMA PTE updates           1041416.00      831536.00
Ops NUMA hint faults            263296.00      220966.00
Ops NUMA pages migrated         258021.00      212769.00
Ops AutoNUMA cost                 1328.67        1114.69

autonumabench

NUMA01_THREADLOCAL
==================
Amean  elsp-NUMA01_THREADLOCAL   81.79 (0.00%)  67.74 *  17.18%*

Duration User       54832.73    47379.67
Duration System        75.00      185.75
Duration Elapsed      576.72      476.09

Ops NUMA PTE updates                  394429.00    11121044.00
Ops NUMA hint faults                    1001.00     8906404.00
Ops NUMA pages migrated                  288.00     2998694.00
Ops AutoNUMA cost                          7.77       44666.84

Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.1697816692.git.raghavendra.kt@amd.com
---
 include/linux/mm_types.h | 3 +++
 kernel/sched/fair.c      | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..950df415d7de 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -600,6 +600,9 @@ struct vma_numab_state {
 	 */
 	unsigned long pids_active[2];
 
+	/* MM scan sequence ID when scan first started after VMA creation */
+	int start_scan_seq;
+
 	/*
 	 * MM scan sequence ID when the VMA was last completely scanned.
 	 * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..44b5262b6657 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3164,7 +3164,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 	 * This is also done to avoid any side effect of task scanning
 	 * amplifying the unfairness of disjoint set of VMAs' access.
 	 */
-	if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
 		return true;
 
 	pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
@@ -3307,6 +3307,8 @@ retry_pids:
 			if (!vma->numab_state)
 				continue;
 
+			vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
 			vma->numab_state->next_scan = now +
 				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
-- 
cgit v1.2.3


From 2227a957e1d5b1941be4e4207879ec74f4bb37f8 Mon Sep 17 00:00:00 2001
From: Abel Wu <wuyun.abel@bytedance.com>
Date: Wed, 15 Nov 2023 11:36:45 +0800
Subject: sched/eevdf: Sort the rbtree by virtual deadline

Sort the task timeline by virtual deadline and keep the min_vruntime
in the augmented tree, so we can avoid doubling the worst case cost
and make full use of the cached leftmost node to enable O(1) fastpath
picking in next patch.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com
---
 include/linux/sched.h |   2 +-
 kernel/sched/debug.c  |  11 +++-
 kernel/sched/fair.c   | 168 ++++++++++++++++++++------------------------------
 kernel/sched/sched.h  |   1 +
 4 files changed, 77 insertions(+), 105 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..cd56d4018527 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -553,7 +553,7 @@ struct sched_entity {
 	struct load_weight		load;
 	struct rb_node			run_node;
 	u64				deadline;
-	u64				min_deadline;
+	u64				min_vruntime;
 
 	struct list_head		group_node;
 	unsigned int			on_rq;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4580a450700e..168eecc209b4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
-	struct sched_entity *last, *first;
+	s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
@@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(cfs_rq->exec_clock));
 
 	raw_spin_rq_lock_irqsave(rq, flags);
+	root = __pick_root_entity(cfs_rq);
+	if (root)
+		left_vruntime = root->min_vruntime;
 	first = __pick_first_entity(cfs_rq);
 	if (first)
-		left_vruntime = first->vruntime;
+		left_deadline = first->deadline;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		right_vruntime = last->vruntime;
 	min_vruntime = cfs_rq->min_vruntime;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
+			SPLIT_NS(left_deadline));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_vruntime",
 			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44b5262b6657..31bca05c3612 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -551,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 static inline bool entity_before(const struct sched_entity *a,
 				 const struct sched_entity *b)
 {
-	return (s64)(a->vruntime - b->vruntime) < 0;
+	/*
+	 * Tiebreak on vruntime seems unnecessary since it can
+	 * hardly happen.
+	 */
+	return (s64)(a->deadline - b->deadline) < 0;
 }
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -720,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
  *       to the loss in precision caused by the division.
  */
-int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	s64 avg = cfs_rq->avg_vruntime;
@@ -733,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		load += weight;
 	}
 
-	return avg >= entity_key(cfs_rq, se) * load;
+	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	return vruntime_eligible(cfs_rq, se->vruntime);
 }
 
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -752,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_first_entity(cfs_rq);
+	struct sched_entity *se = __pick_root_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
-
 	u64 vruntime = cfs_rq->min_vruntime;
 
 	if (curr) {
@@ -766,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 
 	if (se) {
 		if (!curr)
-			vruntime = se->vruntime;
+			vruntime = se->min_vruntime;
 		else
-			vruntime = min_vruntime(vruntime, se->vruntime);
+			vruntime = min_vruntime(vruntime, se->min_vruntime);
 	}
 
 	/* ensure we never gain time by being placed backwards. */
@@ -781,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 	return entity_before(__node_2_se(a), __node_2_se(b));
 }
 
-#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
 
-static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
 {
 	if (node) {
 		struct sched_entity *rse = __node_2_se(node);
-		if (deadline_gt(min_deadline, se, rse))
-			se->min_deadline = rse->min_deadline;
+		if (vruntime_gt(min_vruntime, se, rse))
+			se->min_vruntime = rse->min_vruntime;
 	}
 }
 
 /*
- * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
  */
-static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
 {
-	u64 old_min_deadline = se->min_deadline;
+	u64 old_min_vruntime = se->min_vruntime;
 	struct rb_node *node = &se->run_node;
 
-	se->min_deadline = se->deadline;
-	__update_min_deadline(se, node->rb_right);
-	__update_min_deadline(se, node->rb_left);
+	se->min_vruntime = se->vruntime;
+	__min_vruntime_update(se, node->rb_right);
+	__min_vruntime_update(se, node->rb_left);
 
-	return se->min_deadline == old_min_deadline;
+	return se->min_vruntime == old_min_vruntime;
 }
 
-RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
-		     run_node, min_deadline, min_deadline_update);
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+		     run_node, min_vruntime, min_vruntime_update);
 
 /*
  * Enqueue an entity into the rb-tree:
@@ -816,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	avg_vruntime_add(cfs_rq, se);
-	se->min_deadline = se->deadline;
+	se->min_vruntime = se->vruntime;
 	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-				__entity_less, &min_deadline_cb);
+				__entity_less, &min_vruntime_cb);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
-				  &min_deadline_cb);
+				  &min_vruntime_cb);
 	avg_vruntime_sub(cfs_rq, se);
 }
 
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+	if (!root)
+		return NULL;
+
+	return __node_2_se(root);
+}
+
 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
@@ -850,23 +868,28 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *     with the earliest virtual deadline.
  *
  * We can do this in O(log n) time due to an augmented RB-tree. The
- * tree keeps the entries sorted on service, but also functions as a
- * heap based on the deadline by keeping:
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
  *
- *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *  se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
  *
- * Which allows an EDF like search on (sub)trees.
+ * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
-	struct sched_entity *best_left = NULL;
+
+	/*
+	 * We can safely skip eligibility check if there is only one entity
+	 * in this cfs_rq, saving some cycles.
+	 */
+	if (cfs_rq->nr_running == 1)
+		return curr && curr->on_rq ? curr : __node_2_se(node);
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
-	best = curr;
 
 	/*
 	 * Once selected, run a task until it either becomes non-eligible or
@@ -875,95 +898,38 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
 		return curr;
 
+	/* Heap search for the EEVD entity */
 	while (node) {
 		struct sched_entity *se = __node_2_se(node);
+		struct rb_node *left = node->rb_left;
 
 		/*
-		 * If this entity is not eligible, try the left subtree.
+		 * Eligible entities in left subtree are always better
+		 * choices, since they have earlier deadlines.
 		 */
-		if (!entity_eligible(cfs_rq, se)) {
-			node = node->rb_left;
+		if (left && vruntime_eligible(cfs_rq,
+					__node_2_se(left)->min_vruntime)) {
+			node = left;
 			continue;
 		}
 
 		/*
-		 * Now we heap search eligible trees for the best (min_)deadline
+		 * The left subtree either is empty or has no eligible
+		 * entity, so check the current node since it is the one
+		 * with earliest deadline that might be eligible.
 		 */
-		if (!best || deadline_gt(deadline, best, se))
+		if (entity_eligible(cfs_rq, se)) {
 			best = se;
-
-		/*
-		 * Every se in a left branch is eligible, keep track of the
-		 * branch with the best min_deadline
-		 */
-		if (node->rb_left) {
-			struct sched_entity *left = __node_2_se(node->rb_left);
-
-			if (!best_left || deadline_gt(min_deadline, best_left, left))
-				best_left = left;
-
-			/*
-			 * min_deadline is in the left branch. rb_left and all
-			 * descendants are eligible, so immediately switch to the second
-			 * loop.
-			 */
-			if (left->min_deadline == se->min_deadline)
-				break;
-		}
-
-		/* min_deadline is at this node, no need to look right */
-		if (se->deadline == se->min_deadline)
 			break;
-
-		/* else min_deadline is in the right branch. */
-		node = node->rb_right;
-	}
-
-	/*
-	 * We ran into an eligible node which is itself the best.
-	 * (Or nr_running == 0 and both are NULL)
-	 */
-	if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
-		return best;
-
-	/*
-	 * Now best_left and all of its children are eligible, and we are just
-	 * looking for deadline == min_deadline
-	 */
-	node = &best_left->run_node;
-	while (node) {
-		struct sched_entity *se = __node_2_se(node);
-
-		/* min_deadline is the current node */
-		if (se->deadline == se->min_deadline)
-			return se;
-
-		/* min_deadline is in the left branch */
-		if (node->rb_left &&
-		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
-			node = node->rb_left;
-			continue;
 		}
 
-		/* else min_deadline is in the right branch */
 		node = node->rb_right;
 	}
-	return NULL;
-}
 
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *se = __pick_eevdf(cfs_rq);
-
-	if (!se) {
-		struct sched_entity *left = __pick_first_entity(cfs_rq);
-		if (left) {
-			pr_err("EEVDF scheduling fail, picking leftmost\n");
-			return left;
-		}
-	}
+	if (!best || (curr && entity_before(curr, best)))
+		best = curr;
 
-	return se;
+	return best;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e5a95486a42..539c7e763f15 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2822,6 +2822,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
 		    double_rq_lock(_T->lock, _T->lock2),
 		    double_rq_unlock(_T->lock, _T->lock2))
 
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
-- 
cgit v1.2.3


From ee4373dc902c0a403dd084b254ce70a78f95466f Mon Sep 17 00:00:00 2001
From: Abel Wu <wuyun.abel@bytedance.com>
Date: Wed, 15 Nov 2023 11:36:46 +0800
Subject: sched/eevdf: O(1) fastpath for task selection

Since the RB-tree is now sorted by deadline, let's first try the
leftmost entity which has the earliest virtual deadline. I've done
some benchmarks to see its effectiveness.

All the benchmarks are done inside a normal cpu cgroup in a clean
environment with cpu turbo disabled, on a dual-CPU Intel Xeon(R)
Platinum 8260 with 2 NUMA nodes each of which has 24C/48T.

  hackbench: process/thread + pipe/socket + 1/2/4/8 groups
  netperf:   TCP/UDP + STREAM/RR + 24/48/72/96/192 threads
  tbench:    loopback 24/48/72/96/192 threads
  schbench:  1/2/4/8 mthreads

  direct:    cfs_rq has only one entity
  parity:    RUN_TO_PARITY
  fast:      O(1) fastpath
  slow:	     heap search

    (%)		direct	parity	fast	slow
  hackbench	92.95	2.02	4.91	0.12
  netperf	68.08	6.60	24.18	1.14
  tbench	67.55	11.22	20.61	0.62
  schbench	69.91	2.65	25.73	1.71

The above results indicate that this fastpath really makes task
selection more efficient.

Signed-off-by: Abel Wu <wuyun.abel@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231115033647.80785-4-wuyun.abel@bytedance.com
---
 kernel/sched/fair.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 31bca05c3612..d3e045d80cab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -878,6 +878,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
 
@@ -886,7 +887,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 	 * in this cfs_rq, saving some cycles.
 	 */
 	if (cfs_rq->nr_running == 1)
-		return curr && curr->on_rq ? curr : __node_2_se(node);
+		return curr && curr->on_rq ? curr : se;
 
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
@@ -898,9 +899,14 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
 		return curr;
 
+	/* Pick the leftmost entity if it's eligible */
+	if (se && entity_eligible(cfs_rq, se)) {
+		best = se;
+		goto found;
+	}
+
 	/* Heap search for the EEVD entity */
 	while (node) {
-		struct sched_entity *se = __node_2_se(node);
 		struct rb_node *left = node->rb_left;
 
 		/*
@@ -913,6 +919,8 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 			continue;
 		}
 
+		se = __node_2_se(node);
+
 		/*
 		 * The left subtree either is empty or has no eligible
 		 * entity, so check the current node since it is the one
@@ -925,7 +933,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 
 		node = node->rb_right;
 	}
-
+found:
 	if (!best || (curr && entity_before(curr, best)))
 		best = curr;
 
-- 
cgit v1.2.3


From 5d69eca542ee17c618f9a55da52191d5e28b435f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:18 +0100
Subject: sched: Unify runtime accounting across classes

All classes use sched_entity::exec_start to track runtime and have
copies of the exact same code around to compute runtime.

Collapse all that.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/54d148a144f26d9559698c4dd82d8859038a7380.1699095159.git.bristot@kernel.org
---
 include/linux/sched.h    |  2 +-
 kernel/sched/deadline.c  | 15 +++----------
 kernel/sched/fair.c      | 57 ++++++++++++++++++++++++++++++++++++------------
 kernel/sched/rt.c        | 15 +++----------
 kernel/sched/sched.h     | 12 ++--------
 kernel/sched/stop_task.c | 13 +----------
 6 files changed, 53 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cd56d4018527..44b46d9743bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,7 +523,7 @@ struct sched_statistics {
 	u64				block_max;
 	s64				sum_block_runtime;
 
-	u64				exec_max;
+	s64				exec_max;
 	u64				slice_max;
 
 	u64				nr_migrations_cold;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b28114478b82..de79719c63c0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1275,9 +1275,8 @@ static void update_curr_dl(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_dl_entity *dl_se = &curr->dl;
-	u64 delta_exec, scaled_delta_exec;
+	s64 delta_exec, scaled_delta_exec;
 	int cpu = cpu_of(rq);
-	u64 now;
 
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
@@ -1290,21 +1289,13 @@ static void update_curr_dl(struct rq *rq)
 	 * natural solution, but the full ramifications of this
 	 * approach need further study.
 	 */
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0)) {
+	delta_exec = update_curr_common(rq);
+	if (unlikely(delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
 		return;
 	}
 
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	update_current_exec_runtime(curr, now, delta_exec);
-
 	if (dl_entity_is_special(dl_se))
 		return;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d3e045d80cab..11073cf00134 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1103,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_SMP */
 
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 {
-	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_clock_task(rq_of(cfs_rq));
-	u64 delta_exec;
-
-	if (unlikely(!curr))
-		return;
+	u64 now = rq_clock_task(rq);
+	s64 delta_exec;
 
 	delta_exec = now - curr->exec_start;
-	if (unlikely((s64)delta_exec <= 0))
-		return;
+	if (unlikely(delta_exec <= 0))
+		return delta_exec;
 
 	curr->exec_start = now;
+	curr->sum_exec_runtime += delta_exec;
 
 	if (schedstat_enabled()) {
 		struct sched_statistics *stats;
@@ -1129,8 +1123,43 @@ static void update_curr(struct cfs_rq *cfs_rq)
 				max(delta_exec, stats->exec_max));
 	}
 
-	curr->sum_exec_runtime += delta_exec;
-	schedstat_add(cfs_rq->exec_clock, delta_exec);
+	return delta_exec;
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	s64 delta_exec;
+
+	delta_exec = update_curr_se(rq, &curr->se);
+	if (unlikely(delta_exec <= 0))
+		return delta_exec;
+
+	trace_sched_stat_runtime(curr, delta_exec, 0);
+
+	account_group_exec_runtime(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
+
+	return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 delta_exec;
+
+	if (unlikely(!curr))
+		return;
+
+	delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+	if (unlikely(delta_exec <= 0))
+		return;
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
 	update_deadline(cfs_rq, curr);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6aaf0a3d6081..3261b067b67e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1002,24 +1002,15 @@ static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
-	u64 delta_exec;
-	u64 now;
+	s64 delta_exec;
 
 	if (curr->sched_class != &rt_sched_class)
 		return;
 
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec <= 0))
+	delta_exec = update_curr_common(rq);
+	if (unlikely(delta_exec <= 0))
 		return;
 
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
-	update_current_exec_runtime(curr, now, delta_exec);
-
 	if (!rt_bandwidth_enabled())
 		return;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 539c7e763f15..6703e9e81b1d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2212,6 +2212,8 @@ struct affinity_context {
 	unsigned int flags;
 };
 
+extern s64 update_curr_common(struct rq *rq);
+
 struct sched_class {
 
 #ifdef CONFIG_UCLAMP_TASK
@@ -3262,16 +3264,6 @@ extern int sched_dynamic_mode(const char *str);
 extern void sched_dynamic_update(int mode);
 #endif
 
-static inline void update_current_exec_runtime(struct task_struct *curr,
-						u64 now, u64 delta_exec)
-{
-	curr->se.sum_exec_runtime += delta_exec;
-	account_group_exec_runtime(curr, delta_exec);
-
-	curr->se.exec_start = now;
-	cgroup_account_cputime(curr, delta_exec);
-}
-
 #ifdef CONFIG_SCHED_MM_CID
 
 #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 6cf7304e6449..b1b8fe61c532 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
 
 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *curr = rq->curr;
-	u64 now, delta_exec;
-
-	now = rq_clock_task(rq);
-	delta_exec = now - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
-
-	schedstat_set(curr->stats.exec_max,
-		      max(curr->stats.exec_max, delta_exec));
-
-	update_current_exec_runtime(curr, now, delta_exec);
+	update_curr_common(rq);
 }
 
 /*
-- 
cgit v1.2.3


From 5fe6ec8f6ab549b6422e41551abb51802bd48bc7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 6 Nov 2023 13:41:43 +0100
Subject: sched: Remove vruntime from trace_sched_stat_runtime()

Tracing the runtime delta makes sense, observer can sum over time.
Tracing the absolute vruntime makes less sense, inconsistent:
absolute-vs-delta, but also vruntime delta can be computed from
runtime delta.

Removing the vruntime thing also makes the two tracepoint sites
identical, allowing to unify the code in a later patch.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/trace/events/sched.h | 15 ++++++---------
 kernel/sched/fair.c          |  5 ++---
 2 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 6188ad0d9e0d..dbb01b4b7451 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -493,33 +493,30 @@ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked,
  */
 DECLARE_EVENT_CLASS(sched_stat_runtime,
 
-	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
+	TP_PROTO(struct task_struct *tsk, u64 runtime),
 
-	TP_ARGS(tsk, __perf_count(runtime), vruntime),
+	TP_ARGS(tsk, __perf_count(runtime)),
 
 	TP_STRUCT__entry(
 		__array( char,	comm,	TASK_COMM_LEN	)
 		__field( pid_t,	pid			)
 		__field( u64,	runtime			)
-		__field( u64,	vruntime			)
 	),
 
 	TP_fast_assign(
 		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
 		__entry->pid		= tsk->pid;
 		__entry->runtime	= runtime;
-		__entry->vruntime	= vruntime;
 	),
 
-	TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
+	TP_printk("comm=%s pid=%d runtime=%Lu [ns]",
 			__entry->comm, __entry->pid,
-			(unsigned long long)__entry->runtime,
-			(unsigned long long)__entry->vruntime)
+			(unsigned long long)__entry->runtime)
 );
 
 DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
-	     TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
-	     TP_ARGS(tsk, runtime, vruntime));
+	     TP_PROTO(struct task_struct *tsk, u64 runtime),
+	     TP_ARGS(tsk, runtime));
 
 /*
  * Tracepoint for showing priority inheritance modifying a tasks
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11073cf00134..33db70c6b582 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1138,8 +1138,7 @@ s64 update_curr_common(struct rq *rq)
 	if (unlikely(delta_exec <= 0))
 		return delta_exec;
 
-	trace_sched_stat_runtime(curr, delta_exec, 0);
-
+	trace_sched_stat_runtime(curr, delta_exec);
 	account_group_exec_runtime(curr, delta_exec);
 	cgroup_account_cputime(curr, delta_exec);
 
@@ -1168,7 +1167,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
-		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
+		trace_sched_stat_runtime(curtask, delta_exec);
 		cgroup_account_cputime(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
-- 
cgit v1.2.3


From c708a4dc5ab547edc3d6537233ca9e79ea30ce47 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 6 Nov 2023 14:04:01 +0100
Subject: sched: Unify more update_curr*()

Now that trace_sched_stat_runtime() no longer takes a vruntime
argument, the task specific bits are identical between
update_curr_common() and update_curr().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33db70c6b582..1cd92b11b289 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1126,6 +1126,13 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 	return delta_exec;
 }
 
+static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
+{
+	trace_sched_stat_runtime(p, delta_exec);
+	account_group_exec_runtime(p, delta_exec);
+	cgroup_account_cputime(p, delta_exec);
+}
+
 /*
  * Used by other classes to account runtime.
  */
@@ -1135,12 +1142,8 @@ s64 update_curr_common(struct rq *rq)
 	s64 delta_exec;
 
 	delta_exec = update_curr_se(rq, &curr->se);
-	if (unlikely(delta_exec <= 0))
-		return delta_exec;
-
-	trace_sched_stat_runtime(curr, delta_exec);
-	account_group_exec_runtime(curr, delta_exec);
-	cgroup_account_cputime(curr, delta_exec);
+	if (likely(delta_exec > 0))
+		update_curr_task(curr, delta_exec);
 
 	return delta_exec;
 }
@@ -1164,13 +1167,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	update_deadline(cfs_rq, curr);
 	update_min_vruntime(cfs_rq);
 
-	if (entity_is_task(curr)) {
-		struct task_struct *curtask = task_of(curr);
-
-		trace_sched_stat_runtime(curtask, delta_exec);
-		cgroup_account_cputime(curtask, delta_exec);
-		account_group_exec_runtime(curtask, delta_exec);
-	}
+	if (entity_is_task(curr))
+		update_curr_task(task_of(curr), delta_exec);
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
-- 
cgit v1.2.3


From 9e07d45c5210f5dd6701c00d55791983db7320fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:19 +0100
Subject: sched/deadline: Collect sched_dl_entity initialization

Create a single function that initializes a sched_dl_entity.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/51acc695eecf0a1a2f78f9a044e11ffd9b316bcf.1699095159.git.bristot@kernel.org
---
 kernel/sched/core.c     |  5 +----
 kernel/sched/deadline.c | 22 +++++++++++++++-------
 kernel/sched/sched.h    |  5 +----
 3 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9d5099d02dbc..966631f05d71 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4511,10 +4511,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	memset(&p->stats, 0, sizeof(p->stats));
 #endif
 
-	RB_CLEAR_NODE(&p->dl.rb_node);
-	init_dl_task_timer(&p->dl);
-	init_dl_inactive_task_timer(&p->dl);
-	__dl_clear_params(p);
+	init_dl_entity(&p->dl);
 
 	INIT_LIST_HEAD(&p->rt.run_list);
 	p->rt.timeout		= 0;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index de79719c63c0..e80bb884262d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -335,6 +335,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 	__add_rq_bw(new_bw, &rq->dl);
 }
 
+static void __dl_clear_params(struct sched_dl_entity *dl_se);
+
 /*
  * The utilization of a task cannot be immediately removed from
  * the rq active utilization (running_bw) when the task blocks.
@@ -434,7 +436,7 @@ static void task_non_contending(struct task_struct *p)
 			raw_spin_lock(&dl_b->lock);
 			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 			raw_spin_unlock(&dl_b->lock);
-			__dl_clear_params(p);
+			__dl_clear_params(dl_se);
 		}
 
 		return;
@@ -1183,7 +1185,7 @@ unlock:
 	return HRTIMER_NORESTART;
 }
 
-void init_dl_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_task_timer(struct sched_dl_entity *dl_se)
 {
 	struct hrtimer *timer = &dl_se->dl_timer;
 
@@ -1389,7 +1391,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 		raw_spin_lock(&dl_b->lock);
 		__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&dl_b->lock);
-		__dl_clear_params(p);
+		__dl_clear_params(dl_se);
 
 		goto unlock;
 	}
@@ -1405,7 +1407,7 @@ unlock:
 	return HRTIMER_NORESTART;
 }
 
-void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
 
@@ -2957,10 +2959,8 @@ bool __checkparam_dl(const struct sched_attr *attr)
 /*
  * This function clears the sched_dl_entity static params.
  */
-void __dl_clear_params(struct task_struct *p)
+static void __dl_clear_params(struct sched_dl_entity *dl_se)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
-
 	dl_se->dl_runtime		= 0;
 	dl_se->dl_deadline		= 0;
 	dl_se->dl_period		= 0;
@@ -2978,6 +2978,14 @@ void __dl_clear_params(struct task_struct *p)
 #endif
 }
 
+void init_dl_entity(struct sched_dl_entity *dl_se)
+{
+	RB_CLEAR_NODE(&dl_se->rb_node);
+	init_dl_task_timer(dl_se);
+	init_dl_inactive_task_timer(dl_se);
+	__dl_clear_params(dl_se);
+}
+
 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6703e9e81b1d..3c62df1511e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -273,8 +273,6 @@ struct rt_bandwidth {
 	unsigned int		rt_period_active;
 };
 
-void __dl_clear_params(struct task_struct *p);
-
 static inline int dl_bandwidth_enabled(void)
 {
 	return sysctl_sched_rt_runtime >= 0;
@@ -2427,8 +2425,7 @@ extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 
-extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_entity(struct sched_dl_entity *dl_se);
 
 #define BW_SHIFT		20
 #define BW_UNIT			(1 << BW_SHIFT)
-- 
cgit v1.2.3


From 2f7a0f58948d8231236e2facecc500f1930fb996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:20 +0100
Subject: sched/deadline: Move bandwidth accounting into {en,de}queue_dl_entity

In preparation of introducing !task sched_dl_entity; move the
bandwidth accounting into {en.de}queue_dl_entity().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Link: https://lkml.kernel.org/r/a86dccbbe44e021b8771627e1dae01a69b73466d.1699095159.git.bristot@kernel.org
---
 kernel/sched/deadline.c | 130 +++++++++++++++++++++++++++---------------------
 kernel/sched/sched.h    |   6 +++
 2 files changed, 78 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e80bb884262d..81810f67df7a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -391,12 +391,12 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
  * up, and checks if the task is still in the "ACTIVE non contending"
  * state or not (in the second case, it updates running_bw).
  */
-static void task_non_contending(struct task_struct *p)
+static void task_non_contending(struct sched_dl_entity *dl_se)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
 	struct hrtimer *timer = &dl_se->inactive_timer;
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct task_struct *p = dl_task_of(dl_se);
 	s64 zerolag_time;
 
 	/*
@@ -428,13 +428,14 @@ static void task_non_contending(struct task_struct *p)
 	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
 		if (dl_task(p))
 			sub_running_bw(dl_se, dl_rq);
+
 		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
 			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 			if (READ_ONCE(p->__state) == TASK_DEAD)
-				sub_rq_bw(&p->dl, &rq->dl);
+				sub_rq_bw(dl_se, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
-			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+			__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
 			raw_spin_unlock(&dl_b->lock);
 			__dl_clear_params(dl_se);
 		}
@@ -1601,6 +1602,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 
 	update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
 
+	/*
+	 * Check if a constrained deadline task was activated
+	 * after the deadline but before the next period.
+	 * If that is the case, the task will be throttled and
+	 * the replenishment timer will be set to the next period.
+	 */
+	if (!dl_se->dl_throttled && !dl_is_implicit(dl_se))
+		dl_check_constrained_dl(dl_se);
+
+	if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) {
+		struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+		add_rq_bw(dl_se, dl_rq);
+		add_running_bw(dl_se, dl_rq);
+	}
+
+	/*
+	 * If p is throttled, we do not enqueue it. In fact, if it exhausted
+	 * its budget it needs a replenishment and, since it now is on
+	 * its rq, the bandwidth timer callback (which clearly has not
+	 * run yet) will take care of this.
+	 * However, the active utilization does not depend on the fact
+	 * that the task is on the runqueue or not (but depends on the
+	 * task's state - in GRUB parlance, "inactive" vs "active contending").
+	 * In other words, even if a task is throttled its utilization must
+	 * be counted in the active utilization; hence, we need to call
+	 * add_running_bw().
+	 */
+	if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+		if (flags & ENQUEUE_WAKEUP)
+			task_contending(dl_se, flags);
+
+		return;
+	}
+
 	/*
 	 * If this is a wakeup or a new instance, the scheduling
 	 * parameters of the task might need updating. Otherwise,
@@ -1620,9 +1656,28 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	__enqueue_dl_entity(dl_se);
 }
 
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 {
 	__dequeue_dl_entity(dl_se);
+
+	if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) {
+		struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+		sub_running_bw(dl_se, dl_rq);
+		sub_rq_bw(dl_se, dl_rq);
+	}
+
+	/*
+	 * This check allows to start the inactive timer (or to immediately
+	 * decrease the active utilization, if needed) in two cases:
+	 * when the task blocks and when it is terminating
+	 * (p->state == TASK_DEAD). We can handle the two cases in the same
+	 * way, because from GRUB's point of view the same thing is happening
+	 * (the task moves from "active contending" to "active non contending"
+	 * or "inactive")
+	 */
+	if (flags & DEQUEUE_SLEEP)
+		task_non_contending(dl_se);
 }
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1667,76 +1722,35 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 		return;
 	}
 
-	/*
-	 * Check if a constrained deadline task was activated
-	 * after the deadline but before the next period.
-	 * If that is the case, the task will be throttled and
-	 * the replenishment timer will be set to the next period.
-	 */
-	if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
-		dl_check_constrained_dl(&p->dl);
-
-	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
-		add_rq_bw(&p->dl, &rq->dl);
-		add_running_bw(&p->dl, &rq->dl);
-	}
-
-	/*
-	 * If p is throttled, we do not enqueue it. In fact, if it exhausted
-	 * its budget it needs a replenishment and, since it now is on
-	 * its rq, the bandwidth timer callback (which clearly has not
-	 * run yet) will take care of this.
-	 * However, the active utilization does not depend on the fact
-	 * that the task is on the runqueue or not (but depends on the
-	 * task's state - in GRUB parlance, "inactive" vs "active contending").
-	 * In other words, even if a task is throttled its utilization must
-	 * be counted in the active utilization; hence, we need to call
-	 * add_running_bw().
-	 */
-	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
-		if (flags & ENQUEUE_WAKEUP)
-			task_contending(&p->dl, flags);
-
-		return;
-	}
-
 	check_schedstat_required();
 	update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
 
+	if (p->on_rq == TASK_ON_RQ_MIGRATING)
+		flags |= ENQUEUE_MIGRATING;
+
 	enqueue_dl_entity(&p->dl, flags);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
-	dequeue_dl_entity(&p->dl);
-	dequeue_pushable_dl_task(rq, p);
+	dequeue_dl_entity(&p->dl, flags);
+
+	if (!p->dl.dl_throttled)
+		dequeue_pushable_dl_task(rq, p);
 }
 
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_curr_dl(rq);
-	__dequeue_task_dl(rq, p, flags);
 
-	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
-		sub_running_bw(&p->dl, &rq->dl);
-		sub_rq_bw(&p->dl, &rq->dl);
-	}
+	if (p->on_rq == TASK_ON_RQ_MIGRATING)
+		flags |= DEQUEUE_MIGRATING;
 
-	/*
-	 * This check allows to start the inactive timer (or to immediately
-	 * decrease the active utilization, if needed) in two cases:
-	 * when the task blocks and when it is terminating
-	 * (p->state == TASK_DEAD). We can handle the two cases in the same
-	 * way, because from GRUB's point of view the same thing is happening
-	 * (the task moves from "active contending" to "active non contending"
-	 * or "inactive")
-	 */
-	if (flags & DEQUEUE_SLEEP)
-		task_non_contending(p);
+	__dequeue_task_dl(rq, p, flags);
 }
 
 /*
@@ -2551,7 +2565,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	 * will reset the task parameters.
 	 */
 	if (task_on_rq_queued(p) && p->dl.dl_runtime)
-		task_non_contending(p);
+		task_non_contending(&p->dl);
 
 	/*
 	 * In case a task is setscheduled out from SCHED_DEADLINE we need to
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3c62df1511e7..1cda787172f0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2177,6 +2177,10 @@ extern const u32		sched_prio_to_wmult[40];
  * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
  *        in the runqueue.
  *
+ * NOCLOCK - skip the update_rq_clock() (avoids double updates)
+ *
+ * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
+ *
  * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
  * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
  * ENQUEUE_MIGRATED  - the task was migrated during wakeup
@@ -2187,6 +2191,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2201,6 +2206,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define ENQUEUE_MIGRATED	0x00
 #endif
 #define ENQUEUE_INITIAL		0x80
+#define ENQUEUE_MIGRATING	0x100
 
 #define RETRY_TASK		((void *)-1UL)
 
-- 
cgit v1.2.3


From 63ba8422f876e32ee564ea95da9a7313b13ff0a1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 4 Nov 2023 11:59:21 +0100
Subject: sched/deadline: Introduce deadline servers

Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks
with higher priority (e.g., SCHED_FIFO) monopolize CPU(s).

RT Throttling has been introduced a while ago as a (mostly debug)
countermeasure one can utilize to reserve some CPU time for low priority
tasks (usually background type of work, e.g. workqueues, timers, etc.).
It however has its own problems (see documentation) and the undesired
effect of unconditionally throttling FIFO tasks even when no lower
priority activity needs to run (there are mechanisms to fix this issue
as well, but, again, with their own problems).

Introduce deadline servers to service low priority tasks needs under
starvation conditions. Deadline servers are built extending SCHED_DEADLINE
implementation to allow 2-level scheduling (a sched_deadline entity
becomes a container for lower priority scheduling entities).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/4968601859d920335cf85822eb573a5f179f04b8.1699095159.git.bristot@kernel.org
---
 include/linux/sched.h   |  22 +++-
 kernel/sched/core.c     |  17 +++
 kernel/sched/deadline.c | 332 ++++++++++++++++++++++++++++++++----------------
 kernel/sched/fair.c     |   2 +
 kernel/sched/sched.h    |  27 ++++
 5 files changed, 292 insertions(+), 108 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44b46d9743bf..8d258162deb0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,11 +63,13 @@ struct robust_list_head;
 struct root_domain;
 struct rq;
 struct sched_attr;
+struct sched_dl_entity;
 struct seq_file;
 struct sighand_struct;
 struct signal_struct;
 struct task_delay_info;
 struct task_group;
+struct task_struct;
 struct user_event_mm;
 
 /*
@@ -607,6 +609,9 @@ struct sched_rt_entity {
 #endif
 } __randomize_layout;
 
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
 struct sched_dl_entity {
 	struct rb_node			rb_node;
 
@@ -654,6 +659,7 @@ struct sched_dl_entity {
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
+	unsigned int			dl_server         : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -668,7 +674,20 @@ struct sched_dl_entity {
 	 * timer is needed to decrease the active utilization at the correct
 	 * time.
 	 */
-	struct hrtimer inactive_timer;
+	struct hrtimer			inactive_timer;
+
+	/*
+	 * Bits for DL-server functionality. Also see the comment near
+	 * dl_server_update().
+	 *
+	 * @rq the runqueue this server is for
+	 *
+	 * @server_has_tasks() returns true if @server_pick return a
+	 * runnable task.
+	 */
+	struct rq			*rq;
+	dl_server_has_tasks_f		server_has_tasks;
+	dl_server_pick_f		server_pick;
 
 #ifdef CONFIG_RT_MUTEXES
 	/*
@@ -795,6 +814,7 @@ struct task_struct {
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
 	struct sched_dl_entity		dl;
+	struct sched_dl_entity		*dl_server;
 	const struct sched_class	*sched_class;
 
 #ifdef CONFIG_SCHED_CORE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 966631f05d71..f5f4495d1768 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3797,6 +3797,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		rq->idle_stamp = 0;
 	}
 #endif
+
+	p->dl_server = NULL;
 }
 
 /*
@@ -6003,12 +6005,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			p = pick_next_task_idle(rq);
 		}
 
+		/*
+		 * This is the fast path; it cannot be a DL server pick;
+		 * therefore even if @p == @prev, ->dl_server must be NULL.
+		 */
+		if (p->dl_server)
+			p->dl_server = NULL;
+
 		return p;
 	}
 
 restart:
 	put_prev_task_balance(rq, prev, rf);
 
+	/*
+	 * We've updated @prev and no longer need the server link, clear it.
+	 * Must be done before ->pick_next_task() because that can (re)set
+	 * ->dl_server.
+	 */
+	if (prev->dl_server)
+		prev->dl_server = NULL;
+
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
 		if (p)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 81810f67df7a..a04a436af8cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void)
 late_initcall(sched_dl_sysctl_init);
 #endif
 
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_server;
+}
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
+	BUG_ON(dl_server(dl_se));
 	return container_of(dl_se, struct task_struct, dl);
 }
 
@@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
 	return container_of(dl_rq, struct rq, dl);
 }
 
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
 {
-	struct task_struct *p = dl_task_of(dl_se);
-	struct rq *rq = task_rq(p);
+	struct rq *rq = dl_se->rq;
+
+	if (!dl_server(dl_se))
+		rq = task_rq(dl_task_of(dl_se));
 
-	return &rq->dl;
+	return rq;
+}
+
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+	return &rq_of_dl_se(dl_se)->dl;
 }
 
 static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -394,9 +407,8 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se);
 static void task_non_contending(struct sched_dl_entity *dl_se)
 {
 	struct hrtimer *timer = &dl_se->inactive_timer;
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct rq *rq = rq_of_dl_se(dl_se);
+	struct dl_rq *dl_rq = &rq->dl;
 	s64 zerolag_time;
 
 	/*
@@ -426,25 +438,33 @@ static void task_non_contending(struct sched_dl_entity *dl_se)
 	 * utilization now, instead of starting a timer
 	 */
 	if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
-		if (dl_task(p))
+		if (dl_server(dl_se)) {
 			sub_running_bw(dl_se, dl_rq);
+		} else {
+			struct task_struct *p = dl_task_of(dl_se);
 
-		if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
-			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+			if (dl_task(p))
+				sub_running_bw(dl_se, dl_rq);
 
-			if (READ_ONCE(p->__state) == TASK_DEAD)
-				sub_rq_bw(dl_se, &rq->dl);
-			raw_spin_lock(&dl_b->lock);
-			__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
-			raw_spin_unlock(&dl_b->lock);
-			__dl_clear_params(dl_se);
+			if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+				struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+				if (READ_ONCE(p->__state) == TASK_DEAD)
+					sub_rq_bw(dl_se, &rq->dl);
+				raw_spin_lock(&dl_b->lock);
+				__dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+				raw_spin_unlock(&dl_b->lock);
+				__dl_clear_params(dl_se);
+			}
 		}
 
 		return;
 	}
 
 	dl_se->dl_non_contending = 1;
-	get_task_struct(p);
+	if (!dl_server(dl_se))
+		get_task_struct(dl_task_of(dl_se));
+
 	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
 }
 
@@ -471,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 		 * will not touch the rq's active utilization,
 		 * so we are still safe.
 		 */
-		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
-			put_task_struct(dl_task_of(dl_se));
+		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+			if (!dl_server(dl_se))
+				put_task_struct(dl_task_of(dl_se));
+		}
 	} else {
 		/*
 		 * Since "dl_non_contending" is not set, the
@@ -485,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 	}
 }
 
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
-
 	return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
 }
 
@@ -740,8 +760,10 @@ static inline void deadline_queue_pull_task(struct rq *rq)
 }
 #endif /* CONFIG_SMP */
 
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
 
 static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
@@ -989,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
  */
 static void update_dl_entity(struct sched_dl_entity *dl_se)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct rq *rq = rq_of_dl_se(dl_se);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
 	    dl_entity_overflow(dl_se, rq_clock(rq))) {
@@ -1021,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
 	struct hrtimer *timer = &dl_se->dl_timer;
-	struct rq *rq = task_rq(p);
+	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 	ktime_t now, act;
 	s64 delta;
 
@@ -1059,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p)
 	 * and observe our state.
 	 */
 	if (!hrtimer_is_queued(timer)) {
-		get_task_struct(p);
+		if (!dl_server(dl_se))
+			get_task_struct(dl_task_of(dl_se));
 		hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
 	}
 
 	return 1;
 }
 
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
+	 */
+	if (has_pushable_dl_tasks(rq)) {
+		/*
+		 * Nothing relies on rq->lock after this, so its safe to drop
+		 * rq->lock.
+		 */
+		rq_unpin_lock(rq, rf);
+		push_dl_task(rq);
+		rq_repin_lock(rq, rf);
+	}
+#endif
+}
+
 /*
  * This is the bandwidth enforcement timer callback. If here, we know
  * a task is not on its dl_rq, since the fact that the timer was running
@@ -1084,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     dl_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p;
 	struct rq_flags rf;
 	struct rq *rq;
 
+	if (dl_server(dl_se)) {
+		struct rq *rq = rq_of_dl_se(dl_se);
+		struct rq_flags rf;
+
+		rq_lock(rq, &rf);
+		if (dl_se->dl_throttled) {
+			sched_clock_tick();
+			update_rq_clock(rq);
+
+			if (dl_se->server_has_tasks(dl_se)) {
+				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+				resched_curr(rq);
+				__push_dl_task(rq, &rf);
+			} else {
+				replenish_dl_entity(dl_se);
+			}
+
+		}
+		rq_unlock(rq, &rf);
+
+		return HRTIMER_NORESTART;
+	}
+
+	p = dl_task_of(dl_se);
 	rq = task_rq_lock(p, &rf);
 
 	/*
@@ -1158,21 +1223,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	else
 		resched_curr(rq);
 
-#ifdef CONFIG_SMP
-	/*
-	 * Queueing this task back might have overloaded rq, check if we need
-	 * to kick someone away.
-	 */
-	if (has_pushable_dl_tasks(rq)) {
-		/*
-		 * Nothing relies on rq->lock after this, so its safe to drop
-		 * rq->lock.
-		 */
-		rq_unpin_lock(rq, &rf);
-		push_dl_task(rq);
-		rq_repin_lock(rq, &rf);
-	}
-#endif
+	__push_dl_task(rq, &rf);
 
 unlock:
 	task_rq_unlock(rq, p, &rf);
@@ -1214,12 +1265,11 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se)
  */
 static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
 {
-	struct task_struct *p = dl_task_of(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+	struct rq *rq = rq_of_dl_se(dl_se);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
 	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
 			return;
 		dl_se->dl_throttled = 1;
 		if (dl_se->runtime > 0)
@@ -1270,29 +1320,13 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 	return (delta * u_act) >> BW_SHIFT;
 }
 
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+                        int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
 {
-	struct task_struct *curr = rq->curr;
-	struct sched_dl_entity *dl_se = &curr->dl;
-	s64 delta_exec, scaled_delta_exec;
-	int cpu = cpu_of(rq);
-
-	if (!dl_task(curr) || !on_dl_rq(dl_se))
-		return;
+	s64 scaled_delta_exec;
 
-	/*
-	 * Consumed budget is computed considering the time as
-	 * observed by schedulable tasks (excluding time spent
-	 * in hardirq context, etc.). Deadlines are instead
-	 * computed using hard walltime. This seems to be the more
-	 * natural solution, but the full ramifications of this
-	 * approach need further study.
-	 */
-	delta_exec = update_curr_common(rq);
 	if (unlikely(delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
@@ -1310,10 +1344,9 @@ static void update_curr_dl(struct rq *rq)
 	 * according to current frequency and CPU maximum capacity.
 	 */
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
-		scaled_delta_exec = grub_reclaim(delta_exec,
-						 rq,
-						 &curr->dl);
+		scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
 	} else {
+		int cpu = cpu_of(rq);
 		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
 		unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
 
@@ -1332,11 +1365,20 @@ throttle:
 		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
 			dl_se->dl_overrun = 1;
 
-		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
-			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+		dequeue_dl_entity(dl_se, 0);
+		if (!dl_server(dl_se)) {
+			update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+			dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+		}
 
-		if (!is_leftmost(curr, &rq->dl))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+			if (dl_server(dl_se))
+				enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+			else
+				enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+		}
+
+		if (!is_leftmost(dl_se, &rq->dl))
 			resched_curr(rq);
 	}
 
@@ -1366,20 +1408,82 @@ throttle:
 	}
 }
 
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+	update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+	if (!dl_server(dl_se)) {
+		dl_se->dl_server = 1;
+		setup_new_dl_entity(dl_se);
+	}
+	enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+	dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+		    dl_server_has_tasks_f has_tasks,
+		    dl_server_pick_f pick)
+{
+	dl_se->rq = rq;
+	dl_se->server_has_tasks = has_tasks;
+	dl_se->server_pick = pick;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_dl_entity *dl_se = &curr->dl;
+	s64 delta_exec;
+
+	if (!dl_task(curr) || !on_dl_rq(dl_se))
+		return;
+
+	/*
+	 * Consumed budget is computed considering the time as
+	 * observed by schedulable tasks (excluding time spent
+	 * in hardirq context, etc.). Deadlines are instead
+	 * computed using hard walltime. This seems to be the more
+	 * natural solution, but the full ramifications of this
+	 * approach need further study.
+	 */
+	delta_exec = update_curr_common(rq);
+	update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
 static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 {
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     inactive_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p = NULL;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(p, &rf);
+	if (!dl_server(dl_se)) {
+		p = dl_task_of(dl_se);
+		rq = task_rq_lock(p, &rf);
+	} else {
+		rq = dl_se->rq;
+		rq_lock(rq, &rf);
+	}
 
 	sched_clock_tick();
 	update_rq_clock(rq);
 
+	if (dl_server(dl_se))
+		goto no_task;
+
 	if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
 		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
@@ -1396,14 +1500,21 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 
 		goto unlock;
 	}
+
+no_task:
 	if (dl_se->dl_non_contending == 0)
 		goto unlock;
 
 	sub_running_bw(dl_se, &rq->dl);
 	dl_se->dl_non_contending = 0;
 unlock:
-	task_rq_unlock(rq, p, &rf);
-	put_task_struct(p);
+
+	if (!dl_server(dl_se)) {
+		task_rq_unlock(rq, p, &rf);
+		put_task_struct(p);
+	} else {
+		rq_unlock(rq, &rf);
+	}
 
 	return HRTIMER_NORESTART;
 }
@@ -1466,10 +1577,8 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
 static inline
 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
 	u64 deadline = dl_se->deadline;
 
-	WARN_ON(!dl_prio(prio));
 	dl_rq->dl_nr_running++;
 	add_nr_running(rq_of_dl_rq(dl_rq), 1);
 
@@ -1479,9 +1588,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 static inline
 void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
-
-	WARN_ON(!dl_prio(prio));
 	WARN_ON(!dl_rq->dl_nr_running);
 	dl_rq->dl_nr_running--;
 	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
@@ -1648,8 +1754,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se);
 	} else if ((flags & ENQUEUE_RESTORE) &&
-		  dl_time_before(dl_se->deadline,
-				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+		   dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
 		setup_new_dl_entity(dl_se);
 	}
 
@@ -1730,19 +1835,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_dl_entity(&p->dl, flags);
 
+	if (dl_server(&p->dl))
+		return;
+
 	if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
-	update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
-	dequeue_dl_entity(&p->dl, flags);
-
-	if (!p->dl.dl_throttled)
-		dequeue_pushable_dl_task(rq, p);
-}
-
 static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_curr_dl(rq);
@@ -1750,7 +1849,9 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	if (p->on_rq == TASK_ON_RQ_MIGRATING)
 		flags |= DEQUEUE_MIGRATING;
 
-	__dequeue_task_dl(rq, p, flags);
+	dequeue_dl_entity(&p->dl, flags);
+	if (!p->dl.dl_throttled && !dl_server(&p->dl))
+		dequeue_pushable_dl_task(rq, p);
 }
 
 /*
@@ -1940,12 +2041,12 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
-	hrtick_start(rq, p->dl.runtime);
+	hrtick_start(rq, dl_se->runtime);
 }
 #else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
 }
 #endif
@@ -1965,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
 	if (!first)
 		return;
 
-	if (hrtick_enabled_dl(rq))
-		start_hrtick_dl(rq, p);
-
 	if (rq->curr->sched_class != &dl_sched_class)
 		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
@@ -1990,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq)
 	struct dl_rq *dl_rq = &rq->dl;
 	struct task_struct *p;
 
+again:
 	if (!sched_dl_runnable(rq))
 		return NULL;
 
 	dl_se = pick_next_dl_entity(dl_rq);
 	WARN_ON_ONCE(!dl_se);
-	p = dl_task_of(dl_se);
+
+	if (dl_server(dl_se)) {
+		p = dl_se->server_pick(dl_se);
+		if (!p) {
+			WARN_ON_ONCE(1);
+			dl_se->dl_yielded = 1;
+			update_curr_dl_se(rq, dl_se, 0);
+			goto again;
+		}
+		p->dl_server = dl_se;
+	} else {
+		p = dl_task_of(dl_se);
+	}
 
 	return p;
 }
@@ -2005,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
 	struct task_struct *p;
 
 	p = pick_task_dl(rq);
-	if (p)
+	if (!p)
+		return p;
+
+	if (!p->dl_server)
 		set_next_task_dl(rq, p, true);
 
+	if (hrtick_enabled(rq))
+		start_hrtick_dl(rq, &p->dl);
+
 	return p;
 }
 
@@ -2045,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 	 * be set and schedule() will start a new hrtick for the next task.
 	 */
 	if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
-	    is_leftmost(p, &rq->dl))
-		start_hrtick_dl(rq, p);
+	    is_leftmost(&p->dl, &rq->dl))
+		start_hrtick_dl(rq, &p->dl);
 }
 
 static void task_fork_dl(struct task_struct *p)
@@ -2986,6 +3103,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se)
 	dl_se->dl_yielded		= 0;
 	dl_se->dl_non_contending	= 0;
 	dl_se->dl_overrun		= 0;
+	dl_se->dl_server		= 0;
 
 #ifdef CONFIG_RT_MUTEXES
 	dl_se->pi_se			= dl_se;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1cd92b11b289..07f555857698 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1131,6 +1131,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
 	trace_sched_stat_runtime(p, delta_exec);
 	account_group_exec_runtime(p, delta_exec);
 	cgroup_account_cputime(p, delta_exec);
+	if (p->dl_server)
+		dl_server_update(p->dl_server, delta_exec);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1cda787172f0..8a70d51ffa33 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -313,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 extern int  dl_bw_check_overflow(int cpu);
 
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ *   dl_se::rq -- runqueue we belong to.
+ *
+ *   dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ *                                server when it runs out of tasks to run.
+ *
+ *   dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ *                           returns NULL.
+ *
+ *   dl_server_update() -- called from update_curr_common(), propagates runtime
+ *                         to the server.
+ *
+ *   dl_server_start()
+ *   dl_server_stop()  -- start/stop the server when it has (no) tasks.
+ *
+ *   dl_server_init() -- initializes the server.
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+		    dl_server_has_tasks_f has_tasks,
+		    dl_server_pick_f pick);
+
 #ifdef CONFIG_CGROUP_SCHED
 
 struct cfs_rq;
-- 
cgit v1.2.3


From dd5403869a40595eb953f12e8cd2bb57bb88bb67 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 14 Nov 2023 14:38:39 -0500
Subject: sched/cpuidle: Comment about timers requirements VS idle handler

Add missing explanation concerning IRQs re-enablement constraints in
the cpuidle path against timers.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lkml.kernel.org/r/20231114193840.4041-2-frederic@kernel.org
---
 kernel/sched/idle.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 565f8374ddbb..31231925f1ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -258,6 +258,36 @@ static void do_idle(void)
 	while (!need_resched()) {
 		rmb();
 
+		/*
+		 * Interrupts shouldn't be re-enabled from that point on until
+		 * the CPU sleeping instruction is reached. Otherwise an interrupt
+		 * may fire and queue a timer that would be ignored until the CPU
+		 * wakes from the sleeping instruction. And testing need_resched()
+		 * doesn't tell about pending needed timer reprogram.
+		 *
+		 * Several cases to consider:
+		 *
+		 * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+		 *   "wfi" or "mwait" are fine because they can be entered with
+		 *   interrupt disabled.
+		 *
+		 * - sti;mwait() couple is fine because the interrupts are
+		 *   re-enabled only upon the execution of mwait, leaving no gap
+		 *   in-between.
+		 *
+		 * - ROLLBACK based idle handlers with the sleeping instruction
+		 *   called with interrupts enabled are NOT fine. In this scheme
+		 *   when the interrupt detects it has interrupted an idle handler,
+		 *   it rolls back to its beginning which performs the
+		 *   need_resched() check before re-executing the sleeping
+		 *   instruction. This can leak a pending needed timer reprogram.
+		 *   If such a scheme is really mandatory due to the lack of an
+		 *   appropriate CPU sleeping instruction, then a FAST-FORWARD
+		 *   must instead be applied: when the interrupt detects it has
+		 *   interrupted an idle handler, it must resume to the end of
+		 *   this idle handler so that the generic idle loop is iterated
+		 *   again to reprogram the tick.
+		 */
 		local_irq_disable();
 
 		if (cpu_is_offline(cpu)) {
-- 
cgit v1.2.3


From 194600008d5c43b5a4ba98c4b81633397e34ffad Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 14 Nov 2023 14:38:40 -0500
Subject: sched/timers: Explain why idle task schedules out on remote timer
 enqueue

Trying to avoid that didn't bring much value after testing, add comment
about this.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lkml.kernel.org/r/20231114193840.4041-3-frederic@kernel.org
---
 kernel/sched/core.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5f4495d1768..2de77a6d5ef8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1131,6 +1131,28 @@ static void wake_up_idle_cpu(int cpu)
 	if (cpu == smp_processor_id())
 		return;
 
+	/*
+	 * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+	 * part of the idle loop. This forces an exit from the idle loop
+	 * and a round trip to schedule(). Now this could be optimized
+	 * because a simple new idle loop iteration is enough to
+	 * re-evaluate the next tick. Provided some re-ordering of tick
+	 * nohz functions that would need to follow TIF_NR_POLLING
+	 * clearing:
+	 *
+	 * - On most archs, a simple fetch_or on ti::flags with a
+	 *   "0" value would be enough to know if an IPI needs to be sent.
+	 *
+	 * - x86 needs to perform a last need_resched() check between
+	 *   monitor and mwait which doesn't take timers into account.
+	 *   There a dedicated TIF_TIMER flag would be required to
+	 *   fetch_or here and be checked along with TIF_NEED_RESCHED
+	 *   before mwait().
+	 *
+	 * However, remote timer enqueue is not such a frequent event
+	 * and testing of the above solutions didn't appear to report
+	 * much benefits.
+	 */
 	if (set_nr_and_not_polling(rq->idle))
 		smp_send_reschedule(cpu);
 	else
-- 
cgit v1.2.3


From 652ffc2104ec1f69dd4a46313888c33527145ccf Mon Sep 17 00:00:00 2001
From: Greg KH <gregkh@linuxfoundation.org>
Date: Mon, 12 Jun 2023 15:09:09 +0200
Subject: perf/core: Fix narrow startup race when creating the perf
 nr_addr_filters sysfs file

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/2023061204-decal-flyable-6090@gregkh
---
 kernel/events/core.c | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 08250981d9f4..4f0c45ab8d7d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11408,9 +11408,32 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 static struct attribute *pmu_dev_attrs[] = {
 	&dev_attr_type.attr,
 	&dev_attr_perf_event_mux_interval_ms.attr,
+	&dev_attr_nr_addr_filters.attr,
+	NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	if (!pmu->nr_addr_filters)
+		return 0;
+
+	return a->mode;
+
+	return 0;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+	.is_visible = pmu_dev_is_visible,
+	.attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+	&pmu_dev_attr_group,
 	NULL,
 };
-ATTRIBUTE_GROUPS(pmu_dev);
 
 static int pmu_bus_running;
 static struct bus_type pmu_bus = {
@@ -11447,18 +11470,11 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (ret)
 		goto free_dev;
 
-	/* For PMUs with address filters, throw in an extra attribute: */
-	if (pmu->nr_addr_filters)
-		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
-	if (ret)
-		goto del_dev;
-
-	if (pmu->attr_update)
+	if (pmu->attr_update) {
 		ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
-	if (ret)
-		goto del_dev;
+		if (ret)
+			goto del_dev;
+	}
 
 out:
 	return ret;
-- 
cgit v1.2.3


From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Fri, 10 Nov 2023 17:39:28 -0800
Subject: bpf: Do not allocate percpu memory at init stage

Kirill Shutemov reported significant percpu memory consumption increase after
booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
non-fix-size percpu mem allocation"). The percpu memory consumption is
increased from 111MB to 969MB. The number is from /proc/meminfo.

I tried to reproduce the issue with my local VM which at most supports upto
255 cpus. With 252 cpus, without the above commit, the percpu memory
consumption immediately after boot is 57MB while with the above commit the
percpu memory consumption is 231MB.

This is not good since so far percpu memory from bpf memory allocator is not
widely used yet. Let us change pre-allocation in init stage to on-demand
allocation when verifier detects there is a need of percpu memory for bpf
program. With this change, percpu memory consumption after boot can be reduced
signicantly.

  [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/

Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  2 +-
 kernel/bpf/core.c     |  8 +++-----
 kernel/bpf/verifier.c | 20 ++++++++++++++++++--
 3 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 35bff17396c0..6762dac3ef76 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
 extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+extern bool bpf_global_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 08626b519ce2..cd3afe57ece3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@
 #define OFF	insn->off
 #define IMM	insn->imm
 
-struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
 
 /* No hurry in this branch
  *
@@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
 
 	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
 	bpf_global_ma_set = !ret;
-	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-	bpf_global_percpu_ma_set = !ret;
-	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
+	return ret;
 }
 late_initcall(bpf_global_ma_init);
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2267d5ed14e..6da370a047fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@
 #include <linux/poison.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
 #include <net/xdp.h>
 
 #include "disasm.h"
@@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
 #undef BPF_LINK_TYPE
 };
 
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
 struct btf *btf_vmlinux;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
 
 static const struct bpf_line_info *
 find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
@@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
 					return -ENOMEM;
 
-				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
-					return -ENOMEM;
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (!bpf_global_percpu_ma_set) {
+						mutex_lock(&bpf_percpu_ma_lock);
+						if (!bpf_global_percpu_ma_set) {
+							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+							if (!err)
+								bpf_global_percpu_ma_set = true;
+						}
+						mutex_unlock(&bpf_percpu_ma_lock);
+						if (err)
+							return err;
+					}
+				}
 
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
 					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
-- 
cgit v1.2.3


From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:05:57 -0800
Subject: bpf: generalize reg_set_min_max() to handle non-const register
 comparisons

Generalize bounds adjustment logic of reg_set_min_max() to handle not
just register vs constant case, but in general any register vs any
register cases. For most of the operations it's trivial extension based
on range vs range comparison logic, we just need to properly pick
min/max of a range to compare against min/max of the other range.

For BPF_JSET we keep the original capabilities, just make sure JSET is
integrated in the common framework. This is manifested in the
internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more
uniform rev_opcode() handling. See the code for details. This allows to
reuse the same code exactly both for TRUE and FALSE branches without
explicitly handling both conditions with custom code.

Note also that now we don't need a special handling of BPF_JEQ/BPF_JNE
case none of the registers are constants. This is now just a normal
generic case handled by reg_set_min_max().

To make tnum handling cleaner, tnum_with_subreg() helper is added, as
that's a common operator when dealing with 32-bit subregister bounds.
This keeps the overall logic much less noisy when it comes to tnums.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tnum.h  |   4 +
 kernel/bpf/tnum.c     |   7 +-
 kernel/bpf/verifier.c | 314 ++++++++++++++++++++++----------------------------
 3 files changed, 146 insertions(+), 179 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 1c3948a1d6ad..3c13240077b8 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a);
 struct tnum tnum_subreg(struct tnum a);
 /* Returns the tnum with the lower 32-bit subreg cleared */
 struct tnum tnum_clear_subreg(struct tnum a);
+/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower
+ * 32-bit subreg in *subreg*
+ */
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg);
 /* Returns the tnum with the lower 32-bit subreg set to value */
 struct tnum tnum_const_subreg(struct tnum a, u32 value);
 /* Returns true if 32-bit subreg @a is a known constant*/
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 3d7127f439a1..f4c91c9b27d7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -208,7 +208,12 @@ struct tnum tnum_clear_subreg(struct tnum a)
 	return tnum_lshift(tnum_rshift(a, 32), 32);
 }
 
+struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg)
+{
+	return tnum_or(tnum_clear_subreg(reg), tnum_subreg(subreg));
+}
+
 struct tnum tnum_const_subreg(struct tnum a, u32 value)
 {
-	return tnum_or(tnum_clear_subreg(a), tnum_const(value));
+	return tnum_with_subreg(a, tnum_const(value));
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9ae6eae13471..39ce141c55d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14453,218 +14453,186 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
 	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
 }
 
-/* Adjusts the register min/max values in the case that the dst_reg and
- * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
- * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
- * Technically we can do similar adjustments for pointers to the same object,
- * but we don't support that right now.
+/* Opcode that corresponds to a *false* branch condition.
+ * E.g., if r1 < r2, then reverse (false) condition is r1 >= r2
  */
-static void reg_set_min_max(struct bpf_reg_state *true_reg1,
-			    struct bpf_reg_state *true_reg2,
-			    struct bpf_reg_state *false_reg1,
-			    struct bpf_reg_state *false_reg2,
-			    u8 opcode, bool is_jmp32)
+static u8 rev_opcode(u8 opcode)
 {
-	struct tnum false_32off, false_64off;
-	struct tnum true_32off, true_64off;
-	u64 uval;
-	u32 uval32;
-	s64 sval;
-	s32 sval32;
-
-	/* If either register is a pointer, we can't learn anything about its
-	 * variable offset from the compare (unless they were a pointer into
-	 * the same object, but we don't bother with that).
+	switch (opcode) {
+	case BPF_JEQ:		return BPF_JNE;
+	case BPF_JNE:		return BPF_JEQ;
+	/* JSET doesn't have it's reverse opcode in BPF, so add
+	 * BPF_X flag to denote the reverse of that operation
 	 */
-	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
-		return;
-
-	/* we expect right-hand registers (src ones) to be constants, for now */
-	if (!is_reg_const(false_reg2, is_jmp32)) {
-		opcode = flip_opcode(opcode);
-		swap(true_reg1, true_reg2);
-		swap(false_reg1, false_reg2);
+	case BPF_JSET:		return BPF_JSET | BPF_X;
+	case BPF_JSET | BPF_X:	return BPF_JSET;
+	case BPF_JGE:		return BPF_JLT;
+	case BPF_JGT:		return BPF_JLE;
+	case BPF_JLE:		return BPF_JGT;
+	case BPF_JLT:		return BPF_JGE;
+	case BPF_JSGE:		return BPF_JSLT;
+	case BPF_JSGT:		return BPF_JSLE;
+	case BPF_JSLE:		return BPF_JSGT;
+	case BPF_JSLT:		return BPF_JSGE;
+	default:		return 0;
 	}
-	if (!is_reg_const(false_reg2, is_jmp32))
-		return;
+}
 
-	false_32off = tnum_subreg(false_reg1->var_off);
-	false_64off = false_reg1->var_off;
-	true_32off = tnum_subreg(true_reg1->var_off);
-	true_64off = true_reg1->var_off;
-	uval = false_reg2->var_off.value;
-	uval32 = (u32)tnum_subreg(false_reg2->var_off).value;
-	sval = (s64)uval;
-	sval32 = (s32)uval32;
+/* Refine range knowledge for <reg1> <op> <reg>2 conditional operation. */
+static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
+				u8 opcode, bool is_jmp32)
+{
+	struct tnum t;
+	u64 val;
 
+again:
 	switch (opcode) {
-	/* JEQ/JNE comparison doesn't change the register equivalence.
-	 *
-	 * r1 = r2;
-	 * if (r1 == 42) goto label;
-	 * ...
-	 * label: // here both r1 and r2 are known to be 42.
-	 *
-	 * Hence when marking register as known preserve it's ID.
-	 */
 	case BPF_JEQ:
 		if (is_jmp32) {
-			__mark_reg32_known(true_reg1, uval32);
-			true_32off = tnum_subreg(true_reg1->var_off);
+			reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
+			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+			reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+			reg2->u32_min_value = reg1->u32_min_value;
+			reg2->u32_max_value = reg1->u32_max_value;
+			reg2->s32_min_value = reg1->s32_min_value;
+			reg2->s32_max_value = reg1->s32_max_value;
+
+			t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off));
+			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
+			reg2->var_off = tnum_with_subreg(reg2->var_off, t);
 		} else {
-			___mark_reg_known(true_reg1, uval);
-			true_64off = true_reg1->var_off;
+			reg1->umin_value = max(reg1->umin_value, reg2->umin_value);
+			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+			reg1->smin_value = max(reg1->smin_value, reg2->smin_value);
+			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+			reg2->umin_value = reg1->umin_value;
+			reg2->umax_value = reg1->umax_value;
+			reg2->smin_value = reg1->smin_value;
+			reg2->smax_value = reg1->smax_value;
+
+			reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off);
+			reg2->var_off = reg1->var_off;
 		}
 		break;
 	case BPF_JNE:
-		if (is_jmp32) {
-			__mark_reg32_known(false_reg1, uval32);
-			false_32off = tnum_subreg(false_reg1->var_off);
-		} else {
-			___mark_reg_known(false_reg1, uval);
-			false_64off = false_reg1->var_off;
-		}
+		/* we don't derive any new information for inequality yet */
 		break;
 	case BPF_JSET:
+		if (!is_reg_const(reg2, is_jmp32))
+			swap(reg1, reg2);
+		if (!is_reg_const(reg2, is_jmp32))
+			break;
+		val = reg_const_value(reg2, is_jmp32);
+		/* BPF_JSET (i.e., TRUE branch, *not* BPF_JSET | BPF_X)
+		 * requires single bit to learn something useful. E.g., if we
+		 * know that `r1 & 0x3` is true, then which bits (0, 1, or both)
+		 * are actually set? We can learn something definite only if
+		 * it's a single-bit value to begin with.
+		 *
+		 * BPF_JSET | BPF_X (i.e., negation of BPF_JSET) doesn't have
+		 * this restriction. I.e., !(r1 & 0x3) means neither bit 0 nor
+		 * bit 1 is set, which we can readily use in adjustments.
+		 */
+		if (!is_power_of_2(val))
+			break;
 		if (is_jmp32) {
-			false_32off = tnum_and(false_32off, tnum_const(~uval32));
-			if (is_power_of_2(uval32))
-				true_32off = tnum_or(true_32off,
-						     tnum_const(uval32));
+			t = tnum_or(tnum_subreg(reg1->var_off), tnum_const(val));
+			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
 		} else {
-			false_64off = tnum_and(false_64off, tnum_const(~uval));
-			if (is_power_of_2(uval))
-				true_64off = tnum_or(true_64off,
-						     tnum_const(uval));
+			reg1->var_off = tnum_or(reg1->var_off, tnum_const(val));
 		}
 		break;
-	case BPF_JGE:
-	case BPF_JGT:
-	{
+	case BPF_JSET | BPF_X: /* reverse of BPF_JSET, see rev_opcode() */
+		if (!is_reg_const(reg2, is_jmp32))
+			swap(reg1, reg2);
+		if (!is_reg_const(reg2, is_jmp32))
+			break;
+		val = reg_const_value(reg2, is_jmp32);
 		if (is_jmp32) {
-			u32 false_umax = opcode == BPF_JGT ? uval32  : uval32 - 1;
-			u32 true_umin = opcode == BPF_JGT ? uval32 + 1 : uval32;
-
-			false_reg1->u32_max_value = min(false_reg1->u32_max_value,
-						       false_umax);
-			true_reg1->u32_min_value = max(true_reg1->u32_min_value,
-						      true_umin);
+			t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
+			reg1->var_off = tnum_with_subreg(reg1->var_off, t);
 		} else {
-			u64 false_umax = opcode == BPF_JGT ? uval    : uval - 1;
-			u64 true_umin = opcode == BPF_JGT ? uval + 1 : uval;
-
-			false_reg1->umax_value = min(false_reg1->umax_value, false_umax);
-			true_reg1->umin_value = max(true_reg1->umin_value, true_umin);
+			reg1->var_off = tnum_and(reg1->var_off, tnum_const(~val));
 		}
 		break;
-	}
-	case BPF_JSGE:
-	case BPF_JSGT:
-	{
+	case BPF_JLE:
 		if (is_jmp32) {
-			s32 false_smax = opcode == BPF_JSGT ? sval32    : sval32 - 1;
-			s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32;
-
-			false_reg1->s32_max_value = min(false_reg1->s32_max_value, false_smax);
-			true_reg1->s32_min_value = max(true_reg1->s32_min_value, true_smin);
+			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value);
+			reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value);
 		} else {
-			s64 false_smax = opcode == BPF_JSGT ? sval    : sval - 1;
-			s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
-
-			false_reg1->smax_value = min(false_reg1->smax_value, false_smax);
-			true_reg1->smin_value = max(true_reg1->smin_value, true_smin);
+			reg1->umax_value = min(reg1->umax_value, reg2->umax_value);
+			reg2->umin_value = max(reg1->umin_value, reg2->umin_value);
 		}
 		break;
-	}
-	case BPF_JLE:
 	case BPF_JLT:
-	{
 		if (is_jmp32) {
-			u32 false_umin = opcode == BPF_JLT ? uval32  : uval32 + 1;
-			u32 true_umax = opcode == BPF_JLT ? uval32 - 1 : uval32;
-
-			false_reg1->u32_min_value = max(false_reg1->u32_min_value,
-						       false_umin);
-			true_reg1->u32_max_value = min(true_reg1->u32_max_value,
-						      true_umax);
+			reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1);
+			reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value);
 		} else {
-			u64 false_umin = opcode == BPF_JLT ? uval    : uval + 1;
-			u64 true_umax = opcode == BPF_JLT ? uval - 1 : uval;
-
-			false_reg1->umin_value = max(false_reg1->umin_value, false_umin);
-			true_reg1->umax_value = min(true_reg1->umax_value, true_umax);
+			reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1);
+			reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value);
 		}
 		break;
-	}
 	case BPF_JSLE:
+		if (is_jmp32) {
+			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value);
+			reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value);
+		} else {
+			reg1->smax_value = min(reg1->smax_value, reg2->smax_value);
+			reg2->smin_value = max(reg1->smin_value, reg2->smin_value);
+		}
+		break;
 	case BPF_JSLT:
-	{
 		if (is_jmp32) {
-			s32 false_smin = opcode == BPF_JSLT ? sval32    : sval32 + 1;
-			s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32;
-
-			false_reg1->s32_min_value = max(false_reg1->s32_min_value, false_smin);
-			true_reg1->s32_max_value = min(true_reg1->s32_max_value, true_smax);
+			reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1);
+			reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value);
 		} else {
-			s64 false_smin = opcode == BPF_JSLT ? sval    : sval + 1;
-			s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
-
-			false_reg1->smin_value = max(false_reg1->smin_value, false_smin);
-			true_reg1->smax_value = min(true_reg1->smax_value, true_smax);
+			reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1);
+			reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value);
 		}
 		break;
-	}
+	case BPF_JGE:
+	case BPF_JGT:
+	case BPF_JSGE:
+	case BPF_JSGT:
+		/* just reuse LE/LT logic above */
+		opcode = flip_opcode(opcode);
+		swap(reg1, reg2);
+		goto again;
 	default:
 		return;
 	}
-
-	if (is_jmp32) {
-		false_reg1->var_off = tnum_or(tnum_clear_subreg(false_64off),
-					     tnum_subreg(false_32off));
-		true_reg1->var_off = tnum_or(tnum_clear_subreg(true_64off),
-					    tnum_subreg(true_32off));
-		reg_bounds_sync(false_reg1);
-		reg_bounds_sync(true_reg1);
-	} else {
-		false_reg1->var_off = false_64off;
-		true_reg1->var_off = true_64off;
-		reg_bounds_sync(false_reg1);
-		reg_bounds_sync(true_reg1);
-	}
-}
-
-/* Regs are known to be equal, so intersect their min/max/var_off */
-static void __reg_combine_min_max(struct bpf_reg_state *src_reg,
-				  struct bpf_reg_state *dst_reg)
-{
-	src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value,
-							dst_reg->umin_value);
-	src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value,
-							dst_reg->umax_value);
-	src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value,
-							dst_reg->smin_value);
-	src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value,
-							dst_reg->smax_value);
-	src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off,
-							     dst_reg->var_off);
-	reg_bounds_sync(src_reg);
-	reg_bounds_sync(dst_reg);
 }
 
-static void reg_combine_min_max(struct bpf_reg_state *true_src,
-				struct bpf_reg_state *true_dst,
-				struct bpf_reg_state *false_src,
-				struct bpf_reg_state *false_dst,
-				u8 opcode)
+/* Adjusts the register min/max values in the case that the dst_reg and
+ * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K
+ * check, in which case we havea fake SCALAR_VALUE representing insn->imm).
+ * Technically we can do similar adjustments for pointers to the same object,
+ * but we don't support that right now.
+ */
+static void reg_set_min_max(struct bpf_reg_state *true_reg1,
+			    struct bpf_reg_state *true_reg2,
+			    struct bpf_reg_state *false_reg1,
+			    struct bpf_reg_state *false_reg2,
+			    u8 opcode, bool is_jmp32)
 {
-	switch (opcode) {
-	case BPF_JEQ:
-		__reg_combine_min_max(true_src, true_dst);
-		break;
-	case BPF_JNE:
-		__reg_combine_min_max(false_src, false_dst);
-		break;
-	}
+	/* If either register is a pointer, we can't learn anything about its
+	 * variable offset from the compare (unless they were a pointer into
+	 * the same object, but we don't bother with that).
+	 */
+	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
+		return;
+
+	/* fallthrough (FALSE) branch */
+	regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
+	reg_bounds_sync(false_reg1);
+	reg_bounds_sync(false_reg2);
+
+	/* jump (TRUE) branch */
+	regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
+	reg_bounds_sync(true_reg1);
+	reg_bounds_sync(true_reg2);
 }
 
 static void mark_ptr_or_null_reg(struct bpf_func_state *state,
@@ -14961,22 +14929,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		reg_set_min_max(&other_branch_regs[insn->dst_reg],
 				&other_branch_regs[insn->src_reg],
 				dst_reg, src_reg, opcode, is_jmp32);
-
-		if (dst_reg->type == SCALAR_VALUE &&
-		    src_reg->type == SCALAR_VALUE &&
-		    !is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) {
-			/* Comparing for equality, we can combine knowledge */
-			reg_combine_min_max(&other_branch_regs[insn->src_reg],
-					    &other_branch_regs[insn->dst_reg],
-					    src_reg, dst_reg, opcode);
-		}
 	} else /* BPF_SRC(insn->code) == BPF_K */ {
 		reg_set_min_max(&other_branch_regs[insn->dst_reg],
 				src_reg /* fake one */,
 				dst_reg, src_reg /* same fake one */,
 				opcode, is_jmp32);
 	}
-
 	if (BPF_SRC(insn->code) == BPF_X &&
 	    src_reg->type == SCALAR_VALUE && src_reg->id &&
 	    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
-- 
cgit v1.2.3


From 96381879a370425a30b810906946f64c0726450e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:05:58 -0800
Subject: bpf: generalize is_scalar_branch_taken() logic

Generalize is_branch_taken logic for SCALAR_VALUE register to handle
cases when both registers are not constants. Previously supported
<range> vs <scalar> cases are a natural subset of more generic <range>
vs <range> set of cases.

Generalized logic relies on straightforward segment intersection checks.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 98 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 58 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 39ce141c55d3..f459ad99256e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14261,82 +14261,99 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
 				  u8 opcode, bool is_jmp32)
 {
 	struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off;
+	struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off;
 	u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value;
 	u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value;
 	s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value;
 	s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value;
-	u64 uval = is_jmp32 ? (u32)tnum_subreg(reg2->var_off).value : reg2->var_off.value;
-	s64 sval = is_jmp32 ? (s32)uval : (s64)uval;
+	u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value;
+	u64 umax2 = is_jmp32 ? (u64)reg2->u32_max_value : reg2->umax_value;
+	s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value;
+	s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value;
 
 	switch (opcode) {
 	case BPF_JEQ:
-		if (tnum_is_const(t1))
-			return !!tnum_equals_const(t1, uval);
-		else if (uval < umin1 || uval > umax1)
+		/* constants, umin/umax and smin/smax checks would be
+		 * redundant in this case because they all should match
+		 */
+		if (tnum_is_const(t1) && tnum_is_const(t2))
+			return t1.value == t2.value;
+		/* non-overlapping ranges */
+		if (umin1 > umax2 || umax1 < umin2)
 			return 0;
-		else if (sval < smin1 || sval > smax1)
+		if (smin1 > smax2 || smax1 < smin2)
 			return 0;
 		break;
 	case BPF_JNE:
-		if (tnum_is_const(t1))
-			return !tnum_equals_const(t1, uval);
-		else if (uval < umin1 || uval > umax1)
+		/* constants, umin/umax and smin/smax checks would be
+		 * redundant in this case because they all should match
+		 */
+		if (tnum_is_const(t1) && tnum_is_const(t2))
+			return t1.value != t2.value;
+		/* non-overlapping ranges */
+		if (umin1 > umax2 || umax1 < umin2)
 			return 1;
-		else if (sval < smin1 || sval > smax1)
+		if (smin1 > smax2 || smax1 < smin2)
 			return 1;
 		break;
 	case BPF_JSET:
-		if ((~t1.mask & t1.value) & uval)
+		if (!is_reg_const(reg2, is_jmp32)) {
+			swap(reg1, reg2);
+			swap(t1, t2);
+		}
+		if (!is_reg_const(reg2, is_jmp32))
+			return -1;
+		if ((~t1.mask & t1.value) & t2.value)
 			return 1;
-		if (!((t1.mask | t1.value) & uval))
+		if (!((t1.mask | t1.value) & t2.value))
 			return 0;
 		break;
 	case BPF_JGT:
-		if (umin1 > uval )
+		if (umin1 > umax2)
 			return 1;
-		else if (umax1 <= uval)
+		else if (umax1 <= umin2)
 			return 0;
 		break;
 	case BPF_JSGT:
-		if (smin1 > sval)
+		if (smin1 > smax2)
 			return 1;
-		else if (smax1 <= sval)
+		else if (smax1 <= smin2)
 			return 0;
 		break;
 	case BPF_JLT:
-		if (umax1 < uval)
+		if (umax1 < umin2)
 			return 1;
-		else if (umin1 >= uval)
+		else if (umin1 >= umax2)
 			return 0;
 		break;
 	case BPF_JSLT:
-		if (smax1 < sval)
+		if (smax1 < smin2)
 			return 1;
-		else if (smin1 >= sval)
+		else if (smin1 >= smax2)
 			return 0;
 		break;
 	case BPF_JGE:
-		if (umin1 >= uval)
+		if (umin1 >= umax2)
 			return 1;
-		else if (umax1 < uval)
+		else if (umax1 < umin2)
 			return 0;
 		break;
 	case BPF_JSGE:
-		if (smin1 >= sval)
+		if (smin1 >= smax2)
 			return 1;
-		else if (smax1 < sval)
+		else if (smax1 < smin2)
 			return 0;
 		break;
 	case BPF_JLE:
-		if (umax1 <= uval)
+		if (umax1 <= umin2)
 			return 1;
-		else if (umin1 > uval)
+		else if (umin1 > umax2)
 			return 0;
 		break;
 	case BPF_JSLE:
-		if (smax1 <= sval)
+		if (smax1 <= smin2)
 			return 1;
-		else if (smin1 > sval)
+		else if (smin1 > smax2)
 			return 0;
 		break;
 	}
@@ -14415,28 +14432,28 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg,
 static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2,
 			   u8 opcode, bool is_jmp32)
 {
-	u64 val;
-
 	if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32)
 		return is_pkt_ptr_branch_taken(reg1, reg2, opcode);
 
-	/* try to make sure reg2 is a constant SCALAR_VALUE */
-	if (!is_reg_const(reg2, is_jmp32)) {
-		opcode = flip_opcode(opcode);
-		swap(reg1, reg2);
-	}
-	/* for now we expect reg2 to be a constant to make any useful decisions */
-	if (!is_reg_const(reg2, is_jmp32))
-		return -1;
-	val = reg_const_value(reg2, is_jmp32);
+	if (__is_pointer_value(false, reg1) || __is_pointer_value(false, reg2)) {
+		u64 val;
+
+		/* arrange that reg2 is a scalar, and reg1 is a pointer */
+		if (!is_reg_const(reg2, is_jmp32)) {
+			opcode = flip_opcode(opcode);
+			swap(reg1, reg2);
+		}
+		/* and ensure that reg2 is a constant */
+		if (!is_reg_const(reg2, is_jmp32))
+			return -1;
 
-	if (__is_pointer_value(false, reg1)) {
 		if (!reg_not_null(reg1))
 			return -1;
 
 		/* If pointer is valid tests against zero will fail so we can
 		 * use this to direct branch taken.
 		 */
+		val = reg_const_value(reg2, is_jmp32);
 		if (val != 0)
 			return -1;
 
@@ -14450,6 +14467,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg
 		}
 	}
 
+	/* now deal with two scalars, but not necessarily constants */
 	return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32);
 }
 
-- 
cgit v1.2.3


From be41a203bb9e0159099e189e510388fe61962eb8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:05:59 -0800
Subject: bpf: enhance BPF_JEQ/BPF_JNE is_branch_taken logic

Use 32-bit subranges to prune some 64-bit BPF_JEQ/BPF_JNE conditions
that otherwise would be "inconclusive" (i.e., is_branch_taken() would
return -1). This can happen, for example, when registers are initialized
as 64-bit u64/s64, then compared for inequality as 32-bit subregisters,
and then followed by 64-bit equality/inequality check. That 32-bit
inequality can establish some pattern for lower 32 bits of a register
(e.g., s< 0 condition determines whether the bit #31 is zero or not),
while overall 64-bit value could be anything (according to a value range
representation).

This is not a fancy quirky special case, but actually a handling that's
necessary to prevent correctness issue with BPF verifier's range
tracking: set_range_min_max() assumes that register ranges are
non-overlapping, and if that condition is not guaranteed by
is_branch_taken() we can end up with invalid ranges, where min > max.

  [0] https://lore.kernel.org/bpf/CACkBjsY2q1_fUohD7hRmKGqv1MV=eP2f6XK8kjkYNw7BaiF8iQ@mail.gmail.com/

Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231112010609.848406-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f459ad99256e..65570eedfe88 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14283,6 +14283,18 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
 			return 0;
 		if (smin1 > smax2 || smax1 < smin2)
 			return 0;
+		if (!is_jmp32) {
+			/* if 64-bit ranges are inconclusive, see if we can
+			 * utilize 32-bit subrange knowledge to eliminate
+			 * branches that can't be taken a priori
+			 */
+			if (reg1->u32_min_value > reg2->u32_max_value ||
+			    reg1->u32_max_value < reg2->u32_min_value)
+				return 0;
+			if (reg1->s32_min_value > reg2->s32_max_value ||
+			    reg1->s32_max_value < reg2->s32_min_value)
+				return 0;
+		}
 		break;
 	case BPF_JNE:
 		/* constants, umin/umax and smin/smax checks would be
@@ -14295,6 +14307,18 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta
 			return 1;
 		if (smin1 > smax2 || smax1 < smin2)
 			return 1;
+		if (!is_jmp32) {
+			/* if 64-bit ranges are inconclusive, see if we can
+			 * utilize 32-bit subrange knowledge to eliminate
+			 * branches that can't be taken a priori
+			 */
+			if (reg1->u32_min_value > reg2->u32_max_value ||
+			    reg1->u32_max_value < reg2->u32_min_value)
+				return 1;
+			if (reg1->s32_min_value > reg2->s32_max_value ||
+			    reg1->s32_max_value < reg2->s32_min_value)
+				return 1;
+		}
 		break;
 	case BPF_JSET:
 		if (!is_reg_const(reg2, is_jmp32)) {
-- 
cgit v1.2.3


From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:06:00 -0800
Subject: bpf: add register bounds sanity checks and sanitization

Add simple sanity checks that validate well-formed ranges (min <= max)
across u64, s64, u32, and s32 ranges. Also for cases when the value is
constant (either 64-bit or 32-bit), we validate that ranges and tnums
are in agreement.

These bounds checks are performed at the end of BPF_ALU/BPF_ALU64
operations, on conditional jumps, and for LDX instructions (where subreg
zero/sign extension is probably the most important to check). This
covers most of the interesting cases.

Also, we validate the sanity of the return register when manually
adjusting it for some special helpers.

By default, sanity violation will trigger a warning in verifier log and
resetting register bounds to "unbounded" ones. But to aid development
and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will
trigger hard failure of verification with -EFAULT on register bounds
violations. This allows selftests to catch such issues. veristat will
also gain a CLI option to enable this behavior.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h   |   1 +
 include/uapi/linux/bpf.h       |   3 ++
 kernel/bpf/syscall.c           |   3 +-
 kernel/bpf/verifier.c          | 117 ++++++++++++++++++++++++++++++++---------
 tools/include/uapi/linux/bpf.h |   3 ++
 5 files changed, 101 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 24213a99cc79..402b6bc44a1b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -602,6 +602,7 @@ struct bpf_verifier_env {
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	bool test_state_freq;		/* test verifier with different pruning frequency */
+	bool test_sanity_strict;	/* fail verification on sanity violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7cf8bcf9f6a2..8a5855fcee69 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1200,6 +1200,9 @@ enum bpf_perf_event_type {
  */
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0ed286b8a0f0..f266e03ba342 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2573,7 +2573,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_SLEEPABLE |
 				 BPF_F_TEST_RND_HI32 |
 				 BPF_F_XDP_HAS_FRAGS |
-				 BPF_F_XDP_DEV_BOUND_ONLY))
+				 BPF_F_XDP_DEV_BOUND_ONLY |
+				 BPF_F_TEST_SANITY_STRICT))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 65570eedfe88..e7edacf86e0f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2615,6 +2615,56 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
 	__update_reg_bounds(reg);
 }
 
+static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *reg, const char *ctx)
+{
+	const char *msg;
+
+	if (reg->umin_value > reg->umax_value ||
+	    reg->smin_value > reg->smax_value ||
+	    reg->u32_min_value > reg->u32_max_value ||
+	    reg->s32_min_value > reg->s32_max_value) {
+		    msg = "range bounds violation";
+		    goto out;
+	}
+
+	if (tnum_is_const(reg->var_off)) {
+		u64 uval = reg->var_off.value;
+		s64 sval = (s64)uval;
+
+		if (reg->umin_value != uval || reg->umax_value != uval ||
+		    reg->smin_value != sval || reg->smax_value != sval) {
+			msg = "const tnum out of sync with range bounds";
+			goto out;
+		}
+	}
+
+	if (tnum_subreg_is_const(reg->var_off)) {
+		u32 uval32 = tnum_subreg(reg->var_off).value;
+		s32 sval32 = (s32)uval32;
+
+		if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 ||
+		    reg->s32_min_value != sval32 || reg->s32_max_value != sval32) {
+			msg = "const subreg tnum out of sync with range bounds";
+			goto out;
+		}
+	}
+
+	return 0;
+out:
+	verbose(env, "REG SANITY VIOLATION (%s): %s u64=[%#llx, %#llx] "
+		"s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
+		ctx, msg, reg->umin_value, reg->umax_value,
+		reg->smin_value, reg->smax_value,
+		reg->u32_min_value, reg->u32_max_value,
+		reg->s32_min_value, reg->s32_max_value,
+		reg->var_off.value, reg->var_off.mask);
+	if (env->test_sanity_strict)
+		return -EFAULT;
+	__mark_reg_unbounded(reg);
+	return 0;
+}
+
 static bool __reg32_bound_s64(s32 a)
 {
 	return a >= 0 && a <= S32_MAX;
@@ -9982,14 +10032,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
-static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
-				   int func_id,
-				   struct bpf_call_arg_meta *meta)
+static int do_refine_retval_range(struct bpf_verifier_env *env,
+				  struct bpf_reg_state *regs, int ret_type,
+				  int func_id,
+				  struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
 
 	if (ret_type != RET_INTEGER)
-		return;
+		return 0;
 
 	switch (func_id) {
 	case BPF_FUNC_get_stack:
@@ -10015,6 +10066,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
 		reg_bounds_sync(ret_reg);
 		break;
 	}
+
+	return reg_bounds_sanity_check(env, ret_reg, "retval");
 }
 
 static int
@@ -10666,7 +10719,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		regs[BPF_REG_0].ref_obj_id = id;
 	}
 
-	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+	err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta);
+	if (err)
+		return err;
 
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
@@ -14166,13 +14221,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		/* check dest operand */
 		err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+		err = err ?: adjust_reg_min_max_vals(env, insn);
 		if (err)
 			return err;
-
-		return adjust_reg_min_max_vals(env, insn);
 	}
 
-	return 0;
+	return reg_bounds_sanity_check(env, &regs[insn->dst_reg], "alu");
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
@@ -14653,18 +14707,21 @@ again:
  * Technically we can do similar adjustments for pointers to the same object,
  * but we don't support that right now.
  */
-static void reg_set_min_max(struct bpf_reg_state *true_reg1,
-			    struct bpf_reg_state *true_reg2,
-			    struct bpf_reg_state *false_reg1,
-			    struct bpf_reg_state *false_reg2,
-			    u8 opcode, bool is_jmp32)
+static int reg_set_min_max(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *true_reg1,
+			   struct bpf_reg_state *true_reg2,
+			   struct bpf_reg_state *false_reg1,
+			   struct bpf_reg_state *false_reg2,
+			   u8 opcode, bool is_jmp32)
 {
+	int err;
+
 	/* If either register is a pointer, we can't learn anything about its
 	 * variable offset from the compare (unless they were a pointer into
 	 * the same object, but we don't bother with that).
 	 */
 	if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE)
-		return;
+		return 0;
 
 	/* fallthrough (FALSE) branch */
 	regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32);
@@ -14675,6 +14732,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg1,
 	regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32);
 	reg_bounds_sync(true_reg1);
 	reg_bounds_sync(true_reg2);
+
+	err = reg_bounds_sanity_check(env, true_reg1, "true_reg1");
+	err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2");
+	err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1");
+	err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2");
+	return err;
 }
 
 static void mark_ptr_or_null_reg(struct bpf_func_state *state,
@@ -14968,15 +15031,20 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
 
 	if (BPF_SRC(insn->code) == BPF_X) {
-		reg_set_min_max(&other_branch_regs[insn->dst_reg],
-				&other_branch_regs[insn->src_reg],
-				dst_reg, src_reg, opcode, is_jmp32);
+		err = reg_set_min_max(env,
+				      &other_branch_regs[insn->dst_reg],
+				      &other_branch_regs[insn->src_reg],
+				      dst_reg, src_reg, opcode, is_jmp32);
 	} else /* BPF_SRC(insn->code) == BPF_K */ {
-		reg_set_min_max(&other_branch_regs[insn->dst_reg],
-				src_reg /* fake one */,
-				dst_reg, src_reg /* same fake one */,
-				opcode, is_jmp32);
+		err = reg_set_min_max(env,
+				      &other_branch_regs[insn->dst_reg],
+				      src_reg /* fake one */,
+				      dst_reg, src_reg /* same fake one */,
+				      opcode, is_jmp32);
 	}
+	if (err)
+		return err;
+
 	if (BPF_SRC(insn->code) == BPF_X &&
 	    src_reg->type == SCALAR_VALUE && src_reg->id &&
 	    !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) {
@@ -17479,10 +17547,8 @@ static int do_check(struct bpf_verifier_env *env)
 					       insn->off, BPF_SIZE(insn->code),
 					       BPF_READ, insn->dst_reg, false,
 					       BPF_MODE(insn->code) == BPF_MEMSX);
-			if (err)
-				return err;
-
-			err = save_aux_ptr_type(env, src_reg_type, true);
+			err = err ?: save_aux_ptr_type(env, src_reg_type, true);
+			err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
 			if (err)
 				return err;
 		} else if (class == BPF_STX) {
@@ -20769,6 +20835,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+	env->test_sanity_strict = attr->prog_flags & BPF_F_TEST_SANITY_STRICT;
 
 	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7cf8bcf9f6a2..8a5855fcee69 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1200,6 +1200,9 @@ enum bpf_perf_event_type {
  */
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
  */
-- 
cgit v1.2.3


From 3cf98cf594ea923b8b1e0385b580d3d8aae68c06 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:06:01 -0800
Subject: bpf: remove redundant s{32,64} -> u{32,64} deduction logic

Equivalent checks were recently added in more succinct and, arguably,
safer form in:
  - f188765f23a5 ("bpf: derive smin32/smax32 from umin32/umax32 bounds");
  - 2e74aef782d3 ("bpf: derive smin/smax from umin/max bounds").

The checks we are removing in this patch set do similar checks to detect
if entire u32/u64 range has signed bit set or not set, but does it with
two separate checks.

Further, we forcefully overwrite either smin or smax (and 32-bit equvalents)
without applying normal min/max intersection logic. It's not clear why
that would be correct in all cases and seems to work by accident. This
logic is also "gated" by previous signed -> unsigned derivation, which
returns early.

All this is quite confusing and seems error-prone, while we already have
at least equivalent checks happening earlier. So remove this duplicate
and error-prone logic to simplify things a bit.

Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231112010609.848406-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 36 ------------------------------------
 1 file changed, 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e7edacf86e0f..53a9e3e79ab4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2411,24 +2411,6 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 			min_t(u32, reg->s32_max_value, reg->u32_max_value);
 		return;
 	}
-	/* Learn sign from unsigned bounds.  Signed bounds cross the sign
-	 * boundary, so we must be careful.
-	 */
-	if ((s32)reg->u32_max_value >= 0) {
-		/* Positive.  We can't learn anything from the smin, but smax
-		 * is positive, hence safe.
-		 */
-		reg->s32_min_value = reg->u32_min_value;
-		reg->s32_max_value = reg->u32_max_value =
-			min_t(u32, reg->s32_max_value, reg->u32_max_value);
-	} else if ((s32)reg->u32_min_value < 0) {
-		/* Negative.  We can't learn anything from the smax, but smin
-		 * is negative, hence safe.
-		 */
-		reg->s32_min_value = reg->u32_min_value =
-			max_t(u32, reg->s32_min_value, reg->u32_min_value);
-		reg->s32_max_value = reg->u32_max_value;
-	}
 }
 
 static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
@@ -2516,24 +2498,6 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
 							  reg->umax_value);
 		return;
 	}
-	/* Learn sign from unsigned bounds.  Signed bounds cross the sign
-	 * boundary, so we must be careful.
-	 */
-	if ((s64)reg->umax_value >= 0) {
-		/* Positive.  We can't learn anything from the smin, but smax
-		 * is positive, hence safe.
-		 */
-		reg->smin_value = reg->umin_value;
-		reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
-							  reg->umax_value);
-	} else if ((s64)reg->umin_value < 0) {
-		/* Negative.  We can't learn anything from the smax, but smin
-		 * is negative, hence safe.
-		 */
-		reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
-							  reg->umin_value);
-		reg->smax_value = reg->umax_value;
-	}
 }
 
 static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
-- 
cgit v1.2.3


From cf5fe3c71c5a34ac0108afc550407c672d0a032d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 11 Nov 2023 17:06:02 -0800
Subject: bpf: make __reg{32,64}_deduce_bounds logic more robust

This change doesn't seem to have any effect on selftests and production
BPF object files, but we preemptively try to make it more robust.

First, "learn sign from signed bounds" comment is misleading, as we are
learning not just sign, but also values.

Second, we simplify the check for determining whether entire range is
positive or negative similarly to other checks added earlier, using
appropriate u32/u64 cast and single comparisons. As explain in comments
in __reg64_deduce_bounds(), the checks are equivalent.

Last but not least, smin/smax and s32_min/s32_max reassignment based on
min/max of both umin/umax and smin/smax (and 32-bit equivalents) is hard
to explain and justify. We are updating unsigned bounds from signed
bounds, why would we update signed bounds at the same time? This might
be correct, but it's far from obvious why and the code or comments don't
try to justify this. Given we've added a separate deduction of signed
bounds from unsigned bounds earlier, this seems at least redundant, if
not just wrong.

In short, we remove doubtful pieces, and streamline the rest to follow
the logic and approach of the rest of reg_bounds_sync() checks.

Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231112010609.848406-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 53a9e3e79ab4..59505881e7a7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2399,17 +2399,13 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
 		reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value);
 		reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value);
 	}
-	/* Learn sign from signed bounds.
-	 * If we cannot cross the sign boundary, then signed and unsigned bounds
+	/* If we cannot cross the sign boundary, then signed and unsigned bounds
 	 * are the same, so combine.  This works even in the negative case, e.g.
 	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
 	 */
-	if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
-		reg->s32_min_value = reg->u32_min_value =
-			max_t(u32, reg->s32_min_value, reg->u32_min_value);
-		reg->s32_max_value = reg->u32_max_value =
-			min_t(u32, reg->s32_max_value, reg->u32_max_value);
-		return;
+	if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
+		reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value);
+		reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value);
 	}
 }
 
@@ -2486,17 +2482,13 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
 		reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value);
 		reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value);
 	}
-	/* Learn sign from signed bounds.
-	 * If we cannot cross the sign boundary, then signed and unsigned bounds
+	/* If we cannot cross the sign boundary, then signed and unsigned bounds
 	 * are the same, so combine.  This works even in the negative case, e.g.
 	 * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
 	 */
-	if (reg->smin_value >= 0 || reg->smax_value < 0) {
-		reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value,
-							  reg->umin_value);
-		reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value,
-							  reg->umax_value);
-		return;
+	if ((u64)reg->smin_value <= (u64)reg->smax_value) {
+		reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
+		reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
 	}
 }
 
-- 
cgit v1.2.3


From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 09:14:04 -0800
Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS

Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral
BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0].

A few selftests and veristat need to be adjusted in the same patch as
well.

  [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                             |  2 +-
 include/uapi/linux/bpf.h                                 |  2 +-
 kernel/bpf/syscall.c                                     |  2 +-
 kernel/bpf/verifier.c                                    |  6 +++---
 tools/include/uapi/linux/bpf.h                           |  2 +-
 tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c |  2 +-
 tools/testing/selftests/bpf/prog_tests/reg_bounds.c      |  2 +-
 tools/testing/selftests/bpf/progs/verifier_bounds.c      |  4 ++--
 tools/testing/selftests/bpf/test_loader.c                |  6 +++---
 tools/testing/selftests/bpf/test_sock_addr.c             |  3 +--
 tools/testing/selftests/bpf/test_verifier.c              |  2 +-
 tools/testing/selftests/bpf/testing_helpers.c            |  4 ++--
 tools/testing/selftests/bpf/veristat.c                   | 12 ++++++------
 13 files changed, 24 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 402b6bc44a1b..52a4012b8255 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -602,7 +602,7 @@ struct bpf_verifier_env {
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	bool test_state_freq;		/* test verifier with different pruning frequency */
-	bool test_sanity_strict;	/* fail verification on sanity violations */
+	bool test_reg_invariants;	/* fail verification on register invariants violations */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_verifier_state_list *free_list;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8a5855fcee69..7a5498242eaa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1201,7 +1201,7 @@ enum bpf_perf_event_type {
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
 /* The verifier internal test flag. Behavior is undefined */
-#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+#define BPF_F_TEST_REG_INVARIANTS	(1U << 7)
 
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f266e03ba342..5e43ddd1b83f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2574,7 +2574,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_RND_HI32 |
 				 BPF_F_XDP_HAS_FRAGS |
 				 BPF_F_XDP_DEV_BOUND_ONLY |
-				 BPF_F_TEST_SANITY_STRICT))
+				 BPF_F_TEST_REG_INVARIANTS))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 59505881e7a7..7c3461b89513 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2608,14 +2608,14 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
 
 	return 0;
 out:
-	verbose(env, "REG SANITY VIOLATION (%s): %s u64=[%#llx, %#llx] "
+	verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
 		"s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
 		ctx, msg, reg->umin_value, reg->umax_value,
 		reg->smin_value, reg->smax_value,
 		reg->u32_min_value, reg->u32_max_value,
 		reg->s32_min_value, reg->s32_max_value,
 		reg->var_off.value, reg->var_off.mask);
-	if (env->test_sanity_strict)
+	if (env->test_reg_invariants)
 		return -EFAULT;
 	__mark_reg_unbounded(reg);
 	return 0;
@@ -20791,7 +20791,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
-	env->test_sanity_strict = attr->prog_flags & BPF_F_TEST_SANITY_STRICT;
+	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
 
 	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct bpf_verifier_state_list *),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8a5855fcee69..7a5498242eaa 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1201,7 +1201,7 @@ enum bpf_perf_event_type {
 #define BPF_F_XDP_DEV_BOUND_ONLY	(1U << 6)
 
 /* The verifier internal test flag. Behavior is undefined */
-#define BPF_F_TEST_SANITY_STRICT	(1U << 7)
+#define BPF_F_TEST_REG_INVARIANTS	(1U << 7)
 
 /* link_create.kprobe_multi.flags used in LINK_CREATE command for
  * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index 3f2d70831873..e770912fc1d2 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -35,7 +35,7 @@ static int check_load(const char *file, enum bpf_prog_type type)
 	}
 
 	bpf_program__set_type(prog, type);
-	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT);
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS);
 	bpf_program__set_log_level(prog, 4 | extra_prog_load_log_flags);
 
 	err = bpf_object__load(obj);
diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
index fe0cb906644b..7a8b0bf0a7f8 100644
--- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
+++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
@@ -838,7 +838,7 @@ static int load_range_cmp_prog(struct range x, struct range y, enum op op,
 		.log_level = 2,
 		.log_buf = log_buf,
 		.log_size = log_sz,
-		.prog_flags = BPF_F_TEST_SANITY_STRICT,
+		.prog_flags = BPF_F_TEST_REG_INVARIANTS,
 	);
 
 	/* ; skip exit block below
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 0c1460936373..ec430b71730b 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -965,7 +965,7 @@ l0_%=:	r0 = 0;						\
 SEC("xdp")
 __description("bound check with JMP_JSLT for crossing 64-bit signed boundary")
 __success __retval(0)
-__flag(!BPF_F_TEST_SANITY_STRICT) /* known sanity violation */
+__flag(!BPF_F_TEST_REG_INVARIANTS) /* known invariants violation */
 __naked void crossing_64_bit_signed_boundary_2(void)
 {
 	asm volatile ("					\
@@ -1047,7 +1047,7 @@ l0_%=:	r0 = 0;						\
 SEC("xdp")
 __description("bound check with JMP32_JSLT for crossing 32-bit signed boundary")
 __success __retval(0)
-__flag(!BPF_F_TEST_SANITY_STRICT) /* known sanity violation */
+__flag(!BPF_F_TEST_REG_INVARIANTS) /* known invariants violation */
 __naked void crossing_32_bit_signed_boundary_2(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
index 57e27b1a73a6..a350ecdfba4a 100644
--- a/tools/testing/selftests/bpf/test_loader.c
+++ b/tools/testing/selftests/bpf/test_loader.c
@@ -179,7 +179,7 @@ static int parse_test_spec(struct test_loader *tester,
 	memset(spec, 0, sizeof(*spec));
 
 	spec->prog_name = bpf_program__name(prog);
-	spec->prog_flags = BPF_F_TEST_SANITY_STRICT; /* by default be strict */
+	spec->prog_flags = BPF_F_TEST_REG_INVARIANTS; /* by default be strict */
 
 	btf = bpf_object__btf(obj);
 	if (!btf) {
@@ -280,8 +280,8 @@ static int parse_test_spec(struct test_loader *tester,
 				update_flags(&spec->prog_flags, BPF_F_SLEEPABLE, clear);
 			} else if (strcmp(val, "BPF_F_XDP_HAS_FRAGS") == 0) {
 				update_flags(&spec->prog_flags, BPF_F_XDP_HAS_FRAGS, clear);
-			} else if (strcmp(val, "BPF_F_TEST_SANITY_STRICT") == 0) {
-				update_flags(&spec->prog_flags, BPF_F_TEST_SANITY_STRICT, clear);
+			} else if (strcmp(val, "BPF_F_TEST_REG_INVARIANTS") == 0) {
+				update_flags(&spec->prog_flags, BPF_F_TEST_REG_INVARIANTS, clear);
 			} else /* assume numeric value */ {
 				err = parse_int(val, &flags, "test prog flags");
 				if (err)
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 878c077e0fa7..b0068a9d2cfe 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -679,8 +679,7 @@ static int load_path(const struct sock_addr_test *test, const char *path)
 
 	bpf_program__set_type(prog, BPF_PROG_TYPE_CGROUP_SOCK_ADDR);
 	bpf_program__set_expected_attach_type(prog, test->expected_attach_type);
-	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32);
-	bpf_program__set_flags(prog, BPF_F_TEST_SANITY_STRICT);
+	bpf_program__set_flags(prog, BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS);
 
 	err = bpf_object__load(obj);
 	if (err) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 4992022f3137..f36e41435be7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1588,7 +1588,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	if (fixup_skips != skips)
 		return;
 
-	pflags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT;
+	pflags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS;
 	if (test->flags & F_LOAD_WITH_STRICT_ALIGNMENT)
 		pflags |= BPF_F_STRICT_ALIGNMENT;
 	if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
index 9786a94a666c..d2458c1b1671 100644
--- a/tools/testing/selftests/bpf/testing_helpers.c
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -276,7 +276,7 @@ int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
 	if (type != BPF_PROG_TYPE_UNSPEC && bpf_program__type(prog) != type)
 		bpf_program__set_type(prog, type);
 
-	flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT;
+	flags = bpf_program__flags(prog) | BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS;
 	bpf_program__set_flags(prog, flags);
 
 	err = bpf_object__load(obj);
@@ -299,7 +299,7 @@ int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 {
 	LIBBPF_OPTS(bpf_prog_load_opts, opts,
 		.kern_version = kern_version,
-		.prog_flags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_SANITY_STRICT,
+		.prog_flags = BPF_F_TEST_RND_HI32 | BPF_F_TEST_REG_INVARIANTS,
 		.log_level = extra_prog_load_log_flags,
 		.log_buf = log_buf,
 		.log_size = log_buf_sz,
diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 609fd9753af0..1d418d66e375 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -145,7 +145,7 @@ static struct env {
 	bool debug;
 	bool quiet;
 	bool force_checkpoints;
-	bool strict_range_sanity;
+	bool force_reg_invariants;
 	enum resfmt out_fmt;
 	bool show_version;
 	bool comparison_mode;
@@ -225,8 +225,8 @@ static const struct argp_option opts[] = {
 	{ "filter", 'f', "FILTER", 0, "Filter expressions (or @filename for file with expressions)." },
 	{ "test-states", 't', NULL, 0,
 	  "Force frequent BPF verifier state checkpointing (set BPF_F_TEST_STATE_FREQ program flag)" },
-	{ "test-sanity", 'r', NULL, 0,
-	  "Force strict BPF verifier register sanity behavior (BPF_F_TEST_SANITY_STRICT program flag)" },
+	{ "test-reg-invariants", 'r', NULL, 0,
+	  "Force BPF verifier failure on register invariant violation (BPF_F_TEST_REG_INVARIANTS program flag)" },
 	{},
 };
 
@@ -299,7 +299,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 		env.force_checkpoints = true;
 		break;
 	case 'r':
-		env.strict_range_sanity = true;
+		env.force_reg_invariants = true;
 		break;
 	case 'n':
 		errno = 0;
@@ -1028,8 +1028,8 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 
 	if (env.force_checkpoints)
 		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_STATE_FREQ);
-	if (env.strict_range_sanity)
-		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_SANITY_STRICT);
+	if (env.force_reg_invariants)
+		bpf_program__set_flags(prog, bpf_program__flags(prog) | BPF_F_TEST_REG_INVARIANTS);
 
 	err = bpf_object__load(obj);
 	env.progs_processed++;
-- 
cgit v1.2.3


From f73f6181eb057671e358ebac8ed7f0014f12efb8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 30 Aug 2023 09:32:15 -0700
Subject: userns: eliminate many kernel-doc warnings

Drop the kernel-doc "/**" notation from 8 structs or functions to
prevent 22 kernel-doc warnings  (samples below).

user_namespace.c:239: warning: Function parameter or member 'map_up' not described in 'idmap_key'
user_namespace.c:246: warning: Function parameter or member 'k' not described in 'cmp_map_id'
user_namespace.c:277: warning: Function parameter or member 'extents' not described in 'map_id_range_down_max'
user_namespace.c:295: warning: Function parameter or member 'extents' not described in 'map_id_range_down_base'
user_namespace.c:344: warning: Function parameter or member 'extents' not described in 'map_id_up_base'
user_namespace.c:364: warning: Function parameter or member 'extents' not described in 'map_id_up_max'
user_namespace.c:776: warning: Function parameter or member 'map' not described in 'insert_extent'
user_namespace.c:844: warning: Function parameter or member 'map' not described in 'sort_idmaps'

Fixes: 6397fac4915a ("userns: bump idmap limits to 340")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20230830163215.13193-1-rdunlap@infradead.org
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/user_namespace.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index eabe8bcc7042..625101249e4d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -231,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns)
 }
 EXPORT_SYMBOL(__put_user_ns);
 
-/**
+/*
  * struct idmap_key - holds the information necessary to find an idmapping in a
  * sorted idmap array. It is passed to cmp_map_id() as first argument.
  */
@@ -241,7 +241,7 @@ struct idmap_key {
 	u32 count; /* == 0 unless used with map_id_range_down() */
 };
 
-/**
+/*
  * cmp_map_id - Function to be passed to bsearch() to find the requested
  * idmapping. Expects struct idmap_key to be passed via @k.
  */
@@ -271,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e)
 	return 1;
 }
 
-/**
+/*
  * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
@@ -288,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou
 		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
-/**
+/*
  * map_id_range_down_base - Find idmap via binary search in static extent array.
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -337,7 +337,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
 	return map_id_range_down(map, id, 1);
 }
 
-/**
+/*
  * map_id_up_base - Find idmap via binary search in static extent array.
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -358,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
 	return NULL;
 }
 
-/**
+/*
  * map_id_up_max - Find idmap via binary search in ordered idmap array.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
@@ -770,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 	return false;
 }
 
-/**
+/*
  * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
  * Takes care to allocate a 4K block of memory if the number of mappings exceeds
  * UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -839,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b)
 	return 0;
 }
 
-/**
+/*
  * sort_idmaps - Sorts an array of idmap entries.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-- 
cgit v1.2.3


From 793838138c157d4c49f4fb744b170747e3dabf58 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Sat, 18 Nov 2023 19:33:35 +0100
Subject: prctl: Disable prctl(PR_SET_MDWE) on parisc

systemd-254 tries to use prctl(PR_SET_MDWE) for it's MemoryDenyWriteExecute
functionality, but fails on parisc which still needs executable stacks in
certain combinations of gcc/glibc/kernel.

Disable prctl(PR_SET_MDWE) by returning -EINVAL for now on parisc, until
userspace has catched up.

Signed-off-by: Helge Deller <deller@gmx.de>
Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Sam James <sam@gentoo.org>
Closes: https://github.com/systemd/systemd/issues/29775
Tested-by: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/all/875y2jro9a.fsf@gentoo.org/
Cc: <stable@vger.kernel.org> # v6.3+
---
 kernel/sys.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 420d9cb9cc8e..e219fcfa112d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2394,6 +2394,10 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
 	if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
 		return -EINVAL;
 
+	/* PARISC cannot allow mdwe as it needs writable stacks */
+	if (IS_ENABLED(CONFIG_PARISC))
+		return -EINVAL;
+
 	current_bits = get_current_mdwe();
 	if (current_bits && current_bits != bits)
 		return -EPERM; /* Cannot unset the flags */
-- 
cgit v1.2.3


From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:16 -0800
Subject: bpf: move verbose_linfo() into kernel/bpf/log.c

verifier.c is huge. Let's try to move out parts that are logging-related
into log.c, as we previously did with bpf_log() and other related stuff.
This patch moves line info verbose output routines: it's pretty
self-contained and isolated code, so there is no problem with this.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  4 +++
 kernel/bpf/log.c             | 59 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 57 ------------------------------------------
 3 files changed, 63 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 52a4012b8255..d896f3db6a22 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
 void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
 int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
 
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+				  u32 insn_off,
+				  const char *prefix_fmt, ...);
+
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 850494423530..f20e1449c882 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -10,6 +10,8 @@
 #include <linux/bpf_verifier.h>
 #include <linux/math64.h>
 
+#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args)
+
 static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
 	/* ubuf and len_total should both be specified (or not) together */
@@ -325,3 +327,60 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 	va_end(args);
 }
 EXPORT_SYMBOL_GPL(bpf_log);
+
+static const struct bpf_line_info *
+find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
+{
+	const struct bpf_line_info *linfo;
+	const struct bpf_prog *prog;
+	u32 i, nr_linfo;
+
+	prog = env->prog;
+	nr_linfo = prog->aux->nr_linfo;
+
+	if (!nr_linfo || insn_off >= prog->len)
+		return NULL;
+
+	linfo = prog->aux->linfo;
+	for (i = 1; i < nr_linfo; i++)
+		if (insn_off < linfo[i].insn_off)
+			break;
+
+	return &linfo[i - 1];
+}
+
+static const char *ltrim(const char *s)
+{
+	while (isspace(*s))
+		s++;
+
+	return s;
+}
+
+__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
+				  u32 insn_off,
+				  const char *prefix_fmt, ...)
+{
+	const struct bpf_line_info *linfo;
+
+	if (!bpf_verifier_log_needed(&env->log))
+		return;
+
+	linfo = find_linfo(env, insn_off);
+	if (!linfo || linfo == env->prev_linfo)
+		return;
+
+	if (prefix_fmt) {
+		va_list args;
+
+		va_start(args, prefix_fmt);
+		bpf_verifier_vlog(&env->log, prefix_fmt, args);
+		va_end(args);
+	}
+
+	verbose(env, "%s\n",
+		ltrim(btf_name_by_offset(env->prog->aux->btf,
+					 linfo->line_off)));
+
+	env->prev_linfo = linfo;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7c3461b89513..683fdda25c13 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -337,27 +337,6 @@ struct btf *btf_vmlinux;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
 
-static const struct bpf_line_info *
-find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
-{
-	const struct bpf_line_info *linfo;
-	const struct bpf_prog *prog;
-	u32 i, nr_linfo;
-
-	prog = env->prog;
-	nr_linfo = prog->aux->nr_linfo;
-
-	if (!nr_linfo || insn_off >= prog->len)
-		return NULL;
-
-	linfo = prog->aux->linfo;
-	for (i = 1; i < nr_linfo; i++)
-		if (insn_off < linfo[i].insn_off)
-			break;
-
-	return &linfo[i - 1];
-}
-
 __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 {
 	struct bpf_verifier_env *env = private_data;
@@ -371,42 +350,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
-static const char *ltrim(const char *s)
-{
-	while (isspace(*s))
-		s++;
-
-	return s;
-}
-
-__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env,
-					 u32 insn_off,
-					 const char *prefix_fmt, ...)
-{
-	const struct bpf_line_info *linfo;
-
-	if (!bpf_verifier_log_needed(&env->log))
-		return;
-
-	linfo = find_linfo(env, insn_off);
-	if (!linfo || linfo == env->prev_linfo)
-		return;
-
-	if (prefix_fmt) {
-		va_list args;
-
-		va_start(args, prefix_fmt);
-		bpf_verifier_vlog(&env->log, prefix_fmt, args);
-		va_end(args);
-	}
-
-	verbose(env, "%s\n",
-		ltrim(btf_name_by_offset(env->prog->aux->btf,
-					 linfo->line_off)));
-
-	env->prev_linfo = linfo;
-}
-
 static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 				   struct bpf_reg_state *reg,
 				   struct tnum *range, const char *ctx,
-- 
cgit v1.2.3


From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:17 -0800
Subject: bpf: move verifier state printing code to kernel/bpf/log.c

Move a good chunk of code from verifier.c to log.c: verifier state
verbose printing logic. This is an important and very much
logging/debugging oriented code. It fits the overlall log.c's focus on
verifier logging, and moving it allows to keep growing it without
unnecessarily adding to verifier.c code that otherwise contains a core
verification logic.

There are not many shared dependencies between this code and the rest of
verifier.c code, except a few single-line helpers for various register
type checks and a bit of state "scratching" helpers. We move all such
trivial helpers into include/bpf/bpf_verifier.h as static inlines.

No functional changes in this patch.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  72 ++++++++
 kernel/bpf/log.c             | 342 ++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 403 -------------------------------------------
 3 files changed, 414 insertions(+), 403 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d896f3db6a22..39edc76f436e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type)
 	return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS;
 }
 
+static inline bool type_is_ptr_alloc_obj(u32 type)
+{
+	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
+}
+
+static inline bool type_is_non_owning_ref(u32 type)
+{
+	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
+}
+
+static inline bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	type = base_type(type);
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
+static inline bool type_is_sk_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET ||
+		type == PTR_TO_SOCK_COMMON ||
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
+}
+
+static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+	env->scratched_regs |= 1U << regno;
+}
+
+static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+	env->scratched_stack_slots |= 1ULL << spi;
+}
+
+static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+	return (env->scratched_regs >> regno) & 1;
+}
+
+static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+	return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static inline bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+	return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static inline void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = 0U;
+	env->scratched_stack_slots = 0ULL;
+}
+
+/* Used for printing the entire verifier state. */
+static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+	env->scratched_regs = ~0U;
+	env->scratched_stack_slots = ~0ULL;
+}
+
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type);
+const char *dynptr_type_str(enum bpf_dynptr_type type);
+const char *iter_type_str(const struct btf *btf, u32 btf_id);
+const char *iter_state_str(enum bpf_iter_state state);
+
+void print_verifier_state(struct bpf_verifier_env *env,
+			  const struct bpf_func_state *state, bool print_all);
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state);
+
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index f20e1449c882..c1b257eac21b 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -384,3 +384,345 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env,
 
 	env->prev_linfo = linfo;
 }
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type)
+{
+	char postfix[16] = {0}, prefix[64] = {0};
+	static const char * const str[] = {
+		[NOT_INIT]		= "?",
+		[SCALAR_VALUE]		= "scalar",
+		[PTR_TO_CTX]		= "ctx",
+		[CONST_PTR_TO_MAP]	= "map_ptr",
+		[PTR_TO_MAP_VALUE]	= "map_value",
+		[PTR_TO_STACK]		= "fp",
+		[PTR_TO_PACKET]		= "pkt",
+		[PTR_TO_PACKET_META]	= "pkt_meta",
+		[PTR_TO_PACKET_END]	= "pkt_end",
+		[PTR_TO_FLOW_KEYS]	= "flow_keys",
+		[PTR_TO_SOCKET]		= "sock",
+		[PTR_TO_SOCK_COMMON]	= "sock_common",
+		[PTR_TO_TCP_SOCK]	= "tcp_sock",
+		[PTR_TO_TP_BUFFER]	= "tp_buffer",
+		[PTR_TO_XDP_SOCK]	= "xdp_sock",
+		[PTR_TO_BTF_ID]		= "ptr_",
+		[PTR_TO_MEM]		= "mem",
+		[PTR_TO_BUF]		= "buf",
+		[PTR_TO_FUNC]		= "func",
+		[PTR_TO_MAP_KEY]	= "map_key",
+		[CONST_PTR_TO_DYNPTR]	= "dynptr_ptr",
+	};
+
+	if (type & PTR_MAYBE_NULL) {
+		if (base_type(type) == PTR_TO_BTF_ID)
+			strncpy(postfix, "or_null_", 16);
+		else
+			strncpy(postfix, "_or_null", 16);
+	}
+
+	snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+		 type & MEM_RDONLY ? "rdonly_" : "",
+		 type & MEM_RINGBUF ? "ringbuf_" : "",
+		 type & MEM_USER ? "user_" : "",
+		 type & MEM_PERCPU ? "percpu_" : "",
+		 type & MEM_RCU ? "rcu_" : "",
+		 type & PTR_UNTRUSTED ? "untrusted_" : "",
+		 type & PTR_TRUSTED ? "trusted_" : ""
+	);
+
+	snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
+		 prefix, str[base_type(type)], postfix);
+	return env->tmp_str_buf;
+}
+
+const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+		return "local";
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return "ringbuf";
+	case BPF_DYNPTR_TYPE_SKB:
+		return "skb";
+	case BPF_DYNPTR_TYPE_XDP:
+		return "xdp";
+	case BPF_DYNPTR_TYPE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown dynptr type %d\n", type);
+		return "<unknown>";
+	}
+}
+
+const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+	if (!btf || btf_id == 0)
+		return "<invalid>";
+
+	/* we already validated that type is valid and has conforming name */
+	return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+const char *iter_state_str(enum bpf_iter_state state)
+{
+	switch (state) {
+	case BPF_ITER_STATE_ACTIVE:
+		return "active";
+	case BPF_ITER_STATE_DRAINED:
+		return "drained";
+	case BPF_ITER_STATE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown iter state %d\n", state);
+		return "<unknown>";
+	}
+}
+
+static char slot_type_char[] = {
+	[STACK_INVALID]	= '?',
+	[STACK_SPILL]	= 'r',
+	[STACK_MISC]	= 'm',
+	[STACK_ZERO]	= '0',
+	[STACK_DYNPTR]	= 'd',
+	[STACK_ITER]	= 'i',
+};
+
+static void print_liveness(struct bpf_verifier_env *env,
+			   enum bpf_reg_liveness live)
+{
+	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
+	    verbose(env, "_");
+	if (live & REG_LIVE_READ)
+		verbose(env, "r");
+	if (live & REG_LIVE_WRITTEN)
+		verbose(env, "w");
+	if (live & REG_LIVE_DONE)
+		verbose(env, "D");
+}
+
+static void print_scalar_ranges(struct bpf_verifier_env *env,
+				const struct bpf_reg_state *reg,
+				const char **sep)
+{
+	struct {
+		const char *name;
+		u64 val;
+		bool omit;
+	} minmaxs[] = {
+		{"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
+		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
+		{"umin",   reg->umin_value,         reg->umin_value == 0},
+		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
+		{"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+		{"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
+		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
+	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
+	bool neg1, neg2;
+
+	for (m1 = &minmaxs[0]; m1 < mend; m1++) {
+		if (m1->omit)
+			continue;
+
+		neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
+
+		verbose(env, "%s%s=", *sep, m1->name);
+		*sep = ",";
+
+		for (m2 = m1 + 2; m2 < mend; m2 += 2) {
+			if (m2->omit || m2->val != m1->val)
+				continue;
+			/* don't mix negatives with positives */
+			neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
+			if (neg2 != neg1)
+				continue;
+			m2->omit = true;
+			verbose(env, "%s=", m2->name);
+		}
+
+		verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
+	}
+}
+
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
+			  bool print_all)
+{
+	const struct bpf_reg_state *reg;
+	enum bpf_reg_type t;
+	int i;
+
+	if (state->frameno)
+		verbose(env, " frame%d:", state->frameno);
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		reg = &state->regs[i];
+		t = reg->type;
+		if (t == NOT_INIT)
+			continue;
+		if (!print_all && !reg_scratched(env, i))
+			continue;
+		verbose(env, " R%d", i);
+		print_liveness(env, reg->live);
+		verbose(env, "=");
+		if (t == SCALAR_VALUE && reg->precise)
+			verbose(env, "P");
+		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+		    tnum_is_const(reg->var_off)) {
+			/* reg->off should be 0 for SCALAR_VALUE */
+			verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
+			verbose(env, "%lld", reg->var_off.value + reg->off);
+		} else {
+			const char *sep = "";
+
+			verbose(env, "%s", reg_type_str(env, t));
+			if (base_type(t) == PTR_TO_BTF_ID)
+				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+			verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+			if (reg->id)
+				verbose_a("id=%d", reg->id);
+			if (reg->ref_obj_id)
+				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+			if (type_is_non_owning_ref(reg->type))
+				verbose_a("%s", "non_own_ref");
+			if (t != SCALAR_VALUE)
+				verbose_a("off=%d", reg->off);
+			if (type_is_pkt_pointer(t))
+				verbose_a("r=%d", reg->range);
+			else if (base_type(t) == CONST_PTR_TO_MAP ||
+				 base_type(t) == PTR_TO_MAP_KEY ||
+				 base_type(t) == PTR_TO_MAP_VALUE)
+				verbose_a("ks=%d,vs=%d",
+					  reg->map_ptr->key_size,
+					  reg->map_ptr->value_size);
+			if (tnum_is_const(reg->var_off)) {
+				/* Typically an immediate SCALAR_VALUE, but
+				 * could be a pointer whose offset is too big
+				 * for reg->off
+				 */
+				verbose_a("imm=%llx", reg->var_off.value);
+			} else {
+				print_scalar_ranges(env, reg, &sep);
+				if (!tnum_is_unknown(reg->var_off)) {
+					char tn_buf[48];
+
+					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+					verbose_a("var_off=%s", tn_buf);
+				}
+			}
+#undef verbose_a
+
+			verbose(env, ")");
+		}
+	}
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		char types_buf[BPF_REG_SIZE + 1];
+		bool valid = false;
+		int j;
+
+		for (j = 0; j < BPF_REG_SIZE; j++) {
+			if (state->stack[i].slot_type[j] != STACK_INVALID)
+				valid = true;
+			types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+		}
+		types_buf[BPF_REG_SIZE] = 0;
+		if (!valid)
+			continue;
+		if (!print_all && !stack_slot_scratched(env, i))
+			continue;
+		switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+		case STACK_SPILL:
+			reg = &state->stack[i].spilled_ptr;
+			t = reg->type;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
+			if (t == SCALAR_VALUE && reg->precise)
+				verbose(env, "P");
+			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+				verbose(env, "%lld", reg->var_off.value + reg->off);
+			break;
+		case STACK_DYNPTR:
+			i += BPF_DYNPTR_NR_SLOTS - 1;
+			reg = &state->stack[i].spilled_ptr;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+			if (reg->ref_obj_id)
+				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+			break;
+		case STACK_ITER:
+			/* only main slot has ref_obj_id set; skip others */
+			reg = &state->stack[i].spilled_ptr;
+			if (!reg->ref_obj_id)
+				continue;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+				iter_type_str(reg->iter.btf, reg->iter.btf_id),
+				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->iter.depth);
+			break;
+		case STACK_MISC:
+		case STACK_ZERO:
+		default:
+			reg = &state->stack[i].spilled_ptr;
+
+			for (j = 0; j < BPF_REG_SIZE; j++)
+				types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+			types_buf[BPF_REG_SIZE] = 0;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=%s", types_buf);
+			break;
+		}
+	}
+	if (state->acquired_refs && state->refs[0].id) {
+		verbose(env, " refs=%d", state->refs[0].id);
+		for (i = 1; i < state->acquired_refs; i++)
+			if (state->refs[i].id)
+				verbose(env, ",%d", state->refs[i].id);
+	}
+	if (state->in_callback_fn)
+		verbose(env, " cb");
+	if (state->in_async_callback_fn)
+		verbose(env, " async_cb");
+	verbose(env, "\n");
+	if (!print_all)
+		mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+	return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+			BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+{
+	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
+		/* remove new line character */
+		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
+	} else {
+		verbose(env, "%d:", env->insn_idx);
+	}
+	print_verifier_state(env, state, false);
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 683fdda25c13..8c2d31aa3d31 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -368,21 +368,6 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 	verbose(env, " should have been in %s\n", tn_buf);
 }
 
-static bool type_is_pkt_pointer(enum bpf_reg_type type)
-{
-	type = base_type(type);
-	return type == PTR_TO_PACKET ||
-	       type == PTR_TO_PACKET_META;
-}
-
-static bool type_is_sk_pointer(enum bpf_reg_type type)
-{
-	return type == PTR_TO_SOCKET ||
-		type == PTR_TO_SOCK_COMMON ||
-		type == PTR_TO_TCP_SOCK ||
-		type == PTR_TO_XDP_SOCK;
-}
-
 static bool type_may_be_null(u32 type)
 {
 	return type & PTR_MAYBE_NULL;
@@ -406,16 +391,6 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
 		type == PTR_TO_MEM;
 }
 
-static bool type_is_ptr_alloc_obj(u32 type)
-{
-	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
-}
-
-static bool type_is_non_owning_ref(u32 type)
-{
-	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF;
-}
-
 static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
 {
 	struct btf_record *rec = NULL;
@@ -532,83 +507,6 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
 	       insn->imm == BPF_CMPXCHG;
 }
 
-/* string representation of 'enum bpf_reg_type'
- *
- * Note that reg_type_str() can not appear more than once in a single verbose()
- * statement.
- */
-static const char *reg_type_str(struct bpf_verifier_env *env,
-				enum bpf_reg_type type)
-{
-	char postfix[16] = {0}, prefix[64] = {0};
-	static const char * const str[] = {
-		[NOT_INIT]		= "?",
-		[SCALAR_VALUE]		= "scalar",
-		[PTR_TO_CTX]		= "ctx",
-		[CONST_PTR_TO_MAP]	= "map_ptr",
-		[PTR_TO_MAP_VALUE]	= "map_value",
-		[PTR_TO_STACK]		= "fp",
-		[PTR_TO_PACKET]		= "pkt",
-		[PTR_TO_PACKET_META]	= "pkt_meta",
-		[PTR_TO_PACKET_END]	= "pkt_end",
-		[PTR_TO_FLOW_KEYS]	= "flow_keys",
-		[PTR_TO_SOCKET]		= "sock",
-		[PTR_TO_SOCK_COMMON]	= "sock_common",
-		[PTR_TO_TCP_SOCK]	= "tcp_sock",
-		[PTR_TO_TP_BUFFER]	= "tp_buffer",
-		[PTR_TO_XDP_SOCK]	= "xdp_sock",
-		[PTR_TO_BTF_ID]		= "ptr_",
-		[PTR_TO_MEM]		= "mem",
-		[PTR_TO_BUF]		= "buf",
-		[PTR_TO_FUNC]		= "func",
-		[PTR_TO_MAP_KEY]	= "map_key",
-		[CONST_PTR_TO_DYNPTR]	= "dynptr_ptr",
-	};
-
-	if (type & PTR_MAYBE_NULL) {
-		if (base_type(type) == PTR_TO_BTF_ID)
-			strncpy(postfix, "or_null_", 16);
-		else
-			strncpy(postfix, "_or_null", 16);
-	}
-
-	snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
-		 type & MEM_RDONLY ? "rdonly_" : "",
-		 type & MEM_RINGBUF ? "ringbuf_" : "",
-		 type & MEM_USER ? "user_" : "",
-		 type & MEM_PERCPU ? "percpu_" : "",
-		 type & MEM_RCU ? "rcu_" : "",
-		 type & PTR_UNTRUSTED ? "untrusted_" : "",
-		 type & PTR_TRUSTED ? "trusted_" : ""
-	);
-
-	snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s",
-		 prefix, str[base_type(type)], postfix);
-	return env->tmp_str_buf;
-}
-
-static char slot_type_char[] = {
-	[STACK_INVALID]	= '?',
-	[STACK_SPILL]	= 'r',
-	[STACK_MISC]	= 'm',
-	[STACK_ZERO]	= '0',
-	[STACK_DYNPTR]	= 'd',
-	[STACK_ITER]	= 'i',
-};
-
-static void print_liveness(struct bpf_verifier_env *env,
-			   enum bpf_reg_liveness live)
-{
-	if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE))
-	    verbose(env, "_");
-	if (live & REG_LIVE_READ)
-		verbose(env, "r");
-	if (live & REG_LIVE_WRITTEN)
-		verbose(env, "w");
-	if (live & REG_LIVE_DONE)
-		verbose(env, "D");
-}
-
 static int __get_spi(s32 off)
 {
 	return (-off - 1) / BPF_REG_SIZE;
@@ -678,87 +576,6 @@ static const char *btf_type_name(const struct btf *btf, u32 id)
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
 }
 
-static const char *dynptr_type_str(enum bpf_dynptr_type type)
-{
-	switch (type) {
-	case BPF_DYNPTR_TYPE_LOCAL:
-		return "local";
-	case BPF_DYNPTR_TYPE_RINGBUF:
-		return "ringbuf";
-	case BPF_DYNPTR_TYPE_SKB:
-		return "skb";
-	case BPF_DYNPTR_TYPE_XDP:
-		return "xdp";
-	case BPF_DYNPTR_TYPE_INVALID:
-		return "<invalid>";
-	default:
-		WARN_ONCE(1, "unknown dynptr type %d\n", type);
-		return "<unknown>";
-	}
-}
-
-static const char *iter_type_str(const struct btf *btf, u32 btf_id)
-{
-	if (!btf || btf_id == 0)
-		return "<invalid>";
-
-	/* we already validated that type is valid and has conforming name */
-	return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
-}
-
-static const char *iter_state_str(enum bpf_iter_state state)
-{
-	switch (state) {
-	case BPF_ITER_STATE_ACTIVE:
-		return "active";
-	case BPF_ITER_STATE_DRAINED:
-		return "drained";
-	case BPF_ITER_STATE_INVALID:
-		return "<invalid>";
-	default:
-		WARN_ONCE(1, "unknown iter state %d\n", state);
-		return "<unknown>";
-	}
-}
-
-static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
-{
-	env->scratched_regs |= 1U << regno;
-}
-
-static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
-{
-	env->scratched_stack_slots |= 1ULL << spi;
-}
-
-static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
-{
-	return (env->scratched_regs >> regno) & 1;
-}
-
-static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
-{
-	return (env->scratched_stack_slots >> regno) & 1;
-}
-
-static bool verifier_state_scratched(const struct bpf_verifier_env *env)
-{
-	return env->scratched_regs || env->scratched_stack_slots;
-}
-
-static void mark_verifier_state_clean(struct bpf_verifier_env *env)
-{
-	env->scratched_regs = 0U;
-	env->scratched_stack_slots = 0ULL;
-}
-
-/* Used for printing the entire verifier state. */
-static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
-{
-	env->scratched_regs = ~0U;
-	env->scratched_stack_slots = ~0ULL;
-}
-
 static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 {
 	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -1298,226 +1115,6 @@ static void scrub_spilled_slot(u8 *stype)
 		*stype = STACK_MISC;
 }
 
-static void print_scalar_ranges(struct bpf_verifier_env *env,
-				const struct bpf_reg_state *reg,
-				const char **sep)
-{
-	struct {
-		const char *name;
-		u64 val;
-		bool omit;
-	} minmaxs[] = {
-		{"smin",   reg->smin_value,         reg->smin_value == S64_MIN},
-		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
-		{"umin",   reg->umin_value,         reg->umin_value == 0},
-		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
-		{"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
-		{"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
-		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
-		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
-	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
-	bool neg1, neg2;
-
-	for (m1 = &minmaxs[0]; m1 < mend; m1++) {
-		if (m1->omit)
-			continue;
-
-		neg1 = m1->name[0] == 's' && (s64)m1->val < 0;
-
-		verbose(env, "%s%s=", *sep, m1->name);
-		*sep = ",";
-
-		for (m2 = m1 + 2; m2 < mend; m2 += 2) {
-			if (m2->omit || m2->val != m1->val)
-				continue;
-			/* don't mix negatives with positives */
-			neg2 = m2->name[0] == 's' && (s64)m2->val < 0;
-			if (neg2 != neg1)
-				continue;
-			m2->omit = true;
-			verbose(env, "%s=", m2->name);
-		}
-
-		verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
-	}
-}
-
-static void print_verifier_state(struct bpf_verifier_env *env,
-				 const struct bpf_func_state *state,
-				 bool print_all)
-{
-	const struct bpf_reg_state *reg;
-	enum bpf_reg_type t;
-	int i;
-
-	if (state->frameno)
-		verbose(env, " frame%d:", state->frameno);
-	for (i = 0; i < MAX_BPF_REG; i++) {
-		reg = &state->regs[i];
-		t = reg->type;
-		if (t == NOT_INIT)
-			continue;
-		if (!print_all && !reg_scratched(env, i))
-			continue;
-		verbose(env, " R%d", i);
-		print_liveness(env, reg->live);
-		verbose(env, "=");
-		if (t == SCALAR_VALUE && reg->precise)
-			verbose(env, "P");
-		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
-		    tnum_is_const(reg->var_off)) {
-			/* reg->off should be 0 for SCALAR_VALUE */
-			verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-			verbose(env, "%lld", reg->var_off.value + reg->off);
-		} else {
-			const char *sep = "";
-
-			verbose(env, "%s", reg_type_str(env, t));
-			if (base_type(t) == PTR_TO_BTF_ID)
-				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
-			verbose(env, "(");
-/*
- * _a stands for append, was shortened to avoid multiline statements below.
- * This macro is used to output a comma separated list of attributes.
- */
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
-
-			if (reg->id)
-				verbose_a("id=%d", reg->id);
-			if (reg->ref_obj_id)
-				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
-			if (type_is_non_owning_ref(reg->type))
-				verbose_a("%s", "non_own_ref");
-			if (t != SCALAR_VALUE)
-				verbose_a("off=%d", reg->off);
-			if (type_is_pkt_pointer(t))
-				verbose_a("r=%d", reg->range);
-			else if (base_type(t) == CONST_PTR_TO_MAP ||
-				 base_type(t) == PTR_TO_MAP_KEY ||
-				 base_type(t) == PTR_TO_MAP_VALUE)
-				verbose_a("ks=%d,vs=%d",
-					  reg->map_ptr->key_size,
-					  reg->map_ptr->value_size);
-			if (tnum_is_const(reg->var_off)) {
-				/* Typically an immediate SCALAR_VALUE, but
-				 * could be a pointer whose offset is too big
-				 * for reg->off
-				 */
-				verbose_a("imm=%llx", reg->var_off.value);
-			} else {
-				print_scalar_ranges(env, reg, &sep);
-				if (!tnum_is_unknown(reg->var_off)) {
-					char tn_buf[48];
-
-					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose_a("var_off=%s", tn_buf);
-				}
-			}
-#undef verbose_a
-
-			verbose(env, ")");
-		}
-	}
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		char types_buf[BPF_REG_SIZE + 1];
-		bool valid = false;
-		int j;
-
-		for (j = 0; j < BPF_REG_SIZE; j++) {
-			if (state->stack[i].slot_type[j] != STACK_INVALID)
-				valid = true;
-			types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
-		}
-		types_buf[BPF_REG_SIZE] = 0;
-		if (!valid)
-			continue;
-		if (!print_all && !stack_slot_scratched(env, i))
-			continue;
-		switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
-		case STACK_SPILL:
-			reg = &state->stack[i].spilled_ptr;
-			t = reg->type;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-			if (t == SCALAR_VALUE && reg->precise)
-				verbose(env, "P");
-			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
-				verbose(env, "%lld", reg->var_off.value + reg->off);
-			break;
-		case STACK_DYNPTR:
-			i += BPF_DYNPTR_NR_SLOTS - 1;
-			reg = &state->stack[i].spilled_ptr;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
-			if (reg->ref_obj_id)
-				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
-			break;
-		case STACK_ITER:
-			/* only main slot has ref_obj_id set; skip others */
-			reg = &state->stack[i].spilled_ptr;
-			if (!reg->ref_obj_id)
-				continue;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
-				iter_type_str(reg->iter.btf, reg->iter.btf_id),
-				reg->ref_obj_id, iter_state_str(reg->iter.state),
-				reg->iter.depth);
-			break;
-		case STACK_MISC:
-		case STACK_ZERO:
-		default:
-			reg = &state->stack[i].spilled_ptr;
-
-			for (j = 0; j < BPF_REG_SIZE; j++)
-				types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
-			types_buf[BPF_REG_SIZE] = 0;
-
-			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
-			print_liveness(env, reg->live);
-			verbose(env, "=%s", types_buf);
-			break;
-		}
-	}
-	if (state->acquired_refs && state->refs[0].id) {
-		verbose(env, " refs=%d", state->refs[0].id);
-		for (i = 1; i < state->acquired_refs; i++)
-			if (state->refs[i].id)
-				verbose(env, ",%d", state->refs[i].id);
-	}
-	if (state->in_callback_fn)
-		verbose(env, " cb");
-	if (state->in_async_callback_fn)
-		verbose(env, " async_cb");
-	verbose(env, "\n");
-	if (!print_all)
-		mark_verifier_state_clean(env);
-}
-
-static inline u32 vlog_alignment(u32 pos)
-{
-	return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
-			BPF_LOG_MIN_ALIGNMENT) - pos - 1;
-}
-
-static void print_insn_state(struct bpf_verifier_env *env,
-			     const struct bpf_func_state *state)
-{
-	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
-		/* remove new line character */
-		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
-		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
-	} else {
-		verbose(env, "%d:", env->insn_idx);
-	}
-	print_verifier_state(env, state, false);
-}
-
 /* copy array src of length n * size bytes to dst. dst is reallocated if it's too
  * small to hold src. This is different from krealloc since we don't want to preserve
  * the contents of dst.
-- 
cgit v1.2.3


From 009f5465be3636e9ce795cfbd5d3109d8978774d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:18 -0800
Subject: bpf: extract register state printing

Extract printing register state representation logic into a separate
helper, as we are going to reuse it for spilled register state printing
in the next patch. This also nicely reduces code nestedness.

No functional changes.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c | 120 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 63 insertions(+), 57 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index c1b257eac21b..05d737e2fab3 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -553,6 +553,67 @@ static void print_scalar_ranges(struct bpf_verifier_env *env,
 	}
 }
 
+static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
+{
+	enum bpf_reg_type t;
+	const char *sep = "";
+
+	t = reg->type;
+	if (t == SCALAR_VALUE && reg->precise)
+		verbose(env, "P");
+	if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+	    tnum_is_const(reg->var_off)) {
+		/* reg->off should be 0 for SCALAR_VALUE */
+		verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
+		verbose(env, "%lld", reg->var_off.value + reg->off);
+		return;
+	}
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+	verbose(env, "%s", reg_type_str(env, t));
+	if (base_type(t) == PTR_TO_BTF_ID)
+		verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
+	verbose(env, "(");
+	if (reg->id)
+		verbose_a("id=%d", reg->id);
+	if (reg->ref_obj_id)
+		verbose_a("ref_obj_id=%d", reg->ref_obj_id);
+	if (type_is_non_owning_ref(reg->type))
+		verbose_a("%s", "non_own_ref");
+	if (t != SCALAR_VALUE)
+		verbose_a("off=%d", reg->off);
+	if (type_is_pkt_pointer(t))
+		verbose_a("r=%d", reg->range);
+	else if (base_type(t) == CONST_PTR_TO_MAP ||
+		 base_type(t) == PTR_TO_MAP_KEY ||
+		 base_type(t) == PTR_TO_MAP_VALUE)
+		verbose_a("ks=%d,vs=%d",
+			  reg->map_ptr->key_size,
+			  reg->map_ptr->value_size);
+	if (tnum_is_const(reg->var_off)) {
+		/* Typically an immediate SCALAR_VALUE, but
+		 * could be a pointer whose offset is too big
+		 * for reg->off
+		 */
+		verbose_a("imm=%llx", reg->var_off.value);
+	} else {
+		print_scalar_ranges(env, reg, &sep);
+		if (!tnum_is_unknown(reg->var_off)) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose_a("var_off=%s", tn_buf);
+		}
+	}
+	verbose(env, ")");
+
+#undef verbose_a
+}
+
 void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
 			  bool print_all)
 {
@@ -564,69 +625,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 		verbose(env, " frame%d:", state->frameno);
 	for (i = 0; i < MAX_BPF_REG; i++) {
 		reg = &state->regs[i];
-		t = reg->type;
-		if (t == NOT_INIT)
+		if (reg->type == NOT_INIT)
 			continue;
 		if (!print_all && !reg_scratched(env, i))
 			continue;
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=");
-		if (t == SCALAR_VALUE && reg->precise)
-			verbose(env, "P");
-		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
-		    tnum_is_const(reg->var_off)) {
-			/* reg->off should be 0 for SCALAR_VALUE */
-			verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-			verbose(env, "%lld", reg->var_off.value + reg->off);
-		} else {
-			const char *sep = "";
-
-			verbose(env, "%s", reg_type_str(env, t));
-			if (base_type(t) == PTR_TO_BTF_ID)
-				verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
-			verbose(env, "(");
-/*
- * _a stands for append, was shortened to avoid multiline statements below.
- * This macro is used to output a comma separated list of attributes.
- */
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
-
-			if (reg->id)
-				verbose_a("id=%d", reg->id);
-			if (reg->ref_obj_id)
-				verbose_a("ref_obj_id=%d", reg->ref_obj_id);
-			if (type_is_non_owning_ref(reg->type))
-				verbose_a("%s", "non_own_ref");
-			if (t != SCALAR_VALUE)
-				verbose_a("off=%d", reg->off);
-			if (type_is_pkt_pointer(t))
-				verbose_a("r=%d", reg->range);
-			else if (base_type(t) == CONST_PTR_TO_MAP ||
-				 base_type(t) == PTR_TO_MAP_KEY ||
-				 base_type(t) == PTR_TO_MAP_VALUE)
-				verbose_a("ks=%d,vs=%d",
-					  reg->map_ptr->key_size,
-					  reg->map_ptr->value_size);
-			if (tnum_is_const(reg->var_off)) {
-				/* Typically an immediate SCALAR_VALUE, but
-				 * could be a pointer whose offset is too big
-				 * for reg->off
-				 */
-				verbose_a("imm=%llx", reg->var_off.value);
-			} else {
-				print_scalar_ranges(env, reg, &sep);
-				if (!tnum_is_unknown(reg->var_off)) {
-					char tn_buf[48];
-
-					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose_a("var_off=%s", tn_buf);
-				}
-			}
-#undef verbose_a
-
-			verbose(env, ")");
-		}
+		print_reg_state(env, reg);
 	}
 	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
 		char types_buf[BPF_REG_SIZE + 1];
-- 
cgit v1.2.3


From 67d43dfbb42d6575304daea67733c88fbf536a1c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:19 -0800
Subject: bpf: print spilled register state in stack slot

Print the same register state representation when printing stack state,
as we do for normal registers. Note that if stack slot contains
subregister spill (1, 2, or 4 byte long), we'll still emit "m0?" mask
for those bytes that are not part of spilled register.

While means we can get something like fp-8=0000scalar() for a 4-byte
spill with other 4 bytes still being STACK_ZERO.

Some example before and after, taken from the log of
pyperf_subprogs.bpf.o:

49: (7b) *(u64 *)(r10 -256) = r1      ; frame1: R1_w=ctx(off=0,imm=0) R10=fp0 fp-256_w=ctx
49: (7b) *(u64 *)(r10 -256) = r1      ; frame1: R1_w=ctx(off=0,imm=0) R10=fp0 fp-256_w=ctx(off=0,imm=0)

150: (7b) *(u64 *)(r10 -264) = r0     ; frame1: R0_w=map_value_or_null(id=6,off=0,ks=192,vs=4,imm=0) R10=fp0 fp-264_w=map_value_or_null
150: (7b) *(u64 *)(r10 -264) = r0     ; frame1: R0_w=map_value_or_null(id=6,off=0,ks=192,vs=4,imm=0) R10=fp0 fp-264_w=map_value_or_null(id=6,off=0,ks=192,vs=4,imm=0)

5192: (61) r1 = *(u32 *)(r10 -272)    ; frame1: R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) R10=fp0 fp-272=
5192: (61) r1 = *(u32 *)(r10 -272)    ; frame1: R1_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf)) R10=fp0 fp-272=????scalar(smin=smin32=0,smax=umax=smax32=umax32=15,var_off=(0x0; 0xf))

While at it, do a few other simple clean ups:
  - skip slot if it's not scratched before detecting whether it's valid;
  - move taking spilled_reg pointer outside of switch (only DYNPTR has
    to adjust that to get to the "main" slot);
  - don't recalculate types_buf second time for MISC/ZERO/default case.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 05d737e2fab3..97a1641e848e 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -618,7 +618,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 			  bool print_all)
 {
 	const struct bpf_reg_state *reg;
-	enum bpf_reg_type t;
 	int i;
 
 	if (state->frameno)
@@ -637,32 +636,38 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
 		char types_buf[BPF_REG_SIZE + 1];
 		bool valid = false;
+		u8 slot_type;
 		int j;
 
+		if (!print_all && !stack_slot_scratched(env, i))
+			continue;
+
 		for (j = 0; j < BPF_REG_SIZE; j++) {
-			if (state->stack[i].slot_type[j] != STACK_INVALID)
+			slot_type = state->stack[i].slot_type[j];
+			if (slot_type != STACK_INVALID)
 				valid = true;
-			types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+			types_buf[j] = slot_type_char[slot_type];
 		}
 		types_buf[BPF_REG_SIZE] = 0;
 		if (!valid)
 			continue;
-		if (!print_all && !stack_slot_scratched(env, i))
-			continue;
+
+		reg = &state->stack[i].spilled_ptr;
 		switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
 		case STACK_SPILL:
-			reg = &state->stack[i].spilled_ptr;
-			t = reg->type;
+			/* print MISC/ZERO/INVALID slots above subreg spill */
+			for (j = 0; j < BPF_REG_SIZE; j++)
+				if (state->stack[i].slot_type[j] == STACK_SPILL)
+					break;
+			types_buf[j] = '\0';
 
 			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 			print_liveness(env, reg->live);
-			verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-			if (t == SCALAR_VALUE && reg->precise)
-				verbose(env, "P");
-			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
-				verbose(env, "%lld", reg->var_off.value + reg->off);
+			verbose(env, "=%s", types_buf);
+			print_reg_state(env, reg);
 			break;
 		case STACK_DYNPTR:
+			/* skip to main dynptr slot */
 			i += BPF_DYNPTR_NR_SLOTS - 1;
 			reg = &state->stack[i].spilled_ptr;
 
@@ -674,7 +679,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 			break;
 		case STACK_ITER:
 			/* only main slot has ref_obj_id set; skip others */
-			reg = &state->stack[i].spilled_ptr;
 			if (!reg->ref_obj_id)
 				continue;
 
@@ -688,12 +692,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 		case STACK_MISC:
 		case STACK_ZERO:
 		default:
-			reg = &state->stack[i].spilled_ptr;
-
-			for (j = 0; j < BPF_REG_SIZE; j++)
-				types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
-			types_buf[BPF_REG_SIZE] = 0;
-
 			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 			print_liveness(env, reg->live);
 			verbose(env, "=%s", types_buf);
-- 
cgit v1.2.3


From 0c95c9fdb696f35c7864785ba84cb9a50152daff Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:20 -0800
Subject: bpf: emit map name in register state if applicable and available

In complicated real-world applications, whenever debugging some
verification error through verifier log, it often would be very useful
to see map name for PTR_TO_MAP_VALUE register. Usually this needs to be
inferred from key/value sizes and maybe trying to guess C code location,
but it's not always clear.

Given verifier has the name, and it's never too long, let's just emit it
for ptr_to_map_key, ptr_to_map_value, and const_ptr_to_map registers. We
reshuffle the order a bit, so that map name, key size, and value size
appear before offset and immediate values, which seems like a more
logical order.

Current output:

  R1_w=map_ptr(map=array_map,ks=4,vs=8,off=0,imm=0)

But we'll get rid of useless off=0 and imm=0 parts in the next patch.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c                                   | 24 ++++++++++++++++------
 tools/testing/selftests/bpf/prog_tests/spin_lock.c | 10 ++++-----
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 97a1641e848e..c209ab1ec2b5 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -553,6 +553,17 @@ static void print_scalar_ranges(struct bpf_verifier_env *env,
 	}
 }
 
+static bool type_is_map_ptr(enum bpf_reg_type t) {
+	switch (base_type(t)) {
+	case CONST_PTR_TO_MAP:
+	case PTR_TO_MAP_KEY:
+	case PTR_TO_MAP_VALUE:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
 {
 	enum bpf_reg_type t;
@@ -584,16 +595,17 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s
 		verbose_a("ref_obj_id=%d", reg->ref_obj_id);
 	if (type_is_non_owning_ref(reg->type))
 		verbose_a("%s", "non_own_ref");
+	if (type_is_map_ptr(t)) {
+		if (reg->map_ptr->name[0])
+			verbose_a("map=%s", reg->map_ptr->name);
+		verbose_a("ks=%d,vs=%d",
+			  reg->map_ptr->key_size,
+			  reg->map_ptr->value_size);
+	}
 	if (t != SCALAR_VALUE)
 		verbose_a("off=%d", reg->off);
 	if (type_is_pkt_pointer(t))
 		verbose_a("r=%d", reg->range);
-	else if (base_type(t) == CONST_PTR_TO_MAP ||
-		 base_type(t) == PTR_TO_MAP_KEY ||
-		 base_type(t) == PTR_TO_MAP_VALUE)
-		verbose_a("ks=%d,vs=%d",
-			  reg->map_ptr->key_size,
-			  reg->map_ptr->value_size);
 	if (tnum_is_const(reg->var_off)) {
 		/* Typically an immediate SCALAR_VALUE, but
 		 * could be a pointer whose offset is too big
diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c
index f29c08d93beb..ace65224286f 100644
--- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c
@@ -17,18 +17,18 @@ static struct {
 	  "R1_w=ptr_foo(id=2,ref_obj_id=2,off=0,imm=0) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=ptr_ expected=percpu_ptr_" },
 	{ "lock_id_global_zero",
-	  "; R1_w=map_value(off=0,ks=4,vs=4,imm=0)\n2: (85) call bpf_this_cpu_ptr#154\n"
+	  "; R1_w=map_value(map=.data.A,ks=4,vs=4,off=0,imm=0)\n2: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=map_value expected=percpu_ptr_" },
 	{ "lock_id_mapval_preserve",
 	  "[0-9]\\+: (bf) r1 = r0                       ;"
-	  " R0_w=map_value(id=1,off=0,ks=4,vs=8,imm=0)"
-	  " R1_w=map_value(id=1,off=0,ks=4,vs=8,imm=0)\n"
+	  " R0_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)"
+	  " R1_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)\n"
 	  "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=map_value expected=percpu_ptr_" },
 	{ "lock_id_innermapval_preserve",
 	  "[0-9]\\+: (bf) r1 = r0                      ;"
-	  " R0=map_value(id=2,off=0,ks=4,vs=8,imm=0)"
-	  " R1_w=map_value(id=2,off=0,ks=4,vs=8,imm=0)\n"
+	  " R0=map_value(id=2,ks=4,vs=8,off=0,imm=0)"
+	  " R1_w=map_value(id=2,ks=4,vs=8,off=0,imm=0)\n"
 	  "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=map_value expected=percpu_ptr_" },
 	{ "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" },
-- 
cgit v1.2.3


From 1db747d75b1dbe17bf4283ed87bd3b7a92010f34 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:21 -0800
Subject: bpf: omit default off=0 and imm=0 in register state log

Simplify BPF verifier log further by omitting default (and frequently
irrelevant) off=0 and imm=0 parts for non-SCALAR_VALUE registers. As can
be seen from fixed tests, this is often a visual noise for PTR_TO_CTX
register and even for PTR_TO_PACKET registers.

Omitting default values follows the rest of register state logic: we
omit default values to keep verifier log succinct and to highlight
interesting state that deviates from default one. E.g., we do the same
for var_off, when it's unknown, which gives no additional information.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c                                   | 10 +++---
 tools/testing/selftests/bpf/prog_tests/align.c     | 42 +++++++++++-----------
 tools/testing/selftests/bpf/prog_tests/log_buf.c   |  4 +--
 tools/testing/selftests/bpf/prog_tests/spin_lock.c | 14 ++++----
 .../selftests/bpf/progs/exceptions_assert.c        | 10 +++---
 5 files changed, 39 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index c209ab1ec2b5..20b4f81087da 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -602,16 +602,14 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s
 			  reg->map_ptr->key_size,
 			  reg->map_ptr->value_size);
 	}
-	if (t != SCALAR_VALUE)
+	if (t != SCALAR_VALUE && reg->off)
 		verbose_a("off=%d", reg->off);
 	if (type_is_pkt_pointer(t))
 		verbose_a("r=%d", reg->range);
 	if (tnum_is_const(reg->var_off)) {
-		/* Typically an immediate SCALAR_VALUE, but
-		 * could be a pointer whose offset is too big
-		 * for reg->off
-		 */
-		verbose_a("imm=%llx", reg->var_off.value);
+		/* a pointer register with fixed offset */
+		if (reg->var_off.value)
+			verbose_a("imm=%llx", reg->var_off.value);
 	} else {
 		print_scalar_ranges(env, reg, &sep);
 		if (!tnum_is_unknown(reg->var_off)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
index 465c1c3a3d3c..4ebd0da898f5 100644
--- a/tools/testing/selftests/bpf/prog_tests/align.c
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -40,7 +40,7 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{0, "R1", "ctx(off=0,imm=0)"},
+			{0, "R1", "ctx()"},
 			{0, "R10", "fp0"},
 			{0, "R3_w", "2"},
 			{1, "R3_w", "4"},
@@ -68,7 +68,7 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{0, "R1", "ctx(off=0,imm=0)"},
+			{0, "R1", "ctx()"},
 			{0, "R10", "fp0"},
 			{0, "R3_w", "1"},
 			{1, "R3_w", "2"},
@@ -97,7 +97,7 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{0, "R1", "ctx(off=0,imm=0)"},
+			{0, "R1", "ctx()"},
 			{0, "R10", "fp0"},
 			{0, "R3_w", "4"},
 			{1, "R3_w", "8"},
@@ -119,7 +119,7 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{0, "R1", "ctx(off=0,imm=0)"},
+			{0, "R1", "ctx()"},
 			{0, "R10", "fp0"},
 			{0, "R3_w", "7"},
 			{1, "R3_w", "7"},
@@ -162,13 +162,13 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{6, "R0_w", "pkt(off=8,r=8,imm=0)"},
+			{6, "R0_w", "pkt(off=8,r=8)"},
 			{6, "R3_w", "var_off=(0x0; 0xff)"},
 			{7, "R3_w", "var_off=(0x0; 0x1fe)"},
 			{8, "R3_w", "var_off=(0x0; 0x3fc)"},
 			{9, "R3_w", "var_off=(0x0; 0x7f8)"},
 			{10, "R3_w", "var_off=(0x0; 0xff0)"},
-			{12, "R3_w", "pkt_end(off=0,imm=0)"},
+			{12, "R3_w", "pkt_end()"},
 			{17, "R4_w", "var_off=(0x0; 0xff)"},
 			{18, "R4_w", "var_off=(0x0; 0x1fe0)"},
 			{19, "R4_w", "var_off=(0x0; 0xff0)"},
@@ -235,11 +235,11 @@ static struct bpf_align_test tests[] = {
 		},
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.matches = {
-			{2, "R5_w", "pkt(off=0,r=0,imm=0)"},
-			{4, "R5_w", "pkt(off=14,r=0,imm=0)"},
-			{5, "R4_w", "pkt(off=14,r=0,imm=0)"},
-			{9, "R2", "pkt(off=0,r=18,imm=0)"},
-			{10, "R5", "pkt(off=14,r=18,imm=0)"},
+			{2, "R5_w", "pkt(r=0)"},
+			{4, "R5_w", "pkt(off=14,r=0)"},
+			{5, "R4_w", "pkt(off=14,r=0)"},
+			{9, "R2", "pkt(r=18)"},
+			{10, "R5", "pkt(off=14,r=18)"},
 			{10, "R4_w", "var_off=(0x0; 0xff)"},
 			{13, "R4_w", "var_off=(0x0; 0xffff)"},
 			{14, "R4_w", "var_off=(0x0; 0xffff)"},
@@ -299,7 +299,7 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{6, "R2_w", "pkt(off=0,r=8,imm=0)"},
+			{6, "R2_w", "pkt(r=8)"},
 			{7, "R6_w", "var_off=(0x0; 0x3fc)"},
 			/* Offset is added to packet pointer R5, resulting in
 			 * known fixed offset, and variable offset from R6.
@@ -337,7 +337,7 @@ static struct bpf_align_test tests[] = {
 			/* Constant offset is added to R5 packet pointer,
 			 * resulting in reg->off value of 14.
 			 */
-			{26, "R5_w", "pkt(off=14,r=8,"},
+			{26, "R5_w", "pkt(off=14,r=8)"},
 			/* Variable offset is added to R5, resulting in a
 			 * variable offset of (4n). See comment for insn #18
 			 * for R4 = R5 trick.
@@ -397,7 +397,7 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{6, "R2_w", "pkt(off=0,r=8,imm=0)"},
+			{6, "R2_w", "pkt(r=8)"},
 			{7, "R6_w", "var_off=(0x0; 0x3fc)"},
 			/* Adding 14 makes R6 be (4n+2) */
 			{8, "R6_w", "var_off=(0x2; 0x7fc)"},
@@ -459,7 +459,7 @@ static struct bpf_align_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = REJECT,
 		.matches = {
-			{3, "R5_w", "pkt_end(off=0,imm=0)"},
+			{3, "R5_w", "pkt_end()"},
 			/* (ptr - ptr) << 2 == unknown, (4n) */
 			{5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"},
 			/* (4n) + 14 == (4n+2).  We blow our bounds, because
@@ -513,7 +513,7 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{6, "R2_w", "pkt(off=0,r=8,imm=0)"},
+			{6, "R2_w", "pkt(r=8)"},
 			{8, "R6_w", "var_off=(0x0; 0x3fc)"},
 			/* Adding 14 makes R6 be (4n+2) */
 			{9, "R6_w", "var_off=(0x2; 0x7fc)"},
@@ -566,7 +566,7 @@ static struct bpf_align_test tests[] = {
 			/* Calculated offset in R6 has unknown value, but known
 			 * alignment of 4.
 			 */
-			{6, "R2_w", "pkt(off=0,r=8,imm=0)"},
+			{6, "R2_w", "pkt(r=8)"},
 			{9, "R6_w", "var_off=(0x0; 0x3c)"},
 			/* Adding 14 makes R6 be (4n+2) */
 			{10, "R6_w", "var_off=(0x2; 0x7c)"},
@@ -659,14 +659,14 @@ static int do_test_single(struct bpf_align_test *test)
 			/* Check the next line as well in case the previous line
 			 * did not have a corresponding bpf insn. Example:
 			 * func#0 @0
-			 * 0: R1=ctx(off=0,imm=0) R10=fp0
+			 * 0: R1=ctx() R10=fp0
 			 * 0: (b7) r3 = 2                 ; R3_w=2
 			 *
 			 * Sometimes it's actually two lines below, e.g. when
 			 * searching for "6: R3_w=scalar(umax=255,var_off=(0x0; 0xff))":
-			 *   from 4 to 6: R0_w=pkt(off=8,r=8,imm=0) R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=8,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
-			 *   6: R0_w=pkt(off=8,r=8,imm=0) R1=ctx(off=0,imm=0) R2_w=pkt(off=0,r=8,imm=0) R3_w=pkt_end(off=0,imm=0) R10=fp0
-			 *   6: (71) r3 = *(u8 *)(r2 +0)           ; R2_w=pkt(off=0,r=8,imm=0) R3_w=scalar(umax=255,var_off=(0x0; 0xff))
+			 *   from 4 to 6: R0_w=pkt(off=8,r=8) R1=ctx() R2_w=pkt(r=8) R3_w=pkt_end() R10=fp0
+			 *   6: R0_w=pkt(off=8,r=8) R1=ctx() R2_w=pkt(r=8) R3_w=pkt_end() R10=fp0
+			 *   6: (71) r3 = *(u8 *)(r2 +0)           ; R2_w=pkt(r=8) R3_w=scalar(umax=255,var_off=(0x0; 0xff))
 			 */
 			while (!(p = strstr(line_ptr, m.reg)) || !strstr(p, m.match)) {
 				cur_line = -1;
diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c
index fe9a23e65ef4..0f7ea4d7d9f6 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_buf.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c
@@ -78,7 +78,7 @@ static void obj_load_log_buf(void)
 	ASSERT_OK_PTR(strstr(libbpf_log_buf, "prog 'bad_prog': BPF program load failed"),
 		      "libbpf_log_not_empty");
 	ASSERT_OK_PTR(strstr(obj_log_buf, "DATASEC license"), "obj_log_not_empty");
-	ASSERT_OK_PTR(strstr(good_log_buf, "0: R1=ctx(off=0,imm=0) R10=fp0"),
+	ASSERT_OK_PTR(strstr(good_log_buf, "0: R1=ctx() R10=fp0"),
 		      "good_log_verbose");
 	ASSERT_OK_PTR(strstr(bad_log_buf, "invalid access to map value, value_size=16 off=16000 size=4"),
 		      "bad_log_not_empty");
@@ -175,7 +175,7 @@ static void bpf_prog_load_log_buf(void)
 	opts.log_level = 2;
 	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "good_prog", "GPL",
 			   good_prog_insns, good_prog_insn_cnt, &opts);
-	ASSERT_OK_PTR(strstr(log_buf, "0: R1=ctx(off=0,imm=0) R10=fp0"), "good_log_2");
+	ASSERT_OK_PTR(strstr(log_buf, "0: R1=ctx() R10=fp0"), "good_log_2");
 	ASSERT_GE(fd, 0, "good_fd2");
 	if (fd >= 0)
 		close(fd);
diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c
index ace65224286f..18d451be57c8 100644
--- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c
@@ -13,22 +13,22 @@ static struct {
 	const char *err_msg;
 } spin_lock_fail_tests[] = {
 	{ "lock_id_kptr_preserve",
-	  "5: (bf) r1 = r0                       ; R0_w=ptr_foo(id=2,ref_obj_id=2,off=0,imm=0) "
-	  "R1_w=ptr_foo(id=2,ref_obj_id=2,off=0,imm=0) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n"
+	  "5: (bf) r1 = r0                       ; R0_w=ptr_foo(id=2,ref_obj_id=2) "
+	  "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=ptr_ expected=percpu_ptr_" },
 	{ "lock_id_global_zero",
-	  "; R1_w=map_value(map=.data.A,ks=4,vs=4,off=0,imm=0)\n2: (85) call bpf_this_cpu_ptr#154\n"
+	  "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=map_value expected=percpu_ptr_" },
 	{ "lock_id_mapval_preserve",
 	  "[0-9]\\+: (bf) r1 = r0                       ;"
-	  " R0_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)"
-	  " R1_w=map_value(id=1,map=array_map,ks=4,vs=8,off=0,imm=0)\n"
+	  " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)"
+	  " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n"
 	  "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=map_value expected=percpu_ptr_" },
 	{ "lock_id_innermapval_preserve",
 	  "[0-9]\\+: (bf) r1 = r0                      ;"
-	  " R0=map_value(id=2,ks=4,vs=8,off=0,imm=0)"
-	  " R1_w=map_value(id=2,ks=4,vs=8,off=0,imm=0)\n"
+	  " R0=map_value(id=2,ks=4,vs=8)"
+	  " R1_w=map_value(id=2,ks=4,vs=8)\n"
 	  "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n"
 	  "R1 type=map_value expected=percpu_ptr_" },
 	{ "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" },
diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c
index e1e5c54a6a11..26f7d67432cc 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_assert.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c
@@ -59,7 +59,7 @@ check_assert(s64, ge, neg, INT_MIN);
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R0=0 R1=ctx(off=0,imm=0) R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0")
+__msg(": R0=0 R1=ctx() R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0")
 int check_assert_range_s64(struct __sk_buff *ctx)
 {
 	struct bpf_sock *sk = ctx->sk;
@@ -75,7 +75,7 @@ int check_assert_range_s64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R1=ctx(off=0,imm=0) R2=scalar(smin=umin=smin32=umin32=4096,smax=umax=smax32=umax32=8192,var_off=(0x0; 0x3fff))")
+__msg(": R1=ctx() R2=scalar(smin=umin=smin32=umin32=4096,smax=umax=smax32=umax32=8192,var_off=(0x0; 0x3fff))")
 int check_assert_range_u64(struct __sk_buff *ctx)
 {
 	u64 num = ctx->len;
@@ -86,7 +86,7 @@ int check_assert_range_u64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R0=0 R1=ctx(off=0,imm=0) R2=4096 R10=fp0")
+__msg(": R0=0 R1=ctx() R2=4096 R10=fp0")
 int check_assert_single_range_s64(struct __sk_buff *ctx)
 {
 	struct bpf_sock *sk = ctx->sk;
@@ -103,7 +103,7 @@ int check_assert_single_range_s64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R1=ctx(off=0,imm=0) R2=4096 R10=fp0")
+__msg(": R1=ctx() R2=4096 R10=fp0")
 int check_assert_single_range_u64(struct __sk_buff *ctx)
 {
 	u64 num = ctx->len;
@@ -114,7 +114,7 @@ int check_assert_single_range_u64(struct __sk_buff *ctx)
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R1=pkt(off=64,r=64,imm=0) R2=pkt_end(off=0,imm=0) R6=pkt(off=0,r=64,imm=0) R10=fp0")
+__msg(": R1=pkt(off=64,r=64) R2=pkt_end() R6=pkt(r=64) R10=fp0")
 int check_assert_generic(struct __sk_buff *ctx)
 {
 	u8 *data_end = (void *)(long)ctx->data_end;
-- 
cgit v1.2.3


From 0f8dbdbc641b45a5fa31d497f9fc83ffe1174fa3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:22 -0800
Subject: bpf: smarter verifier log number printing logic

Instead of always printing numbers as either decimals (and in some
cases, like for "imm=%llx", in hexadecimals), decide the form based on
actual values. For numbers in a reasonably small range (currently,
[0, U16_MAX] for unsigned values, and [S16_MIN, S16_MAX] for signed ones),
emit them as decimals. In all other cases, even for signed values,
emit them in hexadecimals.

For large values hex form is often times way more useful: it's easier to
see an exact difference between 0xffffffff80000000 and 0xffffffff7fffffff,
than between 18446744071562067966 and 18446744071562067967, as one
particular example.

Small values representing small pointer offsets or application
constants, on the other hand, are way more useful to be represented in
decimal notation.

Adjust reg_bounds register state parsing logic to take into account this
change.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c                                   | 79 +++++++++++++++++++---
 .../testing/selftests/bpf/prog_tests/reg_bounds.c  | 53 +++++++++------
 .../selftests/bpf/progs/exceptions_assert.c        | 32 ++++-----
 3 files changed, 118 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 20b4f81087da..87105aa482ed 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -509,10 +509,52 @@ static void print_liveness(struct bpf_verifier_env *env,
 		verbose(env, "D");
 }
 
+#define UNUM_MAX_DECIMAL U16_MAX
+#define SNUM_MAX_DECIMAL S16_MAX
+#define SNUM_MIN_DECIMAL S16_MIN
+
+static bool is_unum_decimal(u64 num)
+{
+	return num <= UNUM_MAX_DECIMAL;
+}
+
+static bool is_snum_decimal(s64 num)
+{
+	return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL;
+}
+
+static void verbose_unum(struct bpf_verifier_env *env, u64 num)
+{
+	if (is_unum_decimal(num))
+		verbose(env, "%llu", num);
+	else
+		verbose(env, "%#llx", num);
+}
+
+static void verbose_snum(struct bpf_verifier_env *env, s64 num)
+{
+	if (is_snum_decimal(num))
+		verbose(env, "%lld", num);
+	else
+		verbose(env, "%#llx", num);
+}
+
 static void print_scalar_ranges(struct bpf_verifier_env *env,
 				const struct bpf_reg_state *reg,
 				const char **sep)
 {
+	/* For signed ranges, we want to unify 64-bit and 32-bit values in the
+	 * output as much as possible, but there is a bit of a complication.
+	 * If we choose to print values as decimals, this is natural to do,
+	 * because negative 64-bit and 32-bit values >= -S32_MIN have the same
+	 * representation due to sign extension. But if we choose to print
+	 * them in hex format (see is_snum_decimal()), then sign extension is
+	 * misleading.
+	 * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in
+	 * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two
+	 * very different numbers.
+	 * So we avoid sign extension if we choose to print values in hex.
+	 */
 	struct {
 		const char *name;
 		u64 val;
@@ -522,8 +564,14 @@ static void print_scalar_ranges(struct bpf_verifier_env *env,
 		{"smax",   reg->smax_value,         reg->smax_value == S64_MAX},
 		{"umin",   reg->umin_value,         reg->umin_value == 0},
 		{"umax",   reg->umax_value,         reg->umax_value == U64_MAX},
-		{"smin32", (s64)reg->s32_min_value, reg->s32_min_value == S32_MIN},
-		{"smax32", (s64)reg->s32_max_value, reg->s32_max_value == S32_MAX},
+		{"smin32",
+		 is_snum_decimal((s64)reg->s32_min_value)
+			 ? (s64)reg->s32_min_value
+			 : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN},
+		{"smax32",
+		 is_snum_decimal((s64)reg->s32_max_value)
+			 ? (s64)reg->s32_max_value
+			 : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX},
 		{"umin32", reg->u32_min_value,      reg->u32_min_value == 0},
 		{"umax32", reg->u32_max_value,      reg->u32_max_value == U32_MAX},
 	}, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)];
@@ -549,7 +597,10 @@ static void print_scalar_ranges(struct bpf_verifier_env *env,
 			verbose(env, "%s=", m2->name);
 		}
 
-		verbose(env, m1->name[0] == 's' ? "%lld" : "%llu", m1->val);
+		if (m1->name[0] == 's')
+			verbose_snum(env, m1->val);
+		else
+			verbose_unum(env, m1->val);
 	}
 }
 
@@ -576,14 +627,14 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s
 	    tnum_is_const(reg->var_off)) {
 		/* reg->off should be 0 for SCALAR_VALUE */
 		verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
-		verbose(env, "%lld", reg->var_off.value + reg->off);
+		verbose_snum(env, reg->var_off.value + reg->off);
 		return;
 	}
 /*
  * _a stands for append, was shortened to avoid multiline statements below.
  * This macro is used to output a comma separated list of attributes.
  */
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
 
 	verbose(env, "%s", reg_type_str(env, t));
 	if (base_type(t) == PTR_TO_BTF_ID)
@@ -602,14 +653,20 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s
 			  reg->map_ptr->key_size,
 			  reg->map_ptr->value_size);
 	}
-	if (t != SCALAR_VALUE && reg->off)
-		verbose_a("off=%d", reg->off);
-	if (type_is_pkt_pointer(t))
-		verbose_a("r=%d", reg->range);
+	if (t != SCALAR_VALUE && reg->off) {
+		verbose_a("off=");
+		verbose_snum(env, reg->off);
+	}
+	if (type_is_pkt_pointer(t)) {
+		verbose_a("r=");
+		verbose_unum(env, reg->range);
+	}
 	if (tnum_is_const(reg->var_off)) {
 		/* a pointer register with fixed offset */
-		if (reg->var_off.value)
-			verbose_a("imm=%llx", reg->var_off.value);
+		if (reg->var_off.value) {
+			verbose_a("imm=");
+			verbose_snum(env, reg->var_off.value);
+		}
 	} else {
 		print_scalar_ranges(env, reg, &sep);
 		if (!tnum_is_unknown(reg->var_off)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
index 7a8b0bf0a7f8..fd4ab23e6f54 100644
--- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
+++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c
@@ -13,10 +13,13 @@
  */
 #define U64_MAX ((u64)UINT64_MAX)
 #define U32_MAX ((u32)UINT_MAX)
+#define U16_MAX ((u32)UINT_MAX)
 #define S64_MIN ((s64)INT64_MIN)
 #define S64_MAX ((s64)INT64_MAX)
 #define S32_MIN ((s32)INT_MIN)
 #define S32_MAX ((s32)INT_MAX)
+#define S16_MIN ((s16)0x80000000)
+#define S16_MAX ((s16)0x7fffffff)
 
 typedef unsigned long long ___u64;
 typedef unsigned int ___u32;
@@ -138,13 +141,17 @@ static enum num_t t_unsigned(enum num_t t)
 	}
 }
 
+#define UNUM_MAX_DECIMAL U16_MAX
+#define SNUM_MAX_DECIMAL S16_MAX
+#define SNUM_MIN_DECIMAL S16_MIN
+
 static bool num_is_small(enum num_t t, u64 x)
 {
 	switch (t) {
-	case U64: return (u64)x <= 256;
-	case U32: return (u32)x <= 256;
-	case S64: return (s64)x >= -256 && (s64)x <= 256;
-	case S32: return (s32)x >= -256 && (s32)x <= 256;
+	case U64: return (u64)x <= UNUM_MAX_DECIMAL;
+	case U32: return (u32)x <= UNUM_MAX_DECIMAL;
+	case S64: return (s64)x >= SNUM_MIN_DECIMAL && (s64)x <= SNUM_MAX_DECIMAL;
+	case S32: return (s32)x >= SNUM_MIN_DECIMAL && (s32)x <= SNUM_MAX_DECIMAL;
 	default: printf("num_is_small!\n"); exit(1);
 	}
 }
@@ -1023,20 +1030,19 @@ static int parse_reg_state(const char *s, struct reg_state *reg)
 	 */
 	struct {
 		const char *pfx;
-		const char *fmt;
 		u64 *dst, def;
 		bool is_32, is_set;
 	} *f, fields[8] = {
-		{"smin=", "%lld", &reg->r[S64].a, S64_MIN},
-		{"smax=", "%lld", &reg->r[S64].b, S64_MAX},
-		{"umin=", "%llu", &reg->r[U64].a, 0},
-		{"umax=", "%llu", &reg->r[U64].b, U64_MAX},
-		{"smin32=", "%lld", &reg->r[S32].a, (u32)S32_MIN, true},
-		{"smax32=", "%lld", &reg->r[S32].b, (u32)S32_MAX, true},
-		{"umin32=", "%llu", &reg->r[U32].a, 0,            true},
-		{"umax32=", "%llu", &reg->r[U32].b, U32_MAX,      true},
+		{"smin=", &reg->r[S64].a, S64_MIN},
+		{"smax=", &reg->r[S64].b, S64_MAX},
+		{"umin=", &reg->r[U64].a, 0},
+		{"umax=", &reg->r[U64].b, U64_MAX},
+		{"smin32=", &reg->r[S32].a, (u32)S32_MIN, true},
+		{"smax32=", &reg->r[S32].b, (u32)S32_MAX, true},
+		{"umin32=", &reg->r[U32].a, 0,            true},
+		{"umax32=", &reg->r[U32].b, U32_MAX,      true},
 	};
-	const char *p, *fmt;
+	const char *p;
 	int i;
 
 	p = strchr(s, '=');
@@ -1050,8 +1056,13 @@ static int parse_reg_state(const char *s, struct reg_state *reg)
 		long long sval;
 		enum num_t t;
 
-		if (sscanf(p, "%lld", &sval) != 1)
-			return -EINVAL;
+		if (p[0] == '0' && p[1] == 'x') {
+			if (sscanf(p, "%llx", &sval) != 1)
+				return -EINVAL;
+		} else {
+			if (sscanf(p, "%lld", &sval) != 1)
+				return -EINVAL;
+		}
 
 		reg->valid = true;
 		for (t = first_t; t <= last_t; t++) {
@@ -1075,9 +1086,13 @@ static int parse_reg_state(const char *s, struct reg_state *reg)
 
 		if (mcnt) {
 			/* populate all matched fields */
-			fmt = fields[midxs[0]].fmt;
-			if (sscanf(p, fmt, &val) != 1)
-				return -EINVAL;
+			if (p[0] == '0' && p[1] == 'x') {
+				if (sscanf(p, "%llx", &val) != 1)
+					return -EINVAL;
+			} else {
+				if (sscanf(p, "%lld", &val) != 1)
+					return -EINVAL;
+			}
 
 			for (i = 0; i < mcnt; i++) {
 				f = &fields[midxs[i]];
diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c
index 26f7d67432cc..49efaed143fc 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_assert.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c
@@ -18,48 +18,48 @@
 		return *(u64 *)num;					\
 	}
 
-__msg(": R0_w=-2147483648 R10=fp0")
+__msg(": R0_w=0xffffffff80000000 R10=fp0")
 check_assert(s64, eq, int_min, INT_MIN);
-__msg(": R0_w=2147483647 R10=fp0")
+__msg(": R0_w=0x7fffffff R10=fp0")
 check_assert(s64, eq, int_max, INT_MAX);
 __msg(": R0_w=0 R10=fp0")
 check_assert(s64, eq, zero, 0);
-__msg(": R0_w=-9223372036854775808 R1_w=-9223372036854775808 R10=fp0")
+__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000 R10=fp0")
 check_assert(s64, eq, llong_min, LLONG_MIN);
-__msg(": R0_w=9223372036854775807 R1_w=9223372036854775807 R10=fp0")
+__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff R10=fp0")
 check_assert(s64, eq, llong_max, LLONG_MAX);
 
-__msg(": R0_w=scalar(smax=2147483646) R10=fp0")
+__msg(": R0_w=scalar(smax=0x7ffffffe) R10=fp0")
 check_assert(s64, lt, pos, INT_MAX);
-__msg(": R0_w=scalar(smax=-1,umin=9223372036854775808,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
 check_assert(s64, lt, zero, 0);
-__msg(": R0_w=scalar(smax=-2147483649,umin=9223372036854775808,umax=18446744071562067967,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smax=0xffffffff7fffffff,umin=0x8000000000000000,umax=0xffffffff7fffffff,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
 check_assert(s64, lt, neg, INT_MIN);
 
-__msg(": R0_w=scalar(smax=2147483647) R10=fp0")
+__msg(": R0_w=scalar(smax=0x7fffffff) R10=fp0")
 check_assert(s64, le, pos, INT_MAX);
 __msg(": R0_w=scalar(smax=0) R10=fp0")
 check_assert(s64, le, zero, 0);
-__msg(": R0_w=scalar(smax=-2147483648,umin=9223372036854775808,umax=18446744071562067968,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smax=0xffffffff80000000,umin=0x8000000000000000,umax=0xffffffff80000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))")
 check_assert(s64, le, neg, INT_MIN);
 
-__msg(": R0_w=scalar(smin=umin=2147483648,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))")
 check_assert(s64, gt, pos, INT_MAX);
-__msg(": R0_w=scalar(smin=umin=1,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))")
 check_assert(s64, gt, zero, 0);
-__msg(": R0_w=scalar(smin=-2147483647) R10=fp0")
+__msg(": R0_w=scalar(smin=0xffffffff80000001) R10=fp0")
 check_assert(s64, gt, neg, INT_MIN);
 
-__msg(": R0_w=scalar(smin=umin=2147483647,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff))")
+__msg(": R0_w=scalar(smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))")
 check_assert(s64, ge, pos, INT_MAX);
-__msg(": R0_w=scalar(smin=0,umax=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0")
+__msg(": R0_w=scalar(smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0")
 check_assert(s64, ge, zero, 0);
-__msg(": R0_w=scalar(smin=-2147483648) R10=fp0")
+__msg(": R0_w=scalar(smin=0xffffffff80000000) R10=fp0")
 check_assert(s64, ge, neg, INT_MIN);
 
 SEC("?tc")
 __log_level(2) __failure
-__msg(": R0=0 R1=ctx() R2=scalar(smin=smin32=-2147483646,smax=smax32=2147483645) R10=fp0")
+__msg(": R0=0 R1=ctx() R2=scalar(smin=0xffffffff80000002,smax=smax32=0x7ffffffd,smin32=0x80000002) R10=fp0")
 int check_assert_range_s64(struct __sk_buff *ctx)
 {
 	struct bpf_sock *sk = ctx->sk;
-- 
cgit v1.2.3


From 46862ee854b4f5a315d63b677ca3af14a89aefeb Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 17 Nov 2023 19:46:23 -0800
Subject: bpf: emit frameno for PTR_TO_STACK regs if it differs from current
 one

It's possible to pass a pointer to parent's stack to child subprogs. In
such case verifier state output is ambiguous not showing whether
register container a pointer to "current" stack, belonging to current
subprog (frame), or it's actually a pointer to one of parent frames.

So emit this information if frame number differs between the state which
register is part of. E.g., if current state is in frame 2 and it has
a register pointing to stack in grand parent state (frame #0), we'll see
something like 'R1=fp[0]-16', while "local stack pointer" will be just
'R2=fp-16'.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231118034623.3320920-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 87105aa482ed..3505f3e5ae96 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -615,7 +615,9 @@ static bool type_is_map_ptr(enum bpf_reg_type t) {
 	}
 }
 
-static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_state *reg)
+static void print_reg_state(struct bpf_verifier_env *env,
+			    const struct bpf_func_state *state,
+			    const struct bpf_reg_state *reg)
 {
 	enum bpf_reg_type t;
 	const char *sep = "";
@@ -623,10 +625,8 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s
 	t = reg->type;
 	if (t == SCALAR_VALUE && reg->precise)
 		verbose(env, "P");
-	if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
-	    tnum_is_const(reg->var_off)) {
+	if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) {
 		/* reg->off should be 0 for SCALAR_VALUE */
-		verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
 		verbose_snum(env, reg->var_off.value + reg->off);
 		return;
 	}
@@ -637,6 +637,14 @@ static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_reg_s
 #define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
 
 	verbose(env, "%s", reg_type_str(env, t));
+	if (t == PTR_TO_STACK) {
+		if (state->frameno != reg->frameno)
+			verbose(env, "[%d]", reg->frameno);
+		if (tnum_is_const(reg->var_off)) {
+			verbose_snum(env, reg->var_off.value + reg->off);
+			return;
+		}
+	}
 	if (base_type(t) == PTR_TO_BTF_ID)
 		verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
 	verbose(env, "(");
@@ -698,7 +706,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=");
-		print_reg_state(env, reg);
+		print_reg_state(env, state, reg);
 	}
 	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
 		char types_buf[BPF_REG_SIZE + 1];
@@ -731,7 +739,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 			print_liveness(env, reg->live);
 			verbose(env, "=%s", types_buf);
-			print_reg_state(env, reg);
+			print_reg_state(env, state, reg);
 			break;
 		case STACK_DYNPTR:
 			/* skip to main dynptr slot */
-- 
cgit v1.2.3


From 2d1618054f25e11c44d189dbff4a60342a4cfb4b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 14 Nov 2023 17:32:34 +0100
Subject: bpf: task_group_seq_get_next: use __next_thread() rather than
 next_thread()

Lockless use of next_thread() should be avoided, kernel/bpf/task_iter.c
is the last user and the usage is wrong.

task_group_seq_get_next() can return the group leader twice if it races
with mt-thread exec which changes the group->leader's pid.

Change the main loop to use __next_thread(), kill "next_tid == common->pid"
check.

__next_thread() can't loop forever, we can also change this code to retry
if next_tid == 0.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231114163234.GA890@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 26082b97894d..51ae15e2b290 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -70,15 +70,13 @@ static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_comm
 		return NULL;
 
 retry:
-	task = next_thread(task);
+	task = __next_thread(task);
+	if (!task)
+		return NULL;
 
 	next_tid = __task_pid_nr_ns(task, PIDTYPE_PID, common->ns);
-	if (!next_tid || next_tid == common->pid) {
-		/* Run out of tasks of a process.  The tasks of a
-		 * thread_group are linked as circular linked list.
-		 */
-		return NULL;
-	}
+	if (!next_tid)
+		goto retry;
 
 	if (skip_if_dup_files && task->files == task->group_leader->files)
 		goto retry;
-- 
cgit v1.2.3


From 5a34f9dabd9aa567e2d37e1aa27a67f80acfaa1c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 14 Nov 2023 17:32:37 +0100
Subject: bpf: bpf_iter_task_next: use __next_thread() rather than
 next_thread()

Lockless use of next_thread() should be avoided, kernel/bpf/task_iter.c
is the last user and the usage is wrong.

bpf_iter_task_next() can loop forever, "kit->pos == kit->task" can never
happen if kit->pos execs. Change this code to use __next_thread().

With or without this change the usage of kit->pos/task and next_task()
doesn't look nice, see the next patch.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231114163237.GA897@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 51ae15e2b290..d42e08d0d0b7 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -1015,12 +1015,11 @@ __bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
 	if (flags == BPF_TASK_ITER_ALL_PROCS)
 		goto get_next_task;
 
-	kit->pos = next_thread(kit->pos);
-	if (kit->pos == kit->task) {
-		if (flags == BPF_TASK_ITER_PROC_THREADS) {
-			kit->pos = NULL;
+	kit->pos = __next_thread(kit->pos);
+	if (!kit->pos) {
+		if (flags == BPF_TASK_ITER_PROC_THREADS)
 			return pos;
-		}
+		kit->pos = kit->task;
 	} else
 		return pos;
 
-- 
cgit v1.2.3


From ac8148d957f50434411a0c15a2e4f352b5bb4ff2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 14 Nov 2023 17:32:39 +0100
Subject: bpf: bpf_iter_task_next: use next_task(kit->task) rather than
 next_task(kit->pos)

This looks more clear and simplifies the code. While at it, remove the
unnecessary initialization of pos/task at the start of bpf_iter_task_new().

Note that we can even kill kit->task, we can just use pos->group_leader,
but I don't understand the BUILD_BUG_ON() checks in bpf_iter_task_new().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231114163239.GA903@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/task_iter.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index d42e08d0d0b7..e5c3500443c6 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -978,7 +978,6 @@ __bpf_kfunc int bpf_iter_task_new(struct bpf_iter_task *it,
 	BUILD_BUG_ON(__alignof__(struct bpf_iter_task_kern) !=
 					__alignof__(struct bpf_iter_task));
 
-	kit->task = kit->pos = NULL;
 	switch (flags) {
 	case BPF_TASK_ITER_ALL_THREADS:
 	case BPF_TASK_ITER_ALL_PROCS:
@@ -1016,18 +1015,15 @@ __bpf_kfunc struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it)
 		goto get_next_task;
 
 	kit->pos = __next_thread(kit->pos);
-	if (!kit->pos) {
-		if (flags == BPF_TASK_ITER_PROC_THREADS)
-			return pos;
-		kit->pos = kit->task;
-	} else
+	if (kit->pos || flags == BPF_TASK_ITER_PROC_THREADS)
 		return pos;
 
 get_next_task:
-	kit->pos = next_task(kit->pos);
-	kit->task = kit->pos;
-	if (kit->pos == &init_task)
+	kit->task = next_task(kit->task);
+	if (kit->task == &init_task)
 		kit->pos = NULL;
+	else
+		kit->pos = kit->task;
 
 	return pos;
 }
-- 
cgit v1.2.3


From 683b96f9606ab7308ffb23c46ab43cecdef8a241 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:06:54 +0200
Subject: bpf: extract __check_reg_arg() utility function

Split check_reg_arg() into two utility functions:
- check_reg_arg() operating on registers from current verifier state;
- __check_reg_arg() operating on a specific set of registers passed as
  a parameter;

The __check_reg_arg() function would be used by a follow-up change for
callbacks handling.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-5-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6da370a047fe..e6e1bcfe00f5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3439,13 +3439,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
 	reg->subreg_def = DEF_NOT_SUBREG;
 }
 
-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
-			 enum reg_arg_type t)
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+			   enum reg_arg_type t)
 {
-	struct bpf_verifier_state *vstate = env->cur_state;
-	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
-	struct bpf_reg_state *reg, *regs = state->regs;
+	struct bpf_reg_state *reg;
 	bool rw64;
 
 	if (regno >= MAX_BPF_REG) {
@@ -3486,6 +3484,15 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	return 0;
 }
 
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+			 enum reg_arg_type t)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+	return __check_reg_arg(env, state->regs, regno, t);
+}
+
 static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
 {
 	env->insn_aux_data[idx].jmp_point = true;
@@ -9350,7 +9357,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
 	/* after the call registers r0 - r5 were scratched */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
-		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+		__check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
 	}
 }
 
-- 
cgit v1.2.3


From 58124a98cb8eda69d248d7f1de954c8b2767c945 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:06:55 +0200
Subject: bpf: extract setup_func_entry() utility function

Move code for simulated stack frame creation to a separate utility
function. This function would be used in the follow-up change for
callbacks handling.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-6-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 84 +++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e6e1bcfe00f5..68ee4803d3a2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9370,11 +9370,10 @@ static int set_callee_state(struct bpf_verifier_env *env,
 			    struct bpf_func_state *caller,
 			    struct bpf_func_state *callee, int insn_idx);
 
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			     int *insn_idx, int subprog,
-			     set_callee_state_fn set_callee_state_cb)
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+			    set_callee_state_fn set_callee_state_cb,
+			    struct bpf_verifier_state *state)
 {
-	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
 	int err;
 
@@ -9384,13 +9383,53 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return -E2BIG;
 	}
 
-	caller = state->frame[state->curframe];
 	if (state->frame[state->curframe + 1]) {
 		verbose(env, "verifier bug. Frame %d already allocated\n",
 			state->curframe + 1);
 		return -EFAULT;
 	}
 
+	caller = state->frame[state->curframe];
+	callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+	if (!callee)
+		return -ENOMEM;
+	state->frame[state->curframe + 1] = callee;
+
+	/* callee cannot access r0, r6 - r9 for reading and has to write
+	 * into its own stack before reading from it.
+	 * callee can read/write into caller's stack
+	 */
+	init_func_state(env, callee,
+			/* remember the callsite, it will be used by bpf_exit */
+			callsite,
+			state->curframe + 1 /* frameno within this callchain */,
+			subprog /* subprog number within this prog */);
+	/* Transfer references to the callee */
+	err = copy_reference_state(callee, caller);
+	err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+	if (err)
+		goto err_out;
+
+	/* only increment it after check_reg_arg() finished */
+	state->curframe++;
+
+	return 0;
+
+err_out:
+	free_func_state(callee);
+	state->frame[state->curframe + 1] = NULL;
+	return err;
+}
+
+static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			     int *insn_idx, int subprog,
+			     set_callee_state_fn set_callee_state_cb)
+{
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_func_state *caller, *callee;
+	int err;
+
+	caller = state->frame[state->curframe];
 	err = btf_check_subprog_call(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
@@ -9460,35 +9499,12 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		return 0;
 	}
 
-	callee = kzalloc(sizeof(*callee), GFP_KERNEL);
-	if (!callee)
-		return -ENOMEM;
-	state->frame[state->curframe + 1] = callee;
-
-	/* callee cannot access r0, r6 - r9 for reading and has to write
-	 * into its own stack before reading from it.
-	 * callee can read/write into caller's stack
-	 */
-	init_func_state(env, callee,
-			/* remember the callsite, it will be used by bpf_exit */
-			*insn_idx /* callsite */,
-			state->curframe + 1 /* frameno within this callchain */,
-			subprog /* subprog number within this prog */);
-
-	/* Transfer references to the callee */
-	err = copy_reference_state(callee, caller);
+	err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state);
 	if (err)
-		goto err_out;
-
-	err = set_callee_state_cb(env, caller, callee, *insn_idx);
-	if (err)
-		goto err_out;
+		return err;
 
 	clear_caller_saved_regs(env, caller->regs);
 
-	/* only increment it after check_reg_arg() finished */
-	state->curframe++;
-
 	/* and go analyze first insn of the callee */
 	*insn_idx = env->subprog_info[subprog].start - 1;
 
@@ -9496,14 +9512,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		verbose(env, "caller:\n");
 		print_verifier_state(env, caller, true);
 		verbose(env, "callee:\n");
-		print_verifier_state(env, callee, true);
+		print_verifier_state(env, state->frame[state->curframe], true);
 	}
-	return 0;
 
-err_out:
-	free_func_state(callee);
-	state->frame[state->curframe + 1] = NULL;
-	return err;
+	return 0;
 }
 
 int map_set_for_each_callback_args(struct bpf_verifier_env *env,
-- 
cgit v1.2.3


From ab5cfac139ab8576fb54630d4cca23c3e690ee90 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:06:56 +0200
Subject: bpf: verify callbacks as if they are called unknown number of times

Prior to this patch callbacks were handled as regular function calls,
execution of callback body was modeled exactly once.
This patch updates callbacks handling logic as follows:
- introduces a function push_callback_call() that schedules callback
  body verification in env->head stack;
- updates prepare_func_exit() to reschedule callback body verification
  upon BPF_EXIT;
- as calls to bpf_*_iter_next(), calls to callback invoking functions
  are marked as checkpoints;
- is_state_visited() is updated to stop callback based iteration when
  some identical parent state is found.

Paths with callback function invoked zero times are now verified first,
which leads to necessity to modify some selftests:
- the following negative tests required adding release/unlock/drop
  calls to avoid previously masked unrelated error reports:
  - cb_refs.c:underflow_prog
  - exceptions_fail.c:reject_rbtree_add_throw
  - exceptions_fail.c:reject_with_cp_reference
- the following precision tracking selftests needed change in expected
  log trace:
  - verifier_subprog_precision.c:callback_result_precise
    (note: r0 precision is no longer propagated inside callback and
           I think this is a correct behavior)
  - verifier_subprog_precision.c:parent_callee_saved_reg_precise_with_callback
  - verifier_subprog_precision.c:parent_stack_slot_precise_with_callback

Reported-by: Andrew Werner <awerner32@gmail.com>
Closes: https://lore.kernel.org/bpf/CA+vRuzPChFNXmouzGG+wsy=6eMcfr1mFG0F3g7rbg-sedGKW3w@mail.gmail.com/
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-7-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |   5 +
 kernel/bpf/verifier.c                              | 274 +++++++++++++--------
 tools/testing/selftests/bpf/progs/cb_refs.c        |   1 +
 .../testing/selftests/bpf/progs/exceptions_fail.c  |   2 +
 .../bpf/progs/verifier_subprog_precision.c         |  71 ++++--
 5 files changed, 240 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 24213a99cc79..dd326936dd6f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -400,6 +400,7 @@ struct bpf_verifier_state {
 	struct bpf_idx_pair *jmp_history;
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
+	u32 callback_unroll_depth;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
@@ -511,6 +512,10 @@ struct bpf_insn_aux_data {
 	 * this instruction, regardless of any heuristics
 	 */
 	bool force_checkpoint;
+	/* true if instruction is a call to a helper function that
+	 * accepts callback function as a parameter.
+	 */
+	bool calls_callback;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 68ee4803d3a2..a60dfa56ebb3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -547,13 +547,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 	return func_id == BPF_FUNC_dynptr_data;
 }
 
-static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
-static bool is_callback_calling_function(enum bpf_func_id func_id)
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_for_each_map_elem ||
-	       func_id == BPF_FUNC_timer_set_callback ||
 	       func_id == BPF_FUNC_find_vma ||
 	       func_id == BPF_FUNC_loop ||
 	       func_id == BPF_FUNC_user_ringbuf_drain;
@@ -564,6 +563,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id)
 	return func_id == BPF_FUNC_timer_set_callback;
 }
 
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+	return is_sync_callback_calling_function(func_id) ||
+	       is_async_callback_calling_function(func_id);
+}
+
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+	return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+	       (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
+
 static bool is_storage_get_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_sk_storage_get ||
@@ -1808,6 +1819,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
 	dst_state->dfs_depth = src->dfs_depth;
+	dst_state->callback_unroll_depth = src->callback_unroll_depth;
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
@@ -3731,6 +3743,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
 	}
 }
 
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
+
 /* For given verifier state backtrack_insn() is called from the last insn to
  * the first insn. Its purpose is to compute a bitmask of registers and
  * stack slots that needs precision in the parent verifier state.
@@ -3906,16 +3920,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 					return -EFAULT;
 				return 0;
 			}
-		} else if ((bpf_helper_call(insn) &&
-			    is_callback_calling_function(insn->imm) &&
-			    !is_async_callback_calling_function(insn->imm)) ||
-			   (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
-			/* callback-calling helper or kfunc call, which means
-			 * we are exiting from subprog, but unlike the subprog
-			 * call handling above, we shouldn't propagate
-			 * precision of r1-r5 (if any requested), as they are
-			 * not actually arguments passed directly to callback
-			 * subprogs
+		} else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+			/* exit from callback subprog to callback-calling helper or
+			 * kfunc call. Use idx/subseq_idx check to discern it from
+			 * straight line code backtracking.
+			 * Unlike the subprog call handling above, we shouldn't
+			 * propagate precision of r1-r5 (if any requested), as they are
+			 * not actually arguments passed directly to callback subprogs
 			 */
 			if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
 				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
@@ -3950,10 +3961,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 		} else if (opcode == BPF_EXIT) {
 			bool r0_precise;
 
+			/* Backtracking to a nested function call, 'idx' is a part of
+			 * the inner frame 'subseq_idx' is a part of the outer frame.
+			 * In case of a regular function call, instructions giving
+			 * precision to registers R1-R5 should have been found already.
+			 * In case of a callback, it is ok to have R1-R5 marked for
+			 * backtracking, as these registers are set by the function
+			 * invoking callback.
+			 */
+			if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+				for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+					bt_clear_reg(bt, i);
 			if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
-				/* if backtracing was looking for registers R1-R5
-				 * they should have been found already.
-				 */
 				verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
 				WARN_ONCE(1, "verifier backtracking bug");
 				return -EFAULT;
@@ -9421,11 +9440,11 @@ err_out:
 	return err;
 }
 
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			     int *insn_idx, int subprog,
-			     set_callee_state_fn set_callee_state_cb)
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			      int insn_idx, int subprog,
+			      set_callee_state_fn set_callee_state_cb)
 {
-	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state, *callback_state;
 	struct bpf_func_state *caller, *callee;
 	int err;
 
@@ -9433,44 +9452,22 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	err = btf_check_subprog_call(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
-	if (subprog_is_global(env, subprog)) {
-		if (err) {
-			verbose(env, "Caller passes invalid args into func#%d\n",
-				subprog);
-			return err;
-		} else {
-			if (env->log.level & BPF_LOG_LEVEL)
-				verbose(env,
-					"Func#%d is global and valid. Skipping.\n",
-					subprog);
-			clear_caller_saved_regs(env, caller->regs);
-
-			/* All global functions return a 64-bit SCALAR_VALUE */
-			mark_reg_unknown(env, caller->regs, BPF_REG_0);
-			caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
-
-			/* continue with next insn after call */
-			return 0;
-		}
-	}
 
 	/* set_callee_state is used for direct subprog calls, but we are
 	 * interested in validating only BPF helpers that can call subprogs as
 	 * callbacks
 	 */
-	if (set_callee_state_cb != set_callee_state) {
-		env->subprog_info[subprog].is_cb = true;
-		if (bpf_pseudo_kfunc_call(insn) &&
-		    !is_callback_calling_kfunc(insn->imm)) {
-			verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
-				func_id_name(insn->imm), insn->imm);
-			return -EFAULT;
-		} else if (!bpf_pseudo_kfunc_call(insn) &&
-			   !is_callback_calling_function(insn->imm)) { /* helper */
-			verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
-				func_id_name(insn->imm), insn->imm);
-			return -EFAULT;
-		}
+	env->subprog_info[subprog].is_cb = true;
+	if (bpf_pseudo_kfunc_call(insn) &&
+	    !is_sync_callback_calling_kfunc(insn->imm)) {
+		verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+			func_id_name(insn->imm), insn->imm);
+		return -EFAULT;
+	} else if (!bpf_pseudo_kfunc_call(insn) &&
+		   !is_callback_calling_function(insn->imm)) { /* helper */
+		verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+			func_id_name(insn->imm), insn->imm);
+		return -EFAULT;
 	}
 
 	if (insn->code == (BPF_JMP | BPF_CALL) &&
@@ -9481,25 +9478,76 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		/* there is no real recursion here. timer callbacks are async */
 		env->subprog_info[subprog].is_async_cb = true;
 		async_cb = push_async_cb(env, env->subprog_info[subprog].start,
-					 *insn_idx, subprog);
+					 insn_idx, subprog);
 		if (!async_cb)
 			return -EFAULT;
 		callee = async_cb->frame[0];
 		callee->async_entry_cnt = caller->async_entry_cnt + 1;
 
 		/* Convert bpf_timer_set_callback() args into timer callback args */
-		err = set_callee_state_cb(env, caller, callee, *insn_idx);
+		err = set_callee_state_cb(env, caller, callee, insn_idx);
 		if (err)
 			return err;
 
+		return 0;
+	}
+
+	/* for callback functions enqueue entry to callback and
+	 * proceed with next instruction within current frame.
+	 */
+	callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+	if (!callback_state)
+		return -ENOMEM;
+
+	err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+			       callback_state);
+	if (err)
+		return err;
+
+	callback_state->callback_unroll_depth++;
+	return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+			   int *insn_idx)
+{
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_func_state *caller;
+	int err, subprog, target_insn;
+
+	target_insn = *insn_idx + insn->imm + 1;
+	subprog = find_subprog(env, target_insn);
+	if (subprog < 0) {
+		verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+		return -EFAULT;
+	}
+
+	caller = state->frame[state->curframe];
+	err = btf_check_subprog_call(env, subprog, caller->regs);
+	if (err == -EFAULT)
+		return err;
+	if (subprog_is_global(env, subprog)) {
+		if (err) {
+			verbose(env, "Caller passes invalid args into func#%d\n", subprog);
+			return err;
+		}
+
+		if (env->log.level & BPF_LOG_LEVEL)
+			verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
 		clear_caller_saved_regs(env, caller->regs);
+
+		/* All global functions return a 64-bit SCALAR_VALUE */
 		mark_reg_unknown(env, caller->regs, BPF_REG_0);
 		caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
 		/* continue with next insn after call */
 		return 0;
 	}
 
-	err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state);
+	/* for regular function entry setup new frame and continue
+	 * from that frame.
+	 */
+	err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
 	if (err)
 		return err;
 
@@ -9559,22 +9607,6 @@ static int set_callee_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			   int *insn_idx)
-{
-	int subprog, target_insn;
-
-	target_insn = *insn_idx + insn->imm + 1;
-	subprog = find_subprog(env, target_insn);
-	if (subprog < 0) {
-		verbose(env, "verifier bug. No program starts at insn %d\n",
-			target_insn);
-		return -EFAULT;
-	}
-
-	return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
-}
-
 static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 				       struct bpf_func_state *caller,
 				       struct bpf_func_state *callee,
@@ -9798,6 +9830,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 			return -EINVAL;
 		}
+		if (!calls_callback(env, callee->callsite)) {
+			verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
+				*insn_idx, callee->callsite);
+			return -EFAULT;
+		}
 	} else {
 		/* return to the caller whatever r0 had in the callee */
 		caller->regs[BPF_REG_0] = *r0;
@@ -9815,7 +9852,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 			return err;
 	}
 
-	*insn_idx = callee->callsite + 1;
+	/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+	 * there function call logic would reschedule callback visit. If iteration
+	 * converges is_state_visited() would prune that visit eventually.
+	 */
+	if (callee->in_callback_fn)
+		*insn_idx = callee->callsite;
+	else
+		*insn_idx = callee->callsite + 1;
+
 	if (env->log.level & BPF_LOG_LEVEL) {
 		verbose(env, "returning from callee:\n");
 		print_verifier_state(env, callee, true);
@@ -10228,24 +10273,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		}
 		break;
 	case BPF_FUNC_for_each_map_elem:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_map_elem_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_map_elem_callback_state);
 		break;
 	case BPF_FUNC_timer_set_callback:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_timer_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_timer_callback_state);
 		break;
 	case BPF_FUNC_find_vma:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_find_vma_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_find_vma_callback_state);
 		break;
 	case BPF_FUNC_snprintf:
 		err = check_bpf_snprintf_call(env, regs);
 		break;
 	case BPF_FUNC_loop:
 		update_loop_inline_state(env, meta.subprogno);
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_loop_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_loop_callback_state);
 		break;
 	case BPF_FUNC_dynptr_from_mem:
 		if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
@@ -10341,8 +10386,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		break;
 	}
 	case BPF_FUNC_user_ringbuf_drain:
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_user_ringbuf_callback_state);
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_user_ringbuf_callback_state);
 		break;
 	}
 
@@ -11230,7 +11275,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
 	       btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
 }
 
-static bool is_callback_calling_kfunc(u32 btf_id)
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
 {
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
@@ -11982,6 +12027,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EACCES;
 	}
 
+	/* Check the arguments */
+	err = check_kfunc_args(env, &meta, insn_idx);
+	if (err < 0)
+		return err;
+
+	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_rbtree_add_callback_state);
+		if (err) {
+			verbose(env, "kfunc %s#%d failed callback verification\n",
+				func_name, meta.func_id);
+			return err;
+		}
+	}
+
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
 
@@ -12017,10 +12077,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EINVAL;
 	}
 
-	/* Check the arguments */
-	err = check_kfunc_args(env, &meta, insn_idx);
-	if (err < 0)
-		return err;
 	/* In case of release function, we get register number of refcounted
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
@@ -12054,16 +12110,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
-	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
-		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
-					set_rbtree_add_callback_state);
-		if (err) {
-			verbose(env, "kfunc %s#%d failed callback verification\n",
-				func_name, meta.func_id);
-			return err;
-		}
-	}
-
 	if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
 		if (!bpf_jit_supports_exceptions()) {
 			verbose(env, "JIT does not support calling kfunc %s#%d\n",
@@ -15427,6 +15473,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
 	return env->insn_aux_data[insn_idx].force_checkpoint;
 }
 
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+	env->insn_aux_data[idx].calls_callback = true;
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].calls_callback;
+}
 
 enum {
 	DONE_EXPLORING = 0,
@@ -15540,6 +15595,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			 * async state will be pushed for further exploration.
 			 */
 			mark_prune_point(env, t);
+		/* For functions that invoke callbacks it is not known how many times
+		 * callback would be called. Verifier models callback calling functions
+		 * by repeatedly visiting callback bodies and returning to origin call
+		 * instruction.
+		 * In order to stop such iteration verifier needs to identify when a
+		 * state identical some state from a previous iteration is reached.
+		 * Check below forces creation of checkpoint before callback calling
+		 * instruction to allow search for such identical states.
+		 */
+		if (is_sync_callback_calling_insn(insn)) {
+			mark_calls_callback(env, t);
+			mark_force_checkpoint(env, t);
+			mark_prune_point(env, t);
+			mark_jmp_point(env, t);
+		}
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 			struct bpf_kfunc_call_arg_meta meta;
 
@@ -17009,10 +17079,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				}
 				goto skip_inf_loop_check;
 			}
+			if (calls_callback(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur, true))
+					goto hit;
+				goto skip_inf_loop_check;
+			}
 			/* attempt to detect infinite loop to avoid unnecessary doomed work */
 			if (states_maybe_looping(&sl->state, cur) &&
 			    states_equal(env, &sl->state, cur, false) &&
-			    !iter_active_depths_differ(&sl->state, cur)) {
+			    !iter_active_depths_differ(&sl->state, cur) &&
+			    sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				verbose(env, "cur state:");
diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c
index 76d661b20e87..56c764df8196 100644
--- a/tools/testing/selftests/bpf/progs/cb_refs.c
+++ b/tools/testing/selftests/bpf/progs/cb_refs.c
@@ -33,6 +33,7 @@ int underflow_prog(void *ctx)
 	if (!p)
 		return 0;
 	bpf_for_each_map_elem(&array_map, cb1, &p, 0);
+	bpf_kfunc_call_test_release(p);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c
index 4c39e920dac2..8c0ef2742208 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_fail.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c
@@ -171,6 +171,7 @@ int reject_with_rbtree_add_throw(void *ctx)
 		return 0;
 	bpf_spin_lock(&lock);
 	bpf_rbtree_add(&rbtree, &f->node, rbless);
+	bpf_spin_unlock(&lock);
 	return 0;
 }
 
@@ -214,6 +215,7 @@ int reject_with_cb_reference(void *ctx)
 	if (!f)
 		return 0;
 	bpf_loop(5, subprog_cb_ref, NULL, 0);
+	bpf_obj_drop(f);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index db6b3143338b..da803cffb5ef 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -119,15 +119,26 @@ __naked int global_subprog_result_precise(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
+/* First simulated path does not include callback body */
 __msg("14: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 14 first_idx 10")
+__msg("mark_precise: frame0: last_idx 14 first_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
 __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4")
 __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0")
-__msg("mark_precise: frame0: parent state regs=r0 stack=:")
-__msg("mark_precise: frame0: last_idx 18 first_idx 0")
-__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit")
+__msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop")
+/* State entering callback body popped from states stack */
+__msg("from 9 to 17: frame1:")
+__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("17: (b7) r0 = 0")
+__msg("18: (95) exit")
+__msg("returning from callee:")
+__msg("to caller at 9:")
+/* r4 (flags) is always precise for bpf_loop() */
+__msg("frame 0: propagating r4")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit")
+__msg("from 18 to 9: safe")
 __naked int callback_result_precise(void)
 {
 	asm volatile (
@@ -233,20 +244,36 @@ __naked int parent_callee_saved_reg_precise_global(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
+/* First simulated path does not include callback body */
 __msg("12: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 12 first_idx 10")
+__msg("mark_precise: frame0: last_idx 12 first_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4")
+__msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop")
 __msg("mark_precise: frame0: parent state regs=r6 stack=:")
-__msg("mark_precise: frame0: last_idx 16 first_idx 0")
-__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit")
-__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
-__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0")
 __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0")
 __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8")
 __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1")
 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+/* State entering callback body popped from states stack */
+__msg("from 9 to 15: frame1:")
+__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("15: (b7) r0 = 0")
+__msg("16: (95) exit")
+__msg("returning from callee:")
+__msg("to caller at 9:")
+/* r4 (flags) is always precise for bpf_loop(),
+ * r6 was marked before backtracking to callback body.
+ */
+__msg("frame 0: propagating r4,r6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
+__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
+__msg("mark_precise: frame0: parent state regs= stack=:")
+__msg("from 16 to 9: safe")
 __naked int parent_callee_saved_reg_precise_with_callback(void)
 {
 	asm volatile (
@@ -373,22 +400,38 @@ __naked int parent_stack_slot_precise_global(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
+/* First simulated path does not include callback body */
 __msg("14: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 14 first_idx 11")
+__msg("mark_precise: frame0: last_idx 14 first_idx 10")
 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4")
 __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop")
 __msg("mark_precise: frame0: parent state regs= stack=-8:")
-__msg("mark_precise: frame0: last_idx 18 first_idx 0")
-__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
-__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
-__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10")
 __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8")
 __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6")
 __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6")
 __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+/* State entering callback body popped from states stack */
+__msg("from 10 to 17: frame1:")
+__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb")
+__msg("17: (b7) r0 = 0")
+__msg("18: (95) exit")
+__msg("returning from callee:")
+__msg("to caller at 10:")
+/* r4 (flags) is always precise for bpf_loop(),
+ * fp-8 was marked before backtracking to callback body.
+ */
+__msg("frame 0: propagating r4,fp-8")
+__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
+__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: parent state regs= stack=:")
+__msg("from 18 to 10: safe")
 __naked int parent_stack_slot_precise_with_callback(void)
 {
 	asm volatile (
-- 
cgit v1.2.3


From cafe2c21508a38cdb3ed22708842e957b2572c3e Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:06:58 +0200
Subject: bpf: widening for callback iterators

Callbacks are similar to open coded iterators, so add imprecise
widening logic for callback body processing. This makes callback based
loops behave identically to open coded iterators, e.g. allowing to
verify programs like below:

  struct ctx { u32 i; };
  int cb(u32 idx, struct ctx* ctx)
  {
          ++ctx->i;
          return 0;
  }
  ...
  struct ctx ctx = { .i = 0 };
  bpf_loop(100, cb, &ctx, 0);
  ...

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-9-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a60dfa56ebb3..2f03e6b11bb9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9799,9 +9799,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
 
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
-	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state, *prev_st;
 	struct bpf_func_state *caller, *callee;
 	struct bpf_reg_state *r0;
+	bool in_callback_fn;
 	int err;
 
 	callee = state->frame[state->curframe];
@@ -9856,7 +9857,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	 * there function call logic would reschedule callback visit. If iteration
 	 * converges is_state_visited() would prune that visit eventually.
 	 */
-	if (callee->in_callback_fn)
+	in_callback_fn = callee->in_callback_fn;
+	if (in_callback_fn)
 		*insn_idx = callee->callsite;
 	else
 		*insn_idx = callee->callsite + 1;
@@ -9871,6 +9873,24 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	 * bpf_throw, this will be done by copy_verifier_state for extra frames. */
 	free_func_state(callee);
 	state->frame[state->curframe--] = NULL;
+
+	/* for callbacks widen imprecise scalars to make programs like below verify:
+	 *
+	 *   struct ctx { int i; }
+	 *   void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+	 *   ...
+	 *   struct ctx = { .i = 0; }
+	 *   bpf_loop(100, cb, &ctx, 0);
+	 *
+	 * This is similar to what is done in process_iter_next_call() for open
+	 * coded iterators.
+	 */
+	prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+	if (prev_st) {
+		err = widen_imprecise_scalars(env, prev_st, state);
+		if (err)
+			return err;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From bb124da69c47dd98d69361ec13244ece50bec63e Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 21 Nov 2023 04:07:00 +0200
Subject: bpf: keep track of max number of bpf_loop callback iterations

In some cases verifier can't infer convergence of the bpf_loop()
iteration. E.g. for the following program:

    static int cb(__u32 idx, struct num_context* ctx)
    {
        ctx->i++;
        return 0;
    }

    SEC("?raw_tp")
    int prog(void *_)
    {
        struct num_context ctx = { .i = 0 };
        __u8 choice_arr[2] = { 0, 1 };

        bpf_loop(2, cb, &ctx, 0);
        return choice_arr[ctx.i];
    }

Each 'cb' simulation would eventually return to 'prog' and reach
'return choice_arr[ctx.i]' statement. At which point ctx.i would be
marked precise, thus forcing verifier to track multitude of separate
states with {.i=0}, {.i=1}, ... at bpf_loop() callback entry.

This commit allows "brute force" handling for such cases by limiting
number of callback body simulations using 'umax' value of the first
bpf_loop() parameter.

For this, extend bpf_func_state with 'callback_depth' field.
Increment this field when callback visiting state is pushed to states
traversal stack. For frame #N it's 'callback_depth' field counts how
many times callback with frame depth N+1 had been executed.
Use bpf_func_state specifically to allow independent tracking of
callback depths when multiple nested bpf_loop() calls are present.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231121020701.26440-11-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       | 11 +++++++
 kernel/bpf/verifier.c                              | 19 ++++++++++--
 .../bpf/progs/verifier_subprog_precision.c         | 35 +++++++++++++++-------
 3 files changed, 53 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index dd326936dd6f..aa4d19d0bc94 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -301,6 +301,17 @@ struct bpf_func_state {
 	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
+	/* For callback calling functions that limit number of possible
+	 * callback executions (e.g. bpf_loop) keeps track of current
+	 * simulated iteration number.
+	 * Value in frame N refers to number of times callback with frame
+	 * N+1 was simulated, e.g. for the following call:
+	 *
+	 *   bpf_loop(..., fn, ...); | suppose current frame is N
+	 *                           | fn would be simulated in frame N+1
+	 *                           | number of simulations is tracked in frame N
+	 */
+	u32 callback_depth;
 
 	/* The following fields should be last. See copy_func_state() */
 	int acquired_refs;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2f03e6b11bb9..af2819d5c8ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9505,6 +9505,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
 		return err;
 
 	callback_state->callback_unroll_depth++;
+	callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+	caller->callback_depth = 0;
 	return 0;
 }
 
@@ -10309,8 +10311,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		break;
 	case BPF_FUNC_loop:
 		update_loop_inline_state(env, meta.subprogno);
-		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
-					 set_loop_callback_state);
+		/* Verifier relies on R1 value to determine if bpf_loop() iteration
+		 * is finished, thus mark it precise.
+		 */
+		err = mark_chain_precision(env, BPF_REG_1);
+		if (err)
+			return err;
+		if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+			err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+						 set_loop_callback_state);
+		} else {
+			cur_func(env)->callback_depth = 0;
+			if (env->log.level & BPF_LOG_LEVEL2)
+				verbose(env, "frame%d bpf_loop iteration limit reached\n",
+					env->cur_state->curframe);
+		}
 		break;
 	case BPF_FUNC_dynptr_from_mem:
 		if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index da803cffb5ef..f61d623b1ce8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -119,7 +119,23 @@ __naked int global_subprog_result_precise(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
-/* First simulated path does not include callback body */
+/* First simulated path does not include callback body,
+ * r1 and r4 are always precise for bpf_loop() calls.
+ */
+__msg("9: (85) call bpf_loop#181")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: parent state regs=r4 stack=:")
+__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
+__msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0")
+__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
+__msg("mark_precise: frame0: parent state regs=r1 stack=:")
+__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9")
+__msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0")
+__msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0")
+__msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8")
+__msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3")
+/* r6 precision propagation */
 __msg("14: (0f) r1 += r6")
 __msg("mark_precise: frame0: last_idx 14 first_idx 9")
 __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7")
@@ -134,10 +150,9 @@ __msg("17: (b7) r0 = 0")
 __msg("18: (95) exit")
 __msg("returning from callee:")
 __msg("to caller at 9:")
-/* r4 (flags) is always precise for bpf_loop() */
-__msg("frame 0: propagating r4")
+__msg("frame 0: propagating r1,r4")
 __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
-__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit")
+__msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit")
 __msg("from 18 to 9: safe")
 __naked int callback_result_precise(void)
 {
@@ -264,12 +279,12 @@ __msg("15: (b7) r0 = 0")
 __msg("16: (95) exit")
 __msg("returning from callee:")
 __msg("to caller at 9:")
-/* r4 (flags) is always precise for bpf_loop(),
+/* r1, r4 are always precise for bpf_loop(),
  * r6 was marked before backtracking to callback body.
  */
-__msg("frame 0: propagating r4,r6")
+__msg("frame 0: propagating r1,r4,r6")
 __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1")
-__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit")
+__msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit")
 __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0")
 __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop")
 __msg("mark_precise: frame0: parent state regs= stack=:")
@@ -422,12 +437,12 @@ __msg("17: (b7) r0 = 0")
 __msg("18: (95) exit")
 __msg("returning from callee:")
 __msg("to caller at 10:")
-/* r4 (flags) is always precise for bpf_loop(),
+/* r1, r4 are always precise for bpf_loop(),
  * fp-8 was marked before backtracking to callback body.
  */
-__msg("frame 0: propagating r4,fp-8")
+__msg("frame 0: propagating r1,r4,fp-8")
 __msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1")
-__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit")
 __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0")
 __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181")
 __msg("mark_precise: frame0: parent state regs= stack=:")
-- 
cgit v1.2.3


From 49277a5b76373e630075ff7d32fc0f9f51294f24 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 Nov 2023 21:18:40 -0500
Subject: workqueue: Move workqueue_set_unbound_cpumask() and its helpers
 inside CONFIG_SYSFS

Commit fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask()
to exclude CPUs from wq_unbound_cpumask") makes
workqueue_set_unbound_cpumask() static as it is not used elsewhere in
the kernel. However, this triggers a kernel test robot warning about
'workqueue_set_unbound_cpumask' defined but not used when CONFIG_SYS
isn't defined. It happens that workqueue_set_unbound_cpumask() is only
called when CONFIG_SYS is defined.

Move workqueue_set_unbound_cpumask() and its helpers inside the
CONFIG_SYSFS compilation block to avoid the warning. There is no
functional change.

Fixes: fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311130831.uh0AoCd1-lkp@intel.com/
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 102 ++++++++++++++++++++++++++---------------------------
 1 file changed, 51 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bd9d34eacd78..2fc585d3d6ca 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4417,19 +4417,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
 	mutex_unlock(&ctx->wq->mutex);
 }
 
-static void apply_wqattrs_lock(void)
-{
-	/* CPUs should stay stable across pwq creations and installations */
-	cpus_read_lock();
-	mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
-	mutex_unlock(&wq_pool_mutex);
-	cpus_read_unlock();
-}
-
 static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
 					const struct workqueue_attrs *attrs)
 {
@@ -5833,44 +5820,6 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
 	return ret;
 }
 
-/**
- *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
- *  @cpumask: the cpumask to set
- *
- *  The low-level workqueues cpumask is a global cpumask that limits
- *  the affinity of all unbound workqueues.  This function check the @cpumask
- *  and apply it to all unbound workqueues and updates all pwqs of them.
- *
- *  Return:	0	- Success
- *  		-EINVAL	- Invalid @cpumask
- *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
- */
-static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
-{
-	int ret = -EINVAL;
-
-	/*
-	 * Not excluding isolated cpus on purpose.
-	 * If the user wishes to include them, we allow that.
-	 */
-	cpumask_and(cpumask, cpumask, cpu_possible_mask);
-	if (!cpumask_empty(cpumask)) {
-		apply_wqattrs_lock();
-		cpumask_copy(wq_requested_unbound_cpumask, cpumask);
-		if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
-			ret = 0;
-			goto out_unlock;
-		}
-
-		ret = workqueue_apply_unbound_cpumask(cpumask);
-
-out_unlock:
-		apply_wqattrs_unlock();
-	}
-
-	return ret;
-}
-
 /**
  * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
  * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
@@ -6027,6 +5976,19 @@ static struct attribute *wq_sysfs_attrs[] = {
 };
 ATTRIBUTE_GROUPS(wq_sysfs);
 
+static void apply_wqattrs_lock(void)
+{
+	/* CPUs should stay stable across pwq creations and installations */
+	cpus_read_lock();
+	mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+	mutex_unlock(&wq_pool_mutex);
+	cpus_read_unlock();
+}
+
 static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
 			    char *buf)
 {
@@ -6203,6 +6165,44 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+/**
+ *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function check the @cpumask
+ *  and apply it to all unbound workqueues and updates all pwqs of them.
+ *
+ *  Return:	0	- Success
+ *		-EINVAL	- Invalid @cpumask
+ *		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
+ */
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+	int ret = -EINVAL;
+
+	/*
+	 * Not excluding isolated cpus on purpose.
+	 * If the user wishes to include them, we allow that.
+	 */
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (!cpumask_empty(cpumask)) {
+		apply_wqattrs_lock();
+		cpumask_copy(wq_requested_unbound_cpumask, cpumask);
+		if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+			ret = 0;
+			goto out_unlock;
+		}
+
+		ret = workqueue_apply_unbound_cpumask(cpumask);
+
+out_unlock:
+		apply_wqattrs_unlock();
+	}
+
+	return ret;
+}
+
 static ssize_t __wq_cpumask_show(struct device *dev,
 		struct device_attribute *attr, char *buf, cpumask_var_t mask)
 {
-- 
cgit v1.2.3


From a89299c40911ee29c6ec4fb66f9c598cd947265b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Nov 2023 13:58:25 +0100
Subject: time: Make sysfs_get_uname() function visible in header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This function is defined globally in clocksource.c and used conditionally
in clockevent.c, which the declaration hidden when clockevent support
is disabled. This causes a harmless warning in the definition:

kernel/time/clocksource.c:1324:9: warning: no previous prototype for 'sysfs_get_uname' [-Wmissing-prototypes]
 1324 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)

Move the declaration out of the #ifdef so it is always visible.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20231108125843.3806765-5-arnd@kernel.org
---
 kernel/time/tick-internal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 649f2b48e8f0..481b7ab65e2c 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
 				     ktime_t expires, bool force);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
 extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
 
 /* Broadcasting support */
 # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -197,3 +196,5 @@ void hrtimers_resume_local(void);
 #else
 #define JIFFIES_SHIFT	8
 #endif
+
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
-- 
cgit v1.2.3


From 4a6c5607d4502ccd1b15b57d57f17d12b6f257a7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Nov 2023 11:39:36 -1000
Subject: workqueue: Make sure that wq_unbound_cpumask is never empty

During boot, depending on how the housekeeping and workqueue.unbound_cpus
masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
("workqueue: Implement non-strict affinity scope for unbound workqueues"),
this may end up feeding -1 as a CPU number into scheduler leading to oopses.

  BUG: unable to handle page fault for address: ffffffff8305e9c0
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  ...
  Call Trace:
   <TASK>
   select_idle_sibling+0x79/0xaf0
   select_task_rq_fair+0x1cb/0x7b0
   try_to_wake_up+0x29c/0x5c0
   wake_up_process+0x19/0x20
   kick_pool+0x5e/0xb0
   __queue_work+0x119/0x430
   queue_work_on+0x29/0x30
  ...

An empty wq_unbound_cpumask is a clear misconfiguration and already
disallowed once system is booted up. Let's warn on and ignore
unbound_cpumask restrictions which lead to no unbound cpus. While at it,
also remove now unncessary empty check on wq_unbound_cpumask in
wq_select_unbound_cpu().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-Tested-by: Yong He <alexyonghe@tencent.com>
Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
Cc: stable@vger.kernel.org # v6.6+
Reviewed-by: Waiman Long <longman@redhat.com>
---
 kernel/workqueue.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..2989b57e154a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu)
 		pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
 	}
 
-	if (cpumask_empty(wq_unbound_cpumask))
-		return cpu;
-
 	new_cpu = __this_cpu_read(wq_rr_cpu_last);
 	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
 	if (unlikely(new_cpu >= nr_cpu_ids)) {
@@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { }
 
 #endif	/* CONFIG_WQ_WATCHDOG */
 
+static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
+{
+	if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
+		pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
+			cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
+		return;
+	}
+
+	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
+}
+
 /**
  * workqueue_init_early - early init for workqueue subsystem
  *
@@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void)
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
-	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
-
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+	restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
+	restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
 	if (!cpumask_empty(&wq_cmdline_cpumask))
-		cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+		restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From 69dcbbd80421a5d8230c178e01869f8e2edb2317 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 10 Oct 2023 09:52:19 -0700
Subject: locktorture: Increase Hamming distance between call_rcu_chain and
 rcu_call_chains

One letter difference is really not enough, so this commit changes
call_rcu_chain to call_rcu_chain_list.

Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/locking/locktorture.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 69d3cd2cfc3b..415d81e6ce70 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -124,7 +124,7 @@ struct call_rcu_chain {
 	struct rcu_head crc_rh;
 	bool crc_stop;
 };
-struct call_rcu_chain *call_rcu_chain;
+struct call_rcu_chain *call_rcu_chain_list;
 
 /* Forward reference. */
 static void lock_torture_cleanup(void);
@@ -1074,12 +1074,12 @@ static int call_rcu_chain_init(void)
 
 	if (call_rcu_chains <= 0)
 		return 0;
-	call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL);
-	if (!call_rcu_chain)
+	call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL);
+	if (!call_rcu_chain_list)
 		return -ENOMEM;
 	for (i = 0; i < call_rcu_chains; i++) {
-		call_rcu_chain[i].crc_stop = false;
-		call_rcu(&call_rcu_chain[i].crc_rh, call_rcu_chain_cb);
+		call_rcu_chain_list[i].crc_stop = false;
+		call_rcu(&call_rcu_chain_list[i].crc_rh, call_rcu_chain_cb);
 	}
 	return 0;
 }
@@ -1089,13 +1089,13 @@ static void call_rcu_chain_cleanup(void)
 {
 	int i;
 
-	if (!call_rcu_chain)
+	if (!call_rcu_chain_list)
 		return;
 	for (i = 0; i < call_rcu_chains; i++)
-		smp_store_release(&call_rcu_chain[i].crc_stop, true);
+		smp_store_release(&call_rcu_chain_list[i].crc_stop, true);
 	rcu_barrier();
-	kfree(call_rcu_chain);
-	call_rcu_chain = NULL;
+	kfree(call_rcu_chain_list);
+	call_rcu_chain_list = NULL;
 }
 
 static void lock_torture_cleanup(void)
-- 
cgit v1.2.3


From 90f1015dfee3d33f8ca7bfe03296d100d465e385 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Fri, 3 Nov 2023 15:26:39 +0800
Subject: rcutorture: Add fqs_holdoff check before fqs_task is created

For rcutorture tests on RCU implementations that support
force-quiescent-state operations and that set the fqs_duration module
parameter greater than zero, the fqs_task kthread will be created.
However, if the fqs_holdoff module parameter is not set, then its default
value of zero will cause fqs_task enter a long-term busy loop until
stopped by kthread_stop().  This commit therefore adds a fqs_holdoff
check before the fqs_task is created, making sure that whenever the
fqs_task is created, the fqs_holdoff will be greater than zero.

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/rcutorture.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30fc9d34e329..a0b2520bd32b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -3872,7 +3872,9 @@ rcu_torture_init(void)
 	}
 	if (fqs_duration < 0)
 		fqs_duration = 0;
-	if (fqs_duration) {
+	if (fqs_holdoff < 0)
+		fqs_holdoff = 0;
+	if (fqs_duration && fqs_holdoff) {
 		/* Create the fqs thread */
 		firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
 						  fqs_task);
-- 
cgit v1.2.3


From 50181c0cff31281b9f1071575ffba8a102375ece Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 22 Nov 2023 15:01:19 +0100
Subject: sched/pelt: Avoid underestimation of task utilization

Lukasz Luba reported that a thread's util_est can significantly decrease as
a result of sharing the CPU with other threads.

The use case can be easily reproduced with a periodic task TA that runs 1ms
and sleeps 100us. When the task is alone on the CPU, its max utilization and
its util_est is around 888. If another similar task starts to run on the
same CPU, TA will have to share the CPU runtime and its maximum utilization
will decrease around half the CPU capacity (512) then TA's util_est will
follow this new maximum trend which is only the result of sharing the CPU
with others tasks.

Such situation can be detected with runnable_avg wich is close or
equal to util_avg when TA is alone, but increases above util_avg when TA
shares the CPU with other threads and wait on the runqueue.

[ We prefer an util_est that overestimate rather than under estimate
  because in 1st case we will not provide enough performance to the
  task which will remain under-provisioned, whereas in the other case we
  will create some idle time which will enable to reduce contention and
  as a result reduces the util_est so the overestimate will be transient
  whereas the underestimate will remain. ]

[ mingo: Refined the changelog, added comments from the LKML discussion. ]

Reported-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/lkml/CAKfTPtDd-HhF-YiNTtL9i5k0PfJbF819Yxu4YquzfXgwi7voyw@mail.gmail.com/#t
Link: https://lore.kernel.org/r/20231122140119.472110-1-vincent.guittot@linaro.org
Cc: Hongyan Xia <hongyan.xia2@arm.com>
---
 kernel/sched/fair.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07f555857698..53dea95ad8c9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4774,6 +4774,11 @@ static inline unsigned long task_util(struct task_struct *p)
 	return READ_ONCE(p->se.avg.util_avg);
 }
 
+static inline unsigned long task_runnable(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.runnable_avg);
+}
+
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
@@ -4892,6 +4897,14 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 
+	/*
+	 * To avoid underestimate of task utilization, skip updates of EWMA if
+	 * we cannot grant that thread got all CPU time it wanted.
+	 */
+	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+		goto done;
+
+
 	/*
 	 * Update Task's estimated utilization
 	 *
-- 
cgit v1.2.3


From 9c0b4bb7f6303c9c4e2e34984c46f5a86478f84d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 22 Nov 2023 14:39:03 +0100
Subject: sched/cpufreq: Rework schedutil governor performance estimation

The current method to take into account uclamp hints when estimating the
target frequency can end in a situation where the selected target
frequency is finally higher than uclamp hints, whereas there are no real
needs. Such cases mainly happen because we are currently mixing the
traditional scheduler utilization signal with the uclamp performance
hints. By adding these 2 metrics, we loose an important information when
it comes to select the target frequency, and we have to make some
assumptions which can't fit all cases.

Rework the interface between the scheduler and schedutil governor in order
to propagate all information down to the cpufreq governor.

effective_cpu_util() interface changes and now returns the actual
utilization of the CPU with 2 optional inputs:

- The minimum performance for this CPU; typically the capacity to handle
  the deadline task and the interrupt pressure. But also uclamp_min
  request when available.

- The maximum targeting performance for this CPU which reflects the
  maximum level that we would like to not exceed. By default it will be
  the CPU capacity but can be reduced because of some performance hints
  set with uclamp. The value can be lower than actual utilization and/or
  min performance level.

A new sugov_effective_cpu_perf() interface is also available to compute
the final performance level that is targeted for the CPU, after applying
some cpufreq headroom and taking into account all inputs.

With these 2 functions, schedutil is now able to decide when it must go
above uclamp hints. It now also has a generic way to get the min
performance level.

The dependency between energy model and cpufreq governor and its headroom
policy doesn't exist anymore.

eenv_pd_max_util() asks schedutil for the targeted performance after
applying the impact of the waking task.

[ mingo: Refined the changelog & C comments. ]

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org
---
 include/linux/energy_model.h     |  1 -
 kernel/sched/core.c              | 90 +++++++++++++++++-----------------------
 kernel/sched/cpufreq_schedutil.c | 35 +++++++++++-----
 kernel/sched/fair.c              | 22 ++++++++--
 kernel/sched/sched.h             | 24 ++++-------
 5 files changed, 89 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index b9caa01dfac4..adec808b371a 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -243,7 +243,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	scale_cpu = arch_scale_cpu_capacity(cpu);
 	ps = &pd->table[pd->nr_perf_states - 1];
 
-	max_util = map_util_perf(max_util);
 	max_util = min(max_util, allowed_cpu_cap);
 	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2de77a6d5ef8..db4be4921e7f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7467,18 +7467,13 @@ int sched_core_idle_cpu(int cpu)
  * required to meet deadlines.
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 enum cpu_util_type type,
-				 struct task_struct *p)
+				 unsigned long *min,
+				 unsigned long *max)
 {
-	unsigned long dl_util, util, irq, max;
+	unsigned long util, irq, scale;
 	struct rq *rq = cpu_rq(cpu);
 
-	max = arch_scale_cpu_capacity(cpu);
-
-	if (!uclamp_is_used() &&
-	    type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
-		return max;
-	}
+	scale = arch_scale_cpu_capacity(cpu);
 
 	/*
 	 * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7486,45 +7481,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	 * update_irq_load_avg().
 	 */
 	irq = cpu_util_irq(rq);
-	if (unlikely(irq >= max))
-		return max;
+	if (unlikely(irq >= scale)) {
+		if (min)
+			*min = scale;
+		if (max)
+			*max = scale;
+		return scale;
+	}
+
+	if (min) {
+		/*
+		 * The minimum utilization returns the highest level between:
+		 * - the computed DL bandwidth needed with the IRQ pressure which
+		 *   steals time to the deadline task.
+		 * - The minimum performance requirement for CFS and/or RT.
+		 */
+		*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+		/*
+		 * When an RT task is runnable and uclamp is not used, we must
+		 * ensure that the task will run at maximum compute capacity.
+		 */
+		if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+			*min = max(*min, scale);
+	}
 
 	/*
 	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
 	 * CFS tasks and we use the same metric to track the effective
 	 * utilization (PELT windows are synchronized) we can directly add them
 	 * to obtain the CPU's actual utilization.
-	 *
-	 * CFS and RT utilization can be boosted or capped, depending on
-	 * utilization clamp constraints requested by currently RUNNABLE
-	 * tasks.
-	 * When there are no CFS RUNNABLE tasks, clamps are released and
-	 * frequency will be gracefully reduced with the utilization decay.
 	 */
 	util = util_cfs + cpu_util_rt(rq);
-	if (type == FREQUENCY_UTIL)
-		util = uclamp_rq_util_with(rq, util, p);
-
-	dl_util = cpu_util_dl(rq);
+	util += cpu_util_dl(rq);
 
 	/*
-	 * For frequency selection we do not make cpu_util_dl() a permanent part
-	 * of this sum because we want to use cpu_bw_dl() later on, but we need
-	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
-	 * that we select f_max when there is no idle time.
-	 *
-	 * NOTE: numerical errors or stop class might cause us to not quite hit
-	 * saturation when we should -- something for later.
+	 * The maximum hint is a soft bandwidth requirement, which can be lower
+	 * than the actual utilization because of uclamp_max requirements.
 	 */
-	if (util + dl_util >= max)
-		return max;
+	if (max)
+		*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
 
-	/*
-	 * OTOH, for energy computation we need the estimated running time, so
-	 * include util_dl and ignore dl_bw.
-	 */
-	if (type == ENERGY_UTIL)
-		util += dl_util;
+	if (util >= scale)
+		return scale;
 
 	/*
 	 * There is still idle time; further improve the number by using the
@@ -7535,28 +7534,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	 *   U' = irq + --------- * U
 	 *                 max
 	 */
-	util = scale_irq_capacity(util, irq, max);
+	util = scale_irq_capacity(util, irq, scale);
 	util += irq;
 
-	/*
-	 * Bandwidth required by DEADLINE must always be granted while, for
-	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
-	 * to gracefully reduce the frequency when no tasks show up for longer
-	 * periods of time.
-	 *
-	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
-	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
-	 * an interface. So, we only do the latter for now.
-	 */
-	if (type == FREQUENCY_UTIL)
-		util += cpu_bw_dl(rq);
-
-	return min(max, util);
+	return min(scale, util);
 }
 
 unsigned long sched_cpu_util(int cpu)
 {
-	return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
 }
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5888176354e2..f3acf2cf26ed 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,7 +47,7 @@ struct sugov_cpu {
 	u64			last_update;
 
 	unsigned long		util;
-	unsigned long		bw_dl;
+	unsigned long		bw_min;
 
 	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
@@ -143,7 +143,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
-	util = map_util_perf(util);
 	freq = map_util_freq(util, freq, max);
 
 	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
@@ -153,14 +152,30 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+				 unsigned long min,
+				 unsigned long max)
+{
+	/* Add dvfs headroom to actual utilization */
+	actual = map_util_perf(actual);
+	/* Actually we don't need to target the max performance */
+	if (actual < max)
+		max = actual;
+
+	/*
+	 * Ensure at least minimum performance while providing more compute
+	 * capacity when possible.
+	 */
+	return max(min, max);
+}
+
 static void sugov_get_util(struct sugov_cpu *sg_cpu)
 {
-	unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
-	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
 
-	sg_cpu->bw_dl = cpu_bw_dl(rq);
-	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
-					  FREQUENCY_UTIL, NULL);
+	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+	sg_cpu->bw_min = min;
+	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
 }
 
 /**
@@ -306,7 +321,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
  */
 static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
 {
-	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
+	if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
 		sg_cpu->sg_policy->limits_changed = true;
 }
 
@@ -407,8 +422,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	    sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
 		sg_cpu->util = prev_util;
 
-	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
-				   map_util_perf(sg_cpu->util), max_cap);
+	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+				   sg_cpu->util, max_cap);
 
 	sg_cpu->sg_policy->last_freq_update_time = time;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 53dea95ad8c9..34fe6e9490c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7793,7 +7793,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
 	for_each_cpu(cpu, pd_cpus) {
 		unsigned long util = cpu_util(cpu, p, -1, 0);
 
-		busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+		busy_time += effective_cpu_util(cpu, util, NULL, NULL);
 	}
 
 	eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
@@ -7816,7 +7816,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 	for_each_cpu(cpu, pd_cpus) {
 		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
 		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
-		unsigned long eff_util;
+		unsigned long eff_util, min, max;
 
 		/*
 		 * Performance domain frequency: utilization clamping
@@ -7825,7 +7825,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 		 * NOTE: in case RT tasks are running, by default the
 		 * FREQUENCY_UTIL's utilization can be max OPP.
 		 */
-		eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+		eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+		/* Task's uclamp can modify min and max value */
+		if (tsk && uclamp_is_used()) {
+			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+			/*
+			 * If there is no active max uclamp constraint,
+			 * directly use task's one, otherwise keep max.
+			 */
+			if (uclamp_rq_is_idle(cpu_rq(cpu)))
+				max = uclamp_eff_value(p, UCLAMP_MAX);
+			else
+				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+		}
+
+		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
 		max_util = max(max_util, eff_util);
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8a70d51ffa33..c1574cd388e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2994,24 +2994,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif
 
 #ifdef CONFIG_SMP
-/**
- * enum cpu_util_type - CPU utilization type
- * @FREQUENCY_UTIL:	Utilization used to select frequency
- * @ENERGY_UTIL:	Utilization used during energy calculation
- *
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within effective_cpu_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
- */
-enum cpu_util_type {
-	FREQUENCY_UTIL,
-	ENERGY_UTIL,
-};
-
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 enum cpu_util_type type,
-				 struct task_struct *p);
+				 unsigned long *min,
+				 unsigned long *max);
+
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+				 unsigned long min,
+				 unsigned long max);
+
 
 /*
  * Verify the fitness of task @p to run on @cpu taking into account the
-- 
cgit v1.2.3


From f12560779f9d734446508f3df17f5632e9aaa2c8 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 22 Nov 2023 14:39:04 +0100
Subject: sched/cpufreq: Rework iowait boost

Use the max value that has already been computed inside sugov_get_util()
to cap the iowait boost and remove dependency with uclamp_rq_util_with()
which is not used anymore.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20231122133904.446032-3-vincent.guittot@linaro.org
---
 kernel/sched/cpufreq_schedutil.c | 29 ++++++++++---------
 kernel/sched/sched.h             | 60 ----------------------------------------
 2 files changed, 14 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f3acf2cf26ed..4ee8ad70be99 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -169,11 +169,12 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
 	return max(min, max);
 }
 
-static void sugov_get_util(struct sugov_cpu *sg_cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
 {
 	unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
 
 	util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+	util = max(util, boost);
 	sg_cpu->bw_min = min;
 	sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
 }
@@ -266,18 +267,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
  * This mechanism is designed to boost high frequently IO waiting tasks, while
  * being more conservative on tasks which does sporadic IO operations.
  */
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
 			       unsigned long max_cap)
 {
-	unsigned long boost;
-
 	/* No boost currently required */
 	if (!sg_cpu->iowait_boost)
-		return;
+		return 0;
 
 	/* Reset boost if the CPU appears to have been idle enough */
 	if (sugov_iowait_reset(sg_cpu, time, false))
-		return;
+		return 0;
 
 	if (!sg_cpu->iowait_boost_pending) {
 		/*
@@ -286,7 +285,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
 		sg_cpu->iowait_boost >>= 1;
 		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
 			sg_cpu->iowait_boost = 0;
-			return;
+			return 0;
 		}
 	}
 
@@ -296,10 +295,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
 	 * sg_cpu->util is already in capacity scale; convert iowait_boost
 	 * into the same scale so we can compare.
 	 */
-	boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
-	boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
-	if (sg_cpu->util < boost)
-		sg_cpu->util = boost;
+	return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -329,6 +325,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
 					      u64 time, unsigned long max_cap,
 					      unsigned int flags)
 {
+	unsigned long boost;
+
 	sugov_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
@@ -337,8 +335,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
 	if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
 		return false;
 
-	sugov_get_util(sg_cpu);
-	sugov_iowait_apply(sg_cpu, time, max_cap);
+	boost = sugov_iowait_apply(sg_cpu, time, max_cap);
+	sugov_get_util(sg_cpu, boost);
 
 	return true;
 }
@@ -439,9 +437,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 
 	for_each_cpu(j, policy->cpus) {
 		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+		unsigned long boost;
 
-		sugov_get_util(j_sg_cpu);
-		sugov_iowait_apply(j_sg_cpu, time, max_cap);
+		boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
+		sugov_get_util(j_sg_cpu, boost);
 
 		util = max(j_sg_cpu->util, util);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c1574cd388e7..e58a54bda77d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3058,59 +3058,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
 	return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
 }
 
-/**
- * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
- * @rq:		The rq to clamp against. Must not be NULL.
- * @util:	The util value to clamp.
- * @p:		The task to clamp against. Can be NULL if you want to clamp
- *		against @rq only.
- *
- * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
- *
- * If sched_uclamp_used static key is disabled, then just return the util
- * without any clamping since uclamp aggregation at the rq level in the fast
- * path is disabled, rendering this operation a NOP.
- *
- * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
- * will return the correct effective uclamp value of the task even if the
- * static key is disabled.
- */
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
-				  struct task_struct *p)
-{
-	unsigned long min_util = 0;
-	unsigned long max_util = 0;
-
-	if (!static_branch_likely(&sched_uclamp_used))
-		return util;
-
-	if (p) {
-		min_util = uclamp_eff_value(p, UCLAMP_MIN);
-		max_util = uclamp_eff_value(p, UCLAMP_MAX);
-
-		/*
-		 * Ignore last runnable task's max clamp, as this task will
-		 * reset it. Similarly, no need to read the rq's min clamp.
-		 */
-		if (uclamp_rq_is_idle(rq))
-			goto out;
-	}
-
-	min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
-	max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
-out:
-	/*
-	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
-	 * RUNNABLE tasks with _different_ clamps, we can end up with an
-	 * inversion. Fix it now when the clamps are applied.
-	 */
-	if (unlikely(min_util >= max_util))
-		return min_util;
-
-	return clamp(util, min_util, max_util);
-}
-
 /* Is the rq being capped/throttled by uclamp_max? */
 static inline bool uclamp_rq_is_capped(struct rq *rq)
 {
@@ -3148,13 +3095,6 @@ static inline unsigned long uclamp_eff_value(struct task_struct *p,
 	return SCHED_CAPACITY_SCALE;
 }
 
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
-				  struct task_struct *p)
-{
-	return util;
-}
-
 static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
 
 static inline bool uclamp_is_used(void)
-- 
cgit v1.2.3


From 388a1fb7da6aaa1970c7e2a7d7fcd983a87a8484 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Nov 2023 11:07:56 +0100
Subject: perf: Fix the nr_addr_filters fix

Thomas reported that commit 652ffc2104ec ("perf/core: Fix narrow
startup race when creating the perf nr_addr_filters sysfs file") made
the entire attribute group vanish, instead of only the nr_addr_filters
attribute.

Additionally a stray return.

Insufficient coffee was involved with both writing and merging the
patch.

Fixes: 652ffc2104ec ("perf/core: Fix narrow startup race when creating the perf nr_addr_filters sysfs file")
Reported-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Link: https://lkml.kernel.org/r/20231122100756.GP8262@noisy.programming.kicks-ass.net
---
 kernel/events/core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4f0c45ab8d7d..59b332cce9e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11417,12 +11417,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
 	struct device *dev = kobj_to_dev(kobj);
 	struct pmu *pmu = dev_get_drvdata(dev);
 
-	if (!pmu->nr_addr_filters)
+	if (n == 2 && !pmu->nr_addr_filters)
 		return 0;
 
 	return a->mode;
-
-	return 0;
 }
 
 static struct attribute_group pmu_dev_attr_group = {
-- 
cgit v1.2.3


From 491dd8edecbc5027ee317f3f1e7e9800fb66d88f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 23 Nov 2023 19:59:35 -0800
Subject: bpf: Emit global subprog name in verifier logs

We have the name, instead of emitting just func#N to identify global
subprog, augment verifier log messages with actual function name to make
it more user-friendly.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231124035937.403208-2-andrii@kernel.org
---
 kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 405da1f9e724..a2939ebf2638 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -339,6 +339,11 @@ struct bpf_kfunc_call_arg_meta {
 
 struct btf *btf_vmlinux;
 
+static const char *btf_type_name(const struct btf *btf, u32 id)
+{
+	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
+}
+
 static DEFINE_MUTEX(bpf_verifier_lock);
 static DEFINE_MUTEX(bpf_percpu_ma_lock);
 
@@ -418,6 +423,17 @@ static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog)
 	return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL;
 }
 
+static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
+{
+	struct bpf_func_info *info;
+
+	if (!env->prog->aux->func_info)
+		return "";
+
+	info = &env->prog->aux->func_info[subprog];
+	return btf_type_name(env->prog->aux->btf, info->type_id);
+}
+
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -587,11 +603,6 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
 }
 
-static const char *btf_type_name(const struct btf *btf, u32 id)
-{
-	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
-}
-
 static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 {
 	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -9269,13 +9280,16 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	if (err == -EFAULT)
 		return err;
 	if (subprog_is_global(env, subprog)) {
+		const char *sub_name = subprog_name(env, subprog);
+
 		if (err) {
-			verbose(env, "Caller passes invalid args into func#%d\n", subprog);
+			verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
+				subprog, sub_name);
 			return err;
 		}
 
-		if (env->log.level & BPF_LOG_LEVEL)
-			verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
+		verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
+			subprog, sub_name);
 		clear_caller_saved_regs(env, caller->regs);
 
 		/* All global functions return a 64-bit SCALAR_VALUE */
@@ -19893,9 +19907,8 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
-			verbose(env,
-				"Func#%d is safe for any args that match its prototype\n",
-				i);
+			verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
+				i, subprog_name(env, i));
 		}
 	}
 	return 0;
-- 
cgit v1.2.3


From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 23 Nov 2023 19:59:36 -0800
Subject: bpf: Validate global subprogs lazily

Slightly change BPF verifier logic around eagerness and order of global
subprog validation. Instead of going over every global subprog eagerly
and validating it before main (entry) BPF program is verified, turn it
around. Validate main program first, mark subprogs that were called from
main program for later verification, but otherwise assume it is valid.
Afterwards, go over marked global subprogs and validate those,
potentially marking some more global functions as being called. Continue
this process until all (transitively) callable global subprogs are
validated. It's a BFS traversal at its heart and will always converge.

This is an important change because it allows to feature-gate some
subprograms that might not be verifiable on some older kernel, depending
on supported set of features.

E.g., at some point, global functions were allowed to accept a pointer
to memory, which size is identified by user-provided type.
Unfortunately, older kernels don't support this feature. With BPF CO-RE
approach, the natural way would be to still compile BPF object file once
and guard calls to this global subprog with some CO-RE check or using
.rodata variables. That's what people do to guard usage of new helpers
or kfuncs, and any other new BPF-side feature that might be missing on
old kernels.

That's currently impossible to do with global subprogs, unfortunately,
because they are eagerly and unconditionally validated. This patch set
aims to change this, so that in the future when global funcs gain new
features, those can be guarded using BPF CO-RE techniques in the same
fashion as any other new kernel feature.

Two selftests had to be adjusted in sync with these changes.

test_global_func12 relied on eager global subprog validation failing
before main program failure is detected (unknown return value). Fix by
making sure that main program is always valid.

verifier_subprog_precision's parent_stack_slot_precise subtest relied on
verifier checkpointing heuristic to do a checkpoint at instruction #5,
but that's no longer true because we don't have enough jumps validated
before reaching insn #5 due to global subprogs being validated later.

Other than that, no changes, as one would expect.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org
---
 include/linux/bpf.h                                |  2 +
 kernel/bpf/verifier.c                              | 48 +++++++++++++++++++---
 .../selftests/bpf/progs/test_global_func12.c       |  4 +-
 .../bpf/progs/verifier_subprog_precision.c         |  4 +-
 4 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 258ba232e302..eb447b0a9423 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 struct bpf_func_info_aux {
 	u16 linkage;
 	bool unreliable;
+	bool called : 1;
+	bool verified : 1;
 };
 
 enum bpf_jit_poke_reason {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2939ebf2638..8e7b6072e3f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -434,6 +434,11 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
 	return btf_type_name(env->prog->aux->btf, info->type_id);
 }
 
+static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog)
+{
+	return &env->prog->aux->func_info_aux[subprog];
+}
+
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -9290,6 +9295,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 		verbose(env, "Func#%d ('%s') is global and assumed valid.\n",
 			subprog, sub_name);
+		/* mark global subprog for verifying after main prog */
+		subprog_aux(env, subprog)->called = true;
 		clear_caller_saved_regs(env, caller->regs);
 
 		/* All global functions return a 64-bit SCALAR_VALUE */
@@ -19873,8 +19880,11 @@ out:
 	return ret;
 }
 
-/* Verify all global functions in a BPF program one by one based on their BTF.
- * All global functions must pass verification. Otherwise the whole program is rejected.
+/* Lazily verify all global functions based on their BTF, if they are called
+ * from main BPF program or any of subprograms transitively.
+ * BPF global subprogs called from dead code are not validated.
+ * All callable global functions must pass verification.
+ * Otherwise the whole program is rejected.
  * Consider:
  * int bar(int);
  * int foo(int f)
@@ -19893,14 +19903,26 @@ out:
 static int do_check_subprogs(struct bpf_verifier_env *env)
 {
 	struct bpf_prog_aux *aux = env->prog->aux;
-	int i, ret;
+	struct bpf_func_info_aux *sub_aux;
+	int i, ret, new_cnt;
 
 	if (!aux->func_info)
 		return 0;
 
+	/* exception callback is presumed to be always called */
+	if (env->exception_callback_subprog)
+		subprog_aux(env, env->exception_callback_subprog)->called = true;
+
+again:
+	new_cnt = 0;
 	for (i = 1; i < env->subprog_cnt; i++) {
-		if (aux->func_info_aux[i].linkage != BTF_FUNC_GLOBAL)
+		if (!subprog_is_global(env, i))
+			continue;
+
+		sub_aux = subprog_aux(env, i);
+		if (!sub_aux->called || sub_aux->verified)
 			continue;
+
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
 		ret = do_check_common(env, i, env->exception_callback_subprog == i);
@@ -19910,7 +19932,21 @@ static int do_check_subprogs(struct bpf_verifier_env *env)
 			verbose(env, "Func#%d ('%s') is safe for any args that match its prototype\n",
 				i, subprog_name(env, i));
 		}
+
+		/* We verified new global subprog, it might have called some
+		 * more global subprogs that we haven't verified yet, so we
+		 * need to do another pass over subprogs to verify those.
+		 */
+		sub_aux->verified = true;
+		new_cnt++;
 	}
+
+	/* We can't loop forever as we verify at least one global subprog on
+	 * each pass.
+	 */
+	if (new_cnt)
+		goto again;
+
 	return 0;
 }
 
@@ -20556,8 +20592,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret < 0)
 		goto skip_full_check;
 
-	ret = do_check_subprogs(env);
-	ret = ret ?: do_check_main(env);
+	ret = do_check_main(env);
+	ret = ret ?: do_check_subprogs(env);
 
 	if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))
 		ret = bpf_prog_offload_finalize(env);
diff --git a/tools/testing/selftests/bpf/progs/test_global_func12.c b/tools/testing/selftests/bpf/progs/test_global_func12.c
index 7f159d83c6f6..6e03d42519a6 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func12.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func12.c
@@ -19,5 +19,7 @@ int global_func12(struct __sk_buff *skb)
 {
 	const struct S s = {.x = skb->len };
 
-	return foo(&s);
+	foo(&s);
+
+	return 1;
 }
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index f61d623b1ce8..b5efcaeaa1ae 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -370,12 +370,10 @@ __naked int parent_stack_slot_precise(void)
 SEC("?raw_tp")
 __success __log_level(2)
 __msg("9: (0f) r1 += r6")
-__msg("mark_precise: frame0: last_idx 9 first_idx 6")
+__msg("mark_precise: frame0: last_idx 9 first_idx 0")
 __msg("mark_precise: frame0: regs=r6 stack= before 8: (bf) r1 = r7")
 __msg("mark_precise: frame0: regs=r6 stack= before 7: (27) r6 *= 4")
 __msg("mark_precise: frame0: regs=r6 stack= before 6: (79) r6 = *(u64 *)(r10 -8)")
-__msg("mark_precise: frame0: parent state regs= stack=-8:")
-__msg("mark_precise: frame0: last_idx 5 first_idx 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 5: (85) call pc+6")
 __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r1 = 0")
 __msg("mark_precise: frame0: regs= stack=-8 before 3: (7b) *(u64 *)(r10 -8) = r6")
-- 
cgit v1.2.3


From bca4104b00fec60be330cd32818dd5c70db3d469 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 21 Nov 2023 12:41:26 +0100
Subject: lockdep: Fix block chain corruption

Kent reported an occasional KASAN splat in lockdep. Mark then noted:

> I suspect the dodgy access is to chain_block_buckets[-1], which hits the last 4
> bytes of the redzone and gets (incorrectly/misleadingly) attributed to
> nr_large_chain_blocks.

That would mean @size == 0, at which point size_to_bucket() returns -1
and the above happens.

alloc_chain_hlocks() has 'size - req', for the first with the
precondition 'size >= rq', which allows the 0.

This code is trying to split a block, del_chain_block() takes what we
need, and add_chain_block() puts back the remainder, except in the
above case the remainder is 0 sized and things go sideways.

Fixes: 810507fe6fd5 ("locking/lockdep: Reuse freed chain_hlocks entries")
Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kent Overstreet <kent.overstreet@linux.dev>
Link: https://lkml.kernel.org/r/20231121114126.GH8262@noisy.programming.kicks-ass.net
---
 kernel/locking/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e85b5ad3e206..151bd3de5936 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3497,7 +3497,8 @@ static int alloc_chain_hlocks(int req)
 		size = chain_block_size(curr);
 		if (likely(size >= req)) {
 			del_chain_block(0, size, chain_block_next(curr));
-			add_chain_block(curr + req, size - req);
+			if (size > req)
+				add_chain_block(curr + req, size - req);
 			return curr;
 		}
 	}
-- 
cgit v1.2.3


From 75a442581d05edaee168222ffbe00d4389785636 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Sat, 11 Nov 2023 12:38:21 +0800
Subject: bpf: Add missed allocation hint for bpf_mem_cache_alloc_flags()

bpf_mem_cache_alloc_flags() may call __alloc() directly when there is no
free object in free list, but it doesn't initialize the allocation hint
for the returned pointer. It may lead to bad memory dereference when
freeing the pointer, so fix it by initializing the allocation hint.

Fixes: 822fb26bdb55 ("bpf: Add a hint to allocated objects.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231111043821.2258513-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 63b909d277d4..6a51cfe4c2d6 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -978,6 +978,8 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 		memcg = get_memcg(c);
 		old_memcg = set_active_memcg(memcg);
 		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+		if (ret)
+			*(struct bpf_mem_cache **)ret = c;
 		set_active_memcg(old_memcg);
 		mem_cgroup_put(memcg);
 	}
-- 
cgit v1.2.3


From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 22 Nov 2023 13:44:39 +0100
Subject: mnt_idmapping: decouple from namespaces

There's no reason we need to couple mnt idmapping to namespaces in the
way we currently do. Copy the idmapping when an idmapped mount is
created and don't take any reference on the namespace at all.

We also can't easily refcount struct uid_gid_map because it needs to
stay the size of a cacheline otherwise we risk performance regressions
(Ignoring for a second that right now struct uid_gid_map isn't actually
 64 byte but 72 but that's a fix for another patch series.).

Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mnt_idmapping.c      | 106 +++++++++++++++++++++++++++++++++++++++++-------
 include/linux/uidgid.h  |  13 ++++++
 kernel/user_namespace.c |   4 +-
 3 files changed, 106 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 35d78cb3c38a..64c5205e2b5e 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -9,8 +9,16 @@
 
 #include "internal.h"
 
+/*
+ * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
+ * never from raw values. These are just internal helpers.
+ */
+#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
+#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }
+
 struct mnt_idmap {
-	struct user_namespace *owner;
+	struct uid_gid_map uid_map;
+	struct uid_gid_map gid_map;
 	refcount_t count;
 };
 
@@ -20,7 +28,6 @@ struct mnt_idmap {
  * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
  */
 struct mnt_idmap nop_mnt_idmap = {
-	.owner	= &init_user_ns,
 	.count	= REFCOUNT_INIT(1),
 };
 EXPORT_SYMBOL_GPL(nop_mnt_idmap);
@@ -65,7 +72,6 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		     kuid_t kuid)
 {
 	uid_t uid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return VFSUIDT_INIT(kuid);
@@ -75,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
 		uid = from_kuid(fs_userns, kuid);
 	if (uid == (uid_t)-1)
 		return INVALID_VFSUID;
-	return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+	return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid));
 }
 EXPORT_SYMBOL_GPL(make_vfsuid);
 
@@ -103,7 +109,6 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
 		     struct user_namespace *fs_userns, kgid_t kgid)
 {
 	gid_t gid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return VFSGIDT_INIT(kgid);
@@ -113,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
 		gid = from_kgid(fs_userns, kgid);
 	if (gid == (gid_t)-1)
 		return INVALID_VFSGID;
-	return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+	return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid));
 }
 EXPORT_SYMBOL_GPL(make_vfsgid);
 
@@ -132,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap,
 		   struct user_namespace *fs_userns, vfsuid_t vfsuid)
 {
 	uid_t uid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return AS_KUIDT(vfsuid);
-	uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+	uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
 	if (uid == (uid_t)-1)
 		return INVALID_UID;
 	if (initial_idmapping(fs_userns))
@@ -160,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap,
 		   struct user_namespace *fs_userns, vfsgid_t vfsgid)
 {
 	gid_t gid;
-	struct user_namespace *mnt_userns = idmap->owner;
 
 	if (idmap == &nop_mnt_idmap)
 		return AS_KGIDT(vfsgid);
-	gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+	gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
 	if (gid == (gid_t)-1)
 		return INVALID_GID;
 	if (initial_idmapping(fs_userns))
@@ -195,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid)
 #endif
 EXPORT_SYMBOL_GPL(vfsgid_in_group_p);
 
+static int copy_mnt_idmap(struct uid_gid_map *map_from,
+			  struct uid_gid_map *map_to)
+{
+	struct uid_gid_extent *forward, *reverse;
+	u32 nr_extents = READ_ONCE(map_from->nr_extents);
+	/* Pairs with smp_wmb() when writing the idmapping. */
+	smp_rmb();
+
+	/*
+	 * Don't blindly copy @map_to into @map_from if nr_extents is
+	 * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
+	 * read @nr_extents someone could have written an idmapping and
+	 * then we might end up with inconsistent data. So just don't do
+	 * anything at all.
+	 */
+	if (nr_extents == 0)
+		return 0;
+
+	/*
+	 * Here we know that nr_extents is greater than zero which means
+	 * a map has been written. Since idmappings can't be changed
+	 * once they have been written we know that we can safely copy
+	 * from @map_to into @map_from.
+	 */
+
+	if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+		*map_to = *map_from;
+		return 0;
+	}
+
+	forward = kmemdup(map_from->forward,
+			  nr_extents * sizeof(struct uid_gid_extent),
+			  GFP_KERNEL_ACCOUNT);
+	if (!forward)
+		return -ENOMEM;
+
+	reverse = kmemdup(map_from->reverse,
+			  nr_extents * sizeof(struct uid_gid_extent),
+			  GFP_KERNEL_ACCOUNT);
+	if (!reverse) {
+		kfree(forward);
+		return -ENOMEM;
+	}
+
+	/*
+	 * The idmapping isn't exposed anywhere so we don't need to care
+	 * about ordering between extent pointers and @nr_extents
+	 * initialization.
+	 */
+	map_to->forward = forward;
+	map_to->reverse = reverse;
+	map_to->nr_extents = nr_extents;
+	return 0;
+}
+
+static void free_mnt_idmap(struct mnt_idmap *idmap)
+{
+	if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(idmap->uid_map.forward);
+		kfree(idmap->uid_map.reverse);
+	}
+	if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(idmap->gid_map.forward);
+		kfree(idmap->gid_map.reverse);
+	}
+	kfree(idmap);
+}
+
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
 {
 	struct mnt_idmap *idmap;
+	int ret;
 
 	idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
 	if (!idmap)
 		return ERR_PTR(-ENOMEM);
 
-	idmap->owner = get_user_ns(mnt_userns);
 	refcount_set(&idmap->count, 1);
+	ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
+	if (!ret)
+		ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
+	if (ret) {
+		free_mnt_idmap(idmap);
+		idmap = ERR_PTR(ret);
+	}
 	return idmap;
 }
 
@@ -234,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get);
  */
 void mnt_idmap_put(struct mnt_idmap *idmap)
 {
-	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
-		put_user_ns(idmap->owner);
-		kfree(idmap);
-	}
+	if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
+		free_mnt_idmap(idmap);
 }
 EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index b0542cd11aeb..415a7ca2b882 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -17,6 +17,7 @@
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
+struct uid_gid_map;
 
 typedef struct {
 	uid_t val;
@@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return from_kgid(ns, gid) != (gid_t) -1;
 }
 
+u32 map_id_down(struct uid_gid_map *map, u32 id);
+u32 map_id_up(struct uid_gid_map *map, u32 id);
+
 #else
 
 static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
@@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 	return gid_valid(gid);
 }
 
+static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
+
+static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	return id;
+}
 #endif /* CONFIG_USER_NS */
 
 #endif /* _LINUX_UIDGID_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 625101249e4d..ce4d99df5f0e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -332,7 +332,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
 	return map_id_range_down(map, id, 1);
 }
@@ -375,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
 		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_up(struct uid_gid_map *map, u32 id)
 {
 	struct uid_gid_extent *extent;
 	unsigned extents = map->nr_extents;
-- 
cgit v1.2.3


From 877c737db9355acaa1ec2fd2b8dbdaff82605df7 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 27 Nov 2023 14:51:05 -0500
Subject: cgroup/cpuset: Expose cpuset.cpus.isolated

The root-only cpuset.cpus.isolated control file shows the current set
of isolated CPUs in isolated partitions. This control file is currently
exposed only with the cgroup_debug boot command line option which also
adds the ".__DEBUG__." prefix. This is actually a useful control file if
users want to find out which CPUs are currently in an isolated state by
the cpuset controller. Remove CFTYPE_DEBUG flag for this control file and
make it available by default without any prefix.

The test_cpuset_prs.sh test script and the cgroup-v2.rst documentation
file are also updated accordingly. Minor code change is also made in
test_cpuset_prs.sh to avoid false test failure when running on debug
kernel.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst           |  7 +++++
 kernel/cgroup/cpuset.c                            |  2 +-
 tools/testing/selftests/cgroup/test_cpuset_prs.sh | 32 +++++++++++++----------
 3 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index cf5651a11df8..30f6ff2eba47 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2316,6 +2316,13 @@ Cpuset Interface Files
 	treated to have an implicit value of "cpuset.cpus" in the
 	formation of local partition.
 
+  cpuset.cpus.isolated
+	A read-only and root cgroup only multiple values file.
+
+	This file shows the set of all isolated CPUs used in existing
+	isolated partitions. It will be empty if no isolated partition
+	is created.
+
   cpuset.cpus.partition
 	A read-write single value file which exists on non-root
 	cpuset-enabled cgroups.  This flag is owned by the parent cgroup
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1bad4007ff4b..2a16df86c55c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3974,7 +3974,7 @@ static struct cftype dfl_files[] = {
 		.name = "cpus.isolated",
 		.seq_show = cpuset_common_seq_show,
 		.private = FILE_ISOLATED_CPULIST,
-		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{ }	/* terminate */
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index 7b7c4c2b6d85..b5eb1be2248c 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -508,7 +508,7 @@ dump_states()
 		XECPUS=$DIR/cpuset.cpus.exclusive.effective
 		PRS=$DIR/cpuset.cpus.partition
 		PCPUS=$DIR/.__DEBUG__.cpuset.cpus.subpartitions
-		ISCPUS=$DIR/.__DEBUG__.cpuset.cpus.isolated
+		ISCPUS=$DIR/cpuset.cpus.isolated
 		[[ -e $CPUS   ]] && echo "$CPUS: $(cat $CPUS)"
 		[[ -e $XCPUS  ]] && echo "$XCPUS: $(cat $XCPUS)"
 		[[ -e $ECPUS  ]] && echo "$ECPUS: $(cat $ECPUS)"
@@ -593,17 +593,17 @@ check_cgroup_states()
 
 #
 # Get isolated (including offline) CPUs by looking at
-# /sys/kernel/debug/sched/domains and *cpuset.cpus.isolated control file,
+# /sys/kernel/debug/sched/domains and cpuset.cpus.isolated control file,
 # if available, and compare that with the expected value.
 #
 # Note that isolated CPUs from the sched/domains context include offline
 # CPUs as well as CPUs in non-isolated 1-CPU partition. Those CPUs may
-# not be included in the *cpuset.cpus.isolated control file which contains
+# not be included in the cpuset.cpus.isolated control file which contains
 # only CPUs in isolated partitions.
 #
 # $1 - expected isolated cpu list(s) <isolcpus1>{,<isolcpus2>}
 # <isolcpus1> - expected sched/domains value
-# <isolcpus2> - *cpuset.cpus.isolated value = <isolcpus1> if not defined
+# <isolcpus2> - cpuset.cpus.isolated value = <isolcpus1> if not defined
 #
 check_isolcpus()
 {
@@ -611,7 +611,7 @@ check_isolcpus()
 	ISOLCPUS=
 	LASTISOLCPU=
 	SCHED_DOMAINS=/sys/kernel/debug/sched/domains
-	ISCPUS=${CGROUP2}/.__DEBUG__.cpuset.cpus.isolated
+	ISCPUS=${CGROUP2}/cpuset.cpus.isolated
 	if [[ $EXPECT_VAL = . ]]
 	then
 		EXPECT_VAL=
@@ -692,14 +692,18 @@ test_fail()
 null_isolcpus_check()
 {
 	[[ $VERBOSE -gt 0 ]] || return 0
-	pause 0.02
-	check_isolcpus "."
-	if [[ $? -ne 0 ]]
-	then
-		echo "Unexpected isolated CPUs: $ISOLCPUS"
-		dump_states
-		exit 1
-	fi
+	# Retry a few times before printing error
+	RETRY=0
+	while [[ $RETRY -lt 5 ]]
+	do
+		pause 0.01
+		check_isolcpus "."
+		[[ $? -eq 0 ]] && return 0
+		((RETRY++))
+	done
+	echo "Unexpected isolated CPUs: $ISOLCPUS"
+	dump_states
+	exit 1
 }
 
 #
@@ -776,7 +780,7 @@ run_state_test()
 		#
 		NEWLIST=$(cat cpuset.cpus.effective)
 		RETRY=0
-		while [[ $NEWLIST != $CPULIST && $RETRY -lt 5 ]]
+		while [[ $NEWLIST != $CPULIST && $RETRY -lt 8 ]]
 		do
 			# Wait a bit longer & recheck a few times
 			pause 0.01
-- 
cgit v1.2.3


From cff5f49d433fcd0063c8be7dd08fa5bf190c6c37 Mon Sep 17 00:00:00 2001
From: Tim Van Patten <timvp@google.com>
Date: Wed, 15 Nov 2023 09:20:43 -0700
Subject: cgroup_freezer: cgroup_freezing: Check if not frozen

__thaw_task() was recently updated to warn if the task being thawed was
part of a freezer cgroup that is still currently freezing:

	void __thaw_task(struct task_struct *p)
	{
	...
		if (WARN_ON_ONCE(freezing(p)))
			goto unlock;

This has exposed a bug in cgroup1 freezing where when CGROUP_FROZEN is
asserted, the CGROUP_FREEZING bits are not also cleared at the same
time. Meaning, when a cgroup is marked FROZEN it continues to be marked
FREEZING as well. This causes the WARNING to trigger, because
cgroup_freezing() thinks the cgroup is still freezing.

There are two ways to fix this:

1. Whenever FROZEN is set, clear FREEZING for the cgroup and all
children cgroups.
2. Update cgroup_freezing() to also verify that FROZEN is not set.

This patch implements option (2), since it's smaller and more
straightforward.

Signed-off-by: Tim Van Patten <timvp@google.com>
Tested-by: Mark Hasemeyer <markhas@chromium.org>
Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic")
Cc: stable@vger.kernel.org # v6.1+
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/legacy_freezer.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 122dacb3a443..66d1708042a7 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer)
 bool cgroup_freezing(struct task_struct *task)
 {
 	bool ret;
+	unsigned int state;
 
 	rcu_read_lock();
-	ret = task_freezer(task)->state & CGROUP_FREEZING;
+	/* Check if the cgroup is still FREEZING, but not FROZEN. The extra
+	 * !FROZEN check is required, because the FREEZING bit is not cleared
+	 * when the state FROZEN is reached.
+	 */
+	state = task_freezer(task)->state;
+	ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
 	rcu_read_unlock();
 
 	return ret;
-- 
cgit v1.2.3


From 4930b7f53a298533bc31d7540b6ea8b79a000331 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 25 Nov 2023 20:31:26 +0100
Subject: bpf: Store ref_ctr_offsets values in bpf_uprobe array

We will need to return ref_ctr_offsets values through link_info
interface in following change, so we need to keep them around.

Storing ref_ctr_offsets values directly into bpf_uprobe array.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20231125193130.834322-3-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f0b8b7c29126..ad0323f27288 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3033,6 +3033,7 @@ struct bpf_uprobe_multi_link;
 struct bpf_uprobe {
 	struct bpf_uprobe_multi_link *link;
 	loff_t offset;
+	unsigned long ref_ctr_offset;
 	u64 cookie;
 	struct uprobe_consumer consumer;
 };
@@ -3172,7 +3173,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 {
 	struct bpf_uprobe_multi_link *link = NULL;
 	unsigned long __user *uref_ctr_offsets;
-	unsigned long *ref_ctr_offsets = NULL;
 	struct bpf_link_primer link_primer;
 	struct bpf_uprobe *uprobes = NULL;
 	struct task_struct *task = NULL;
@@ -3245,18 +3245,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (!uprobes || !link)
 		goto error_free;
 
-	if (uref_ctr_offsets) {
-		ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
-		if (!ref_ctr_offsets)
-			goto error_free;
-	}
-
 	for (i = 0; i < cnt; i++) {
 		if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
 			err = -EFAULT;
 			goto error_free;
 		}
-		if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+		if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
 			err = -EFAULT;
 			goto error_free;
 		}
@@ -3287,7 +3281,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	for (i = 0; i < cnt; i++) {
 		err = uprobe_register_refctr(d_real_inode(link->path.dentry),
 					     uprobes[i].offset,
-					     ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+					     uprobes[i].ref_ctr_offset,
 					     &uprobes[i].consumer);
 		if (err) {
 			bpf_uprobe_unregister(&path, uprobes, i);
@@ -3299,11 +3293,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (err)
 		goto error_free;
 
-	kvfree(ref_ctr_offsets);
 	return bpf_link_settle(&link_primer);
 
 error_free:
-	kvfree(ref_ctr_offsets);
 	kvfree(uprobes);
 	kfree(link);
 	if (task)
-- 
cgit v1.2.3


From e56fdbfb06e26a7066b070967badef4148528df2 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 25 Nov 2023 20:31:27 +0100
Subject: bpf: Add link_info support for uprobe multi link

Adding support to get uprobe_link details through bpf_link_info
interface.

Adding new struct uprobe_multi to struct bpf_link_info to carry
the uprobe_multi link details.

The uprobe_multi.count is passed from user space to denote size
of array fields (offsets/ref_ctr_offsets/cookies). The actual
array size is stored back to uprobe_multi.count (allowing user
to find out the actual array size) and array fields are populated
up to the user passed size.

All the non-array fields (path/count/flags/pid) are always set.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20231125193130.834322-4-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       | 10 ++++++
 kernel/trace/bpf_trace.c       | 72 ++++++++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 10 ++++++
 3 files changed, 92 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7a5498242eaa..e88746ba7d21 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6562,6 +6562,16 @@ struct bpf_link_info {
 			__u32 flags;
 			__u64 missed;
 		} kprobe_multi;
+		struct {
+			__aligned_u64 path;
+			__aligned_u64 offsets;
+			__aligned_u64 ref_ctr_offsets;
+			__aligned_u64 cookies;
+			__u32 path_size; /* in/out: real path size on success, including zero byte */
+			__u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */
+			__u32 flags;
+			__u32 pid;
+		} uprobe_multi;
 		struct {
 			__u32 type; /* enum bpf_perf_event_type */
 			__u32 :32;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ad0323f27288..c284a4ad0315 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3042,6 +3042,7 @@ struct bpf_uprobe_multi_link {
 	struct path path;
 	struct bpf_link link;
 	u32 cnt;
+	u32 flags;
 	struct bpf_uprobe *uprobes;
 	struct task_struct *task;
 };
@@ -3083,9 +3084,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
 	kfree(umulti_link);
 }
 
+static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
+						struct bpf_link_info *info)
+{
+	u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
+	u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
+	u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
+	u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
+	u32 upath_size = info->uprobe_multi.path_size;
+	struct bpf_uprobe_multi_link *umulti_link;
+	u32 ucount = info->uprobe_multi.count;
+	int err = 0, i;
+	long left;
+
+	if (!upath ^ !upath_size)
+		return -EINVAL;
+
+	if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
+		return -EINVAL;
+
+	umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+	info->uprobe_multi.count = umulti_link->cnt;
+	info->uprobe_multi.flags = umulti_link->flags;
+	info->uprobe_multi.pid = umulti_link->task ?
+				 task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+
+	if (upath) {
+		char *p, *buf;
+
+		upath_size = min_t(u32, upath_size, PATH_MAX);
+
+		buf = kmalloc(upath_size, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		p = d_path(&umulti_link->path, buf, upath_size);
+		if (IS_ERR(p)) {
+			kfree(buf);
+			return PTR_ERR(p);
+		}
+		upath_size = buf + upath_size - p;
+		left = copy_to_user(upath, p, upath_size);
+		kfree(buf);
+		if (left)
+			return -EFAULT;
+		info->uprobe_multi.path_size = upath_size;
+	}
+
+	if (!uoffsets && !ucookies && !uref_ctr_offsets)
+		return 0;
+
+	if (ucount < umulti_link->cnt)
+		err = -ENOSPC;
+	else
+		ucount = umulti_link->cnt;
+
+	for (i = 0; i < ucount; i++) {
+		if (uoffsets &&
+		    put_user(umulti_link->uprobes[i].offset, uoffsets + i))
+			return -EFAULT;
+		if (uref_ctr_offsets &&
+		    put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
+			return -EFAULT;
+		if (ucookies &&
+		    put_user(umulti_link->uprobes[i].cookie, ucookies + i))
+			return -EFAULT;
+	}
+
+	return err;
+}
+
 static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
 	.release = bpf_uprobe_multi_link_release,
 	.dealloc = bpf_uprobe_multi_link_dealloc,
+	.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
 };
 
 static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3274,6 +3345,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	link->uprobes = uprobes;
 	link->path = path;
 	link->task = task;
+	link->flags = flags;
 
 	bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
 		      &bpf_uprobe_multi_link_lops, prog);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7a5498242eaa..e88746ba7d21 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6562,6 +6562,16 @@ struct bpf_link_info {
 			__u32 flags;
 			__u64 missed;
 		} kprobe_multi;
+		struct {
+			__aligned_u64 path;
+			__aligned_u64 offsets;
+			__aligned_u64 ref_ctr_offsets;
+			__aligned_u64 cookies;
+			__u32 path_size; /* in/out: real path size on success, including zero byte */
+			__u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */
+			__u32 flags;
+			__u32 pid;
+		} uprobe_multi;
 		struct {
 			__u32 type; /* enum bpf_perf_event_type */
 			__u32 :32;
-- 
cgit v1.2.3


From 23ab79e8e469e2605beec2e3ccb40d19c68dd2e0 Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Mon, 20 Nov 2023 09:36:31 -0800
Subject: freezer,sched: Do not restore saved_state of a thawed task

It is possible for a task to be thawed multiple times when mixing the
*legacy* cgroup freezer and system-wide freezer. To do this, freeze the
cgroup, do system-wide freeze/thaw, then thaw the cgroup. When this
happens, then a stale saved_state can be written to the task's state
and cause task to hang indefinitely. Fix this by only trying to thaw
tasks that are actually frozen.

This change also has the marginal benefit avoiding unnecessary
wake_up_state(p, TASK_FROZEN) if we know the task is already thawed.
There is not possibility of time-of-compare/time-of-use race when we skip
the wake_up_state because entering/exiting TASK_FROZEN is guarded by
freezer_lock.

Fixes: 8f0eed4a78a8 ("freezer,sched: Use saved_state to reduce some spurious wakeups")
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Abhijeet Dharmapurikar <quic_adharmap@quicinc.com>
Link: https://lore.kernel.org/r/20231120-freezer-state-multiple-thaws-v1-1-f2e1dd7ce5a2@quicinc.com
---
 kernel/freezer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index c450fa8b8b5e..759006a9a910 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -201,7 +201,7 @@ void __thaw_task(struct task_struct *p)
 	if (WARN_ON_ONCE(freezing(p)))
 		goto unlock;
 
-	if (task_call_func(p, __restore_freezer_state, NULL))
+	if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
 		goto unlock;
 
 	wake_up_state(p, TASK_FROZEN);
-- 
cgit v1.2.3


From 382c27f4ed28f803b1f1473ac2d8db0afc795a1b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 29 Nov 2023 15:24:52 +0100
Subject: perf: Fix perf_event_validate_size()

Budimir noted that perf_event_validate_size() only checks the size of
the newly added event, even though the sizes of all existing events
can also change due to not all events having the same read_format.

When we attach the new event, perf_group_attach(), we do re-compute
the size for all events.

Fixes: a723968c0ed3 ("perf: Fix u16 overflows")
Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/events/core.c | 61 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b704d83a28b2..c9d123e13b57 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1814,31 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event)
 					      PERF_EVENT_STATE_INACTIVE;
 }
 
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
 {
 	int entry = sizeof(u64); /* value */
 	int size = 0;
 	int nr = 1;
 
-	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		size += sizeof(u64);
 
-	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		size += sizeof(u64);
 
-	if (event->attr.read_format & PERF_FORMAT_ID)
+	if (read_format & PERF_FORMAT_ID)
 		entry += sizeof(u64);
 
-	if (event->attr.read_format & PERF_FORMAT_LOST)
+	if (read_format & PERF_FORMAT_LOST)
 		entry += sizeof(u64);
 
-	if (event->attr.read_format & PERF_FORMAT_GROUP) {
+	if (read_format & PERF_FORMAT_GROUP) {
 		nr += nr_siblings;
 		size += sizeof(u64);
 	}
 
-	size += entry * nr;
-	event->read_size = size;
+	/*
+	 * Since perf_event_validate_size() limits this to 16k and inhibits
+	 * adding more siblings, this will never overflow.
+	 */
+	return size + nr * entry;
 }
 
 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1888,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
  */
 static void perf_event__header_size(struct perf_event *event)
 {
-	__perf_event_read_size(event,
-			       event->group_leader->nr_siblings);
+	event->read_size =
+		__perf_event_read_size(event->attr.read_format,
+				       event->group_leader->nr_siblings);
 	__perf_event_header_size(event, event->attr.sample_type);
 }
 
@@ -1920,24 +1924,35 @@ static void perf_event__id_header_size(struct perf_event *event)
 	event->id_header_size = size;
 }
 
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with groups size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
 static bool perf_event_validate_size(struct perf_event *event)
 {
-	/*
-	 * The values computed here will be over-written when we actually
-	 * attach the event.
-	 */
-	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
-	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
-	perf_event__id_header_size(event);
+	struct perf_event *sibling, *group_leader = event->group_leader;
 
-	/*
-	 * Sum the lot; should not exceed the 64k limit we have on records.
-	 * Conservative limit to allow for callchains and other variable fields.
-	 */
-	if (event->read_size + event->header_size +
-	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+	if (__perf_event_read_size(event->attr.read_format,
+				   group_leader->nr_siblings + 1) > 16*1024)
 		return false;
 
+	if (__perf_event_read_size(group_leader->attr.read_format,
+				   group_leader->nr_siblings + 1) > 16*1024)
+		return false;
+
+	for_each_sibling_event(sibling, group_leader) {
+		if (__perf_event_read_size(sibling->attr.read_format,
+					   group_leader->nr_siblings + 1) > 16*1024)
+			return false;
+	}
+
 	return true;
 }
 
-- 
cgit v1.2.3


From 5068d84054b766efe7c6202fc71b2350d1c326f1 Mon Sep 17 00:00:00 2001
From: Yiwei Lin <s921975628@gmail.com>
Date: Fri, 17 Nov 2023 16:01:06 +0800
Subject: sched/fair: Update min_vruntime for reweight_entity() correctly

Since reweight_entity() may have chance to change the weight of
cfs_rq->curr entity, we should also update_min_vruntime() if
this is the case

Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight")
Signed-off-by: Yiwei Lin <s921975628@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Abel Wu <wuyun.abel@bytedance.com>
Link: https://lore.kernel.org/r/20231117080106.12890-1-s921975628@gmail.com
---
 kernel/sched/fair.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 34fe6e9490c2..bcea3d55d95d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3815,17 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
 		update_load_add(&cfs_rq->load, se->load.weight);
-		if (!curr) {
-			/*
-			 * The entity's vruntime has been adjusted, so let's check
-			 * whether the rq-wide min_vruntime needs updated too. Since
-			 * the calculations above require stable min_vruntime rather
-			 * than up-to-date one, we do the update at the end of the
-			 * reweight process.
-			 */
+		if (!curr)
 			__enqueue_entity(cfs_rq, se);
-			update_min_vruntime(cfs_rq);
-		}
+
+		/*
+		 * The entity's vruntime has been adjusted, so let's check
+		 * whether the rq-wide min_vruntime needs updated too. Since
+		 * the calculations above require stable min_vruntime rather
+		 * than up-to-date one, we do the update at the end of the
+		 * reweight process.
+		 */
+		update_min_vruntime(cfs_rq);
 	}
 }
 
-- 
cgit v1.2.3


From 418146e39891ef1fb2284dee4cabbfe616cd21cf Mon Sep 17 00:00:00 2001
From: Elliot Berman <quic_eberman@quicinc.com>
Date: Mon, 20 Nov 2023 09:36:32 -0800
Subject: freezer,sched: Clean saved_state when restoring it during thaw

Clean saved_state after using it during thaw. Cleaning the saved_state
allows us to avoid some unnecessary branches in ttwu_state_match.

Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231120-freezer-state-multiple-thaws-v1-2-f2e1dd7ce5a2@quicinc.com
---
 kernel/freezer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index c450fa8b8b5e..43b1d1b94d9e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -187,6 +187,7 @@ static int __restore_freezer_state(struct task_struct *p, void *arg)
 
 	if (state != TASK_RUNNING) {
 		WRITE_ONCE(p->__state, state);
+		p->saved_state = TASK_RUNNING;
 		return 1;
 	}
 
-- 
cgit v1.2.3


From 5431fdd2c181dd2eac218e45b44deb2925fa48f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 17 Sep 2023 13:24:21 +0200
Subject: ptrace: Convert ptrace_attach() to use lock guards

Created as testing for the conditional guard infrastructure.
Specifically this makes use of the following form:

  scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
		     &task->signal->cred_guard_mutex) {
    ...
  }
  ...
  return 0;

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lkml.kernel.org/r/20231102110706.568467727%40infradead.org
---
 include/linux/sched/task.h |   2 +
 include/linux/spinlock.h   |  26 +++++++++
 kernel/ptrace.c            | 128 +++++++++++++++++++++------------------------
 3 files changed, 89 insertions(+), 67 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..4f3dca353556 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -226,4 +226,6 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
+DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+
 #endif /* _LINUX_SCHED_TASK_H */
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index ceb56b39c70f..90bc853cafb6 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -548,5 +548,31 @@ DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
 DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
 			 spin_trylock_irqsave(_T->lock, _T->flags))
 
+DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
+		    read_lock(_T->lock),
+		    read_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
+		    read_lock_irq(_T->lock),
+		    read_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
+		    read_lock_irqsave(_T->lock, _T->flags),
+		    read_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
+DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
+		    write_lock(_T->lock),
+		    write_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
+		    write_lock_irq(_T->lock),
+		    write_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
+		    write_lock_irqsave(_T->lock, _T->flags),
+		    write_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
 #undef __LINUX_INSIDE_SPINLOCK_H
 #endif /* __LINUX_SPINLOCK_H */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..5c579fb9a5e3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -386,6 +386,34 @@ static int check_ptrace_options(unsigned long data)
 	return 0;
 }
 
+static inline void ptrace_set_stopped(struct task_struct *task)
+{
+	guard(spinlock)(&task->sighand->siglock);
+
+	/*
+	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
+	 * will be cleared if the child completes the transition or any
+	 * event which clears the group stop states happens.  We'll wait
+	 * for the transition to complete before returning from this
+	 * function.
+	 *
+	 * This hides STOPPED -> RUNNING -> TRACED transition from the
+	 * attaching thread but a different thread in the same group can
+	 * still observe the transient RUNNING state.  IOW, if another
+	 * thread's WNOHANG wait(2) on the stopped tracee races against
+	 * ATTACH, the wait(2) may fail due to the transient RUNNING.
+	 *
+	 * The following task_is_stopped() test is safe as both transitions
+	 * in and out of STOPPED are protected by siglock.
+	 */
+	if (task_is_stopped(task) &&
+	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+		task->jobctl &= ~JOBCTL_STOPPED;
+		signal_wake_up_state(task, __TASK_STOPPED);
+	}
+}
+
 static int ptrace_attach(struct task_struct *task, long request,
 			 unsigned long addr,
 			 unsigned long flags)
@@ -393,17 +421,17 @@ static int ptrace_attach(struct task_struct *task, long request,
 	bool seize = (request == PTRACE_SEIZE);
 	int retval;
 
-	retval = -EIO;
 	if (seize) {
 		if (addr != 0)
-			goto out;
+			return -EIO;
 		/*
 		 * This duplicates the check in check_ptrace_options() because
 		 * ptrace_attach() and ptrace_setoptions() have historically
 		 * used different error codes for unknown ptrace options.
 		 */
 		if (flags & ~(unsigned long)PTRACE_O_MASK)
-			goto out;
+			return -EIO;
+
 		retval = check_ptrace_options(flags);
 		if (retval)
 			return retval;
@@ -414,88 +442,54 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	audit_ptrace(task);
 
-	retval = -EPERM;
 	if (unlikely(task->flags & PF_KTHREAD))
-		goto out;
+		return -EPERM;
 	if (same_thread_group(task, current))
-		goto out;
+		return -EPERM;
 
 	/*
 	 * Protect exec's credential calculations against our interference;
 	 * SUID, SGID and LSM creds get determined differently
 	 * under ptrace.
 	 */
-	retval = -ERESTARTNOINTR;
-	if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
-		goto out;
+	scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
+			   &task->signal->cred_guard_mutex) {
 
-	task_lock(task);
-	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
-	task_unlock(task);
-	if (retval)
-		goto unlock_creds;
+		scoped_guard (task_lock, task) {
+			retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+			if (retval)
+				return retval;
+		}
 
-	write_lock_irq(&tasklist_lock);
-	retval = -EPERM;
-	if (unlikely(task->exit_state))
-		goto unlock_tasklist;
-	if (task->ptrace)
-		goto unlock_tasklist;
+		scoped_guard (write_lock_irq, &tasklist_lock) {
+			if (unlikely(task->exit_state))
+				return -EPERM;
+			if (task->ptrace)
+				return -EPERM;
 
-	task->ptrace = flags;
+			task->ptrace = flags;
 
-	ptrace_link(task, current);
+			ptrace_link(task, current);
 
-	/* SEIZE doesn't trap tracee on attach */
-	if (!seize)
-		send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
+			/* SEIZE doesn't trap tracee on attach */
+			if (!seize)
+				send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
 
-	spin_lock(&task->sighand->siglock);
+			ptrace_set_stopped(task);
+		}
+	}
 
 	/*
-	 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
-	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
-	 * will be cleared if the child completes the transition or any
-	 * event which clears the group stop states happens.  We'll wait
-	 * for the transition to complete before returning from this
-	 * function.
-	 *
-	 * This hides STOPPED -> RUNNING -> TRACED transition from the
-	 * attaching thread but a different thread in the same group can
-	 * still observe the transient RUNNING state.  IOW, if another
-	 * thread's WNOHANG wait(2) on the stopped tracee races against
-	 * ATTACH, the wait(2) may fail due to the transient RUNNING.
-	 *
-	 * The following task_is_stopped() test is safe as both transitions
-	 * in and out of STOPPED are protected by siglock.
+	 * We do not bother to change retval or clear JOBCTL_TRAPPING
+	 * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+	 * not return to user-mode, it will exit and clear this bit in
+	 * __ptrace_unlink() if it wasn't already cleared by the tracee;
+	 * and until then nobody can ptrace this task.
 	 */
-	if (task_is_stopped(task) &&
-	    task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
-		task->jobctl &= ~JOBCTL_STOPPED;
-		signal_wake_up_state(task, __TASK_STOPPED);
-	}
-
-	spin_unlock(&task->sighand->siglock);
-
-	retval = 0;
-unlock_tasklist:
-	write_unlock_irq(&tasklist_lock);
-unlock_creds:
-	mutex_unlock(&task->signal->cred_guard_mutex);
-out:
-	if (!retval) {
-		/*
-		 * We do not bother to change retval or clear JOBCTL_TRAPPING
-		 * if wait_on_bit() was interrupted by SIGKILL. The tracer will
-		 * not return to user-mode, it will exit and clear this bit in
-		 * __ptrace_unlink() if it wasn't already cleared by the tracee;
-		 * and until then nobody can ptrace this task.
-		 */
-		wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
-		proc_ptrace_connector(task, PTRACE_ATTACH);
-	}
+	wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
+	proc_ptrace_connector(task, PTRACE_ATTACH);
 
-	return retval;
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From d839a656d0f3caca9f96e9bf912fd394ac6a11bc Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Fri, 1 Dec 2023 14:53:55 +0900
Subject: kprobes: consistent rcu api usage for kretprobe holder

It seems that the pointer-to-kretprobe "rp" within the kretprobe_holder is
RCU-managed, based on the (non-rethook) implementation of get_kretprobe().
The thought behind this patch is to make use of the RCU API where possible
when accessing this pointer so that the needed barriers are always in place
and to self-document the code.

The __rcu annotation to "rp" allows for sparse RCU checking. Plain writes
done to the "rp" pointer are changed to make use of the RCU macro for
assignment. For the single read, the implementation of get_kretprobe()
is simplified by making use of an RCU macro which accomplishes the same,
but note that the log warning text will be more generic.

I did find that there is a difference in assembly generated between the
usage of the RCU macros vs without. For example, on arm64, when using
rcu_assign_pointer(), the corresponding store instruction is a
store-release (STLR) which has an implicit barrier. When normal assignment
is done, a regular store (STR) is found. In the macro case, this seems to
be a result of rcu_assign_pointer() using smp_store_release() when the
value to write is not NULL.

Link: https://lore.kernel.org/all/20231122132058.3359-1-inwardvessel@gmail.com/

Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash")
Cc: stable@vger.kernel.org
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h | 7 ++-----
 kernel/kprobes.c        | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index ab1da3142b06..64672bace560 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -139,7 +139,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
  *
  */
 struct kretprobe_holder {
-	struct kretprobe	*rp;
+	struct kretprobe __rcu *rp;
 	struct objpool_head	pool;
 };
 
@@ -245,10 +245,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
 
 static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
-		"Kretprobe is accessed from instance under preemptive context");
-
-	return READ_ONCE(ri->rph->rp);
+	return rcu_dereference_check(ri->rph->rp, rcu_read_lock_any_held());
 }
 
 static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 075a632e6c7c..d5a0ee40bf66 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2252,7 +2252,7 @@ int register_kretprobe(struct kretprobe *rp)
 		rp->rph = NULL;
 		return -ENOMEM;
 	}
-	rp->rph->rp = rp;
+	rcu_assign_pointer(rp->rph->rp, rp);
 	rp->nmissed = 0;
 	/* Establish function entry probe point */
 	ret = register_kprobe(&rp->kp);
@@ -2300,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 		rethook_free(rps[i]->rh);
 #else
-		rps[i]->rph->rp = NULL;
+		rcu_assign_pointer(rps[i]->rph->rp, NULL);
 #endif
 	}
 	mutex_unlock(&kprobe_mutex);
-- 
cgit v1.2.3


From a1461f1fd6cfdc4b8917c9d4a91e92605d1f28dc Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 1 Dec 2023 14:53:56 +0900
Subject: rethook: Use __rcu pointer for rethook::handler

Since the rethook::handler is an RCU-maganged pointer so that it will
notice readers the rethook is stopped (unregistered) or not, it should
be an __rcu pointer and use appropriate functions to be accessed. This
will use appropriate memory barrier when accessing it. OTOH,
rethook::data is never changed, so we don't need to check it in
get_kretprobe().

NOTE: To avoid sparse warning, rethook::handler is defined by a raw
function pointer type with __rcu instead of rethook_handler_t.

Link: https://lore.kernel.org/all/170126066201.398836.837498688669005979.stgit@devnote2/

Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook")
Cc: stable@vger.kernel.org
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202311241808.rv9ceuAh-lkp@intel.com/
Tested-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/kprobes.h |  6 ++----
 include/linux/rethook.h |  7 ++++++-
 kernel/trace/rethook.c  | 23 ++++++++++++++---------
 3 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 64672bace560..0ff44d6633e3 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -197,10 +197,8 @@ extern int arch_trampoline_kprobe(struct kprobe *p);
 #ifdef CONFIG_KRETPROBE_ON_RETHOOK
 static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri)
 {
-	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
-		"Kretprobe is accessed from instance under preemptive context");
-
-	return (struct kretprobe *)READ_ONCE(ri->node.rethook->data);
+	/* rethook::data is non-changed field, so that you can access it freely. */
+	return (struct kretprobe *)ri->node.rethook->data;
 }
 static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri)
 {
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index ce69b2b7bc35..ba60962805f6 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -28,7 +28,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,
  */
 struct rethook {
 	void			*data;
-	rethook_handler_t	handler;
+	/*
+	 * To avoid sparse warnings, this uses a raw function pointer with
+	 * __rcu, instead of rethook_handler_t. But this must be same as
+	 * rethook_handler_t.
+	 */
+	void (__rcu *handler) (struct rethook_node *, void *, unsigned long, struct pt_regs *);
 	struct objpool_head	pool;
 	struct rcu_head		rcu;
 };
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 6fd7d4ecbbc6..fa03094e9e69 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -48,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head)
  */
 void rethook_stop(struct rethook *rh)
 {
-	WRITE_ONCE(rh->handler, NULL);
+	rcu_assign_pointer(rh->handler, NULL);
 }
 
 /**
@@ -63,7 +63,7 @@ void rethook_stop(struct rethook *rh)
  */
 void rethook_free(struct rethook *rh)
 {
-	WRITE_ONCE(rh->handler, NULL);
+	rethook_stop(rh);
 
 	call_rcu(&rh->rcu, rethook_free_rcu);
 }
@@ -82,6 +82,12 @@ static int rethook_fini_pool(struct objpool_head *head, void *context)
 	return 0;
 }
 
+static inline rethook_handler_t rethook_get_handler(struct rethook *rh)
+{
+	return (rethook_handler_t)rcu_dereference_check(rh->handler,
+							rcu_read_lock_any_held());
+}
+
 /**
  * rethook_alloc() - Allocate struct rethook.
  * @data: a data to pass the @handler when hooking the return.
@@ -107,7 +113,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
 		return ERR_PTR(-ENOMEM);
 
 	rh->data = data;
-	rh->handler = handler;
+	rcu_assign_pointer(rh->handler, handler);
 
 	/* initialize the objpool for rethook nodes */
 	if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
@@ -135,9 +141,10 @@ static void free_rethook_node_rcu(struct rcu_head *head)
  */
 void rethook_recycle(struct rethook_node *node)
 {
-	lockdep_assert_preemption_disabled();
+	rethook_handler_t handler;
 
-	if (likely(READ_ONCE(node->rethook->handler)))
+	handler = rethook_get_handler(node->rethook);
+	if (likely(handler))
 		objpool_push(node, &node->rethook->pool);
 	else
 		call_rcu(&node->rcu, free_rethook_node_rcu);
@@ -153,9 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle);
  */
 struct rethook_node *rethook_try_get(struct rethook *rh)
 {
-	rethook_handler_t handler = READ_ONCE(rh->handler);
-
-	lockdep_assert_preemption_disabled();
+	rethook_handler_t handler = rethook_get_handler(rh);
 
 	/* Check whether @rh is going to be freed. */
 	if (unlikely(!handler))
@@ -300,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
 		rhn = container_of(first, struct rethook_node, llist);
 		if (WARN_ON_ONCE(rhn->frame != frame))
 			break;
-		handler = READ_ONCE(rhn->rethook->handler);
+		handler = rethook_get_handler(rhn->rethook);
 		if (handler)
 			handler(rhn, rhn->rethook->data,
 				correct_ret_addr, regs);
-- 
cgit v1.2.3


From a51749ab34d9e5dec548fe38ede7e01e8bb26454 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Thu, 30 Nov 2023 21:48:17 +0100
Subject: locking/mutex: Document that mutex_unlock() is non-atomic

I have seen several cases of attempts to use mutex_unlock() to release an
object such that the object can then be freed by another task.

This is not safe because mutex_unlock(), in the
MUTEX_FLAG_WAITERS && !MUTEX_FLAG_HANDOFF case, accesses the mutex
structure after having marked it as unlocked; so mutex_unlock() requires
its caller to ensure that the mutex stays alive until mutex_unlock()
returns.

If MUTEX_FLAG_WAITERS is set and there are real waiters, those waiters
have to keep the mutex alive, but we could have a spurious
MUTEX_FLAG_WAITERS left if an interruptible/killable waiter bailed
between the points where __mutex_unlock_slowpath() did the cmpxchg
reading the flags and where it acquired the wait_lock.

( With spinlocks, that kind of code pattern is allowed and, from what I
  remember, used in several places in the kernel. )

Document this, such a semantic difference between mutexes and spinlocks
is fairly unintuitive.

[ mingo: Made the changelog a bit more assertive, refined the comments. ]

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20231130204817.2031407-1-jannh@google.com
---
 Documentation/locking/mutex-design.rst | 6 ++++++
 kernel/locking/mutex.c                 | 5 +++++
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst
index 78540cd7f54b..7572339b2f12 100644
--- a/Documentation/locking/mutex-design.rst
+++ b/Documentation/locking/mutex-design.rst
@@ -101,6 +101,12 @@ features that make lock debugging easier and faster:
     - Detects multi-task circular deadlocks and prints out all affected
       locks and tasks (and only those tasks).
 
+Releasing a mutex is not an atomic operation: Once a mutex release operation
+has begun, another context may be able to acquire the mutex before the release
+operation has fully completed. The mutex user must ensure that the mutex is not
+destroyed while a release operation is still in progress - in other words,
+callers of mutex_unlock() must ensure that the mutex stays alive until
+mutex_unlock() has returned.
 
 Interfaces
 ----------
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2deeeca3e71b..cbae8c0b89ab 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -532,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
  * This function must not be used in interrupt context. Unlocking
  * of a not locked mutex is not allowed.
  *
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
  * This function is similar to (but not equivalent to) up().
  */
 void __sched mutex_unlock(struct mutex *lock)
-- 
cgit v1.2.3


From d499fd418fa15949d86d28bb5442ab88203fc513 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 30 Nov 2023 15:43:26 -0500
Subject: cgroup/rstat: Optimize cgroup_rstat_updated_list()

The current design of cgroup_rstat_cpu_pop_updated() is to traverse
the updated tree in a way to pop out the leaf nodes first before
their parents. This can cause traversal of multiple nodes before a
leaf node can be found and popped out. IOW, a given node in the tree
can be visited multiple times before the whole operation is done. So
it is not very efficient and the code can be hard to read.

With the introduction of cgroup_rstat_updated_list() to build a list
of cgroups to be flushed first before any flushing operation is being
done, we can optimize the way the updated tree nodes are being popped
by pushing the parents first to the tail end of the list before their
children. In this way, most updated tree nodes will be visited only
once with the exception of the subtree root as we still need to go
back to its parent and popped it out of its updated_children list.
This also makes the code easier to read.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/rstat.c | 153 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 91 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 1f300bf4dc40..4ec29e6b1d8d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 }
 
 /**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * cgroup_rstat_push_children - push children cgroups into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
  * @cpu: target cpu
+ * Return: A new singly linked list of cgroups to be flush
  *
- * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
- * the traversal and %NULL return indicates the end.  During traversal,
- * each returned cgroup is unlinked from the tree.  Must be called with the
- * matching cgroup_rstat_cpu_lock held.
+ * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * level and push all the parents first before their next level children
+ * into a singly linked list built from the tail backward like "pushing"
+ * cgroups into a stack. The root is pushed by the caller.
+ */
+static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
+						 struct cgroup *child, int cpu)
+{
+	struct cgroup *chead = child;	/* Head of child cgroup level */
+	struct cgroup *ghead = NULL;	/* Head of grandchild cgroup level */
+	struct cgroup *parent, *grandchild;
+	struct cgroup_rstat_cpu *crstatc;
+
+	child->rstat_flush_next = NULL;
+
+next_level:
+	while (chead) {
+		child = chead;
+		chead = child->rstat_flush_next;
+		parent = cgroup_parent(child);
+
+		/* updated_next is parent cgroup terminated */
+		while (child != parent) {
+			child->rstat_flush_next = head;
+			head = child;
+			crstatc = cgroup_rstat_cpu(child, cpu);
+			grandchild = crstatc->updated_children;
+			if (grandchild != child) {
+				/* Push the grand child to the next level */
+				crstatc->updated_children = child;
+				grandchild->rstat_flush_next = ghead;
+				ghead = grandchild;
+			}
+			child = crstatc->updated_next;
+			crstatc->updated_next = NULL;
+		}
+	}
+
+	if (ghead) {
+		chead = ghead;
+		ghead = NULL;
+		goto next_level;
+	}
+	return head;
+}
+
+/**
+ * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
+ * @root: root of the cgroup subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of cgroups to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root.  During traversal,
+ * each returned cgroup is unlinked from the updated tree.
  *
  * The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self terminated and points to a list of
+ * child cgroups if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent cgroup. An exception
+ * here is the cgroup root whose updated_next can be self terminated.
  */
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
-						   struct cgroup *root, int cpu)
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 {
-	struct cgroup_rstat_cpu *rstatc;
-	struct cgroup *parent;
-
-	if (pos == root)
-		return NULL;
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
+	struct cgroup *head = NULL, *parent, *child;
+	unsigned long flags;
 
 	/*
-	 * We're gonna walk down to the first leaf and visit/remove it.  We
-	 * can pick whatever unvisited node as the starting point.
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
 	 */
-	if (!pos) {
-		pos = root;
-		/* return NULL if this subtree is not on-list */
-		if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
-			return NULL;
-	} else {
-		pos = cgroup_parent(pos);
-	}
+	raw_spin_lock_irqsave(cpu_lock, flags);
 
-	/* walk down to the first leaf */
-	while (true) {
-		rstatc = cgroup_rstat_cpu(pos, cpu);
-		if (rstatc->updated_children == pos)
-			break;
-		pos = rstatc->updated_children;
-	}
+	/* Return NULL if this subtree is not on-list */
+	if (!rstatc->updated_next)
+		goto unlock_ret;
 
 	/*
-	 * Unlink @pos from the tree.  As the updated_children list is
+	 * Unlink @root from its parent. As the updated_children list is
 	 * singly linked, we have to walk it to find the removal point.
-	 * However, due to the way we traverse, @pos will be the first
-	 * child in most cases. The only exception is @root.
 	 */
-	parent = cgroup_parent(pos);
+	parent = cgroup_parent(root);
 	if (parent) {
 		struct cgroup_rstat_cpu *prstatc;
 		struct cgroup **nextp;
 
 		prstatc = cgroup_rstat_cpu(parent, cpu);
 		nextp = &prstatc->updated_children;
-		while (*nextp != pos) {
+		while (*nextp != root) {
 			struct cgroup_rstat_cpu *nrstatc;
 
 			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
@@ -142,31 +187,15 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
 	}
 
 	rstatc->updated_next = NULL;
-	return pos;
-}
 
-/* Return a list of updated cgroups to be flushed */
-static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
-{
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
-	struct cgroup *head, *tail, *next;
-	unsigned long flags;
-
-	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
-	 */
-	raw_spin_lock_irqsave(cpu_lock, flags);
-	head = tail = cgroup_rstat_cpu_pop_updated(NULL, root, cpu);
-	while (tail) {
-		next = cgroup_rstat_cpu_pop_updated(tail, root, cpu);
-		tail->rstat_flush_next = next;
-		tail = next;
-	}
+	/* Push @root to the list first before pushing the children */
+	head = root;
+	root->rstat_flush_next = NULL;
+	child = rstatc->updated_children;
+	rstatc->updated_children = root;
+	if (child != root)
+		head = cgroup_rstat_push_children(head, child, cpu);
+unlock_ret:
 	raw_spin_unlock_irqrestore(cpu_lock, flags);
 	return head;
 }
-- 
cgit v1.2.3


From 12cd3cd8c797e07afcc47bc4afa760e4ec75e9d7 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:42 +0200
Subject: params: Introduce the param_unknown_fn type

Introduce a new type for the callback to parse an unknown argument.
This unifies function prototypes which takes that as a parameter.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/moduleparam.h | 6 +++---
 kernel/params.c             | 8 ++------
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 4fa9726bc328..bfb85fd13e1f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -385,6 +385,8 @@ extern bool parameq(const char *name1, const char *name2);
  */
 extern bool parameqn(const char *name1, const char *name2, size_t n);
 
+typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg);
+
 /* Called on module insert or kernel boot */
 extern char *parse_args(const char *name,
 		      char *args,
@@ -392,9 +394,7 @@ extern char *parse_args(const char *name,
 		      unsigned num,
 		      s16 level_min,
 		      s16 level_max,
-		      void *arg,
-		      int (*unknown)(char *param, char *val,
-				     const char *doing, void *arg));
+		      void *arg, parse_unknown_fn unknown);
 
 /* Called by module remove. */
 #ifdef CONFIG_SYSFS
diff --git a/kernel/params.c b/kernel/params.c
index 2d4a0564697e..626fa8265932 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -120,9 +120,7 @@ static int parse_one(char *param,
 		     unsigned num_params,
 		     s16 min_level,
 		     s16 max_level,
-		     void *arg,
-		     int (*handle_unknown)(char *param, char *val,
-				     const char *doing, void *arg))
+		     void *arg, parse_unknown_fn handle_unknown)
 {
 	unsigned int i;
 	int err;
@@ -165,9 +163,7 @@ char *parse_args(const char *doing,
 		 unsigned num,
 		 s16 min_level,
 		 s16 max_level,
-		 void *arg,
-		 int (*unknown)(char *param, char *val,
-				const char *doing, void *arg))
+		 void *arg, parse_unknown_fn unknown)
 {
 	char *param, *val, *err = NULL;
 
-- 
cgit v1.2.3


From fd0cd057a1b7351604daa6ffc91dfe28adf7225d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:43 +0200
Subject: params: Do not go over the limit when getting the string length

We can use strnlen() even on early stages and it prevents from
going over the string boundaries in case it's already too long.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/params.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 626fa8265932..f8e3c4139854 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -260,7 +260,10 @@ EXPORT_SYMBOL_GPL(param_set_uint_minmax);
 
 int param_set_charp(const char *val, const struct kernel_param *kp)
 {
-	if (strlen(val) > 1024) {
+	size_t len, maxlen = 1024;
+
+	len = strnlen(val, maxlen + 1);
+	if (len == maxlen + 1) {
 		pr_err("%s: string parameter too long\n", kp->name);
 		return -ENOSPC;
 	}
@@ -270,7 +273,7 @@ int param_set_charp(const char *val, const struct kernel_param *kp)
 	/* This is a hack.  We can't kmalloc in early boot, and we
 	 * don't need to; this mangled commandline is preserved. */
 	if (slab_is_available()) {
-		*(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
+		*(char **)kp->arg = kmalloc_parameter(len + 1);
 		if (!*(char **)kp->arg)
 			return -ENOMEM;
 		strcpy(*(char **)kp->arg, val);
@@ -508,7 +511,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp)
 {
 	const struct kparam_string *kps = kp->str;
 
-	if (strlen(val)+1 > kps->maxlen) {
+	if (strnlen(val, kps->maxlen) == kps->maxlen) {
 		pr_err("%s: string doesn't fit in %u chars.\n",
 		       kp->name, kps->maxlen-1);
 		return -ENOSPC;
-- 
cgit v1.2.3


From 0fc79cbc937f2a754a302a710a94b68c61d0a89a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:44 +0200
Subject: params: Use size_add() for kmalloc()

Prevent allocations from integer overflow by using size_add().

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/params.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index f8e3c4139854..c3a029fe183d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -11,6 +11,7 @@
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/overflow.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/security.h>
@@ -48,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
 {
 	struct kmalloced_param *p;
 
-	p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+	p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
 	if (!p)
 		return NULL;
 
-- 
cgit v1.2.3


From a05f096c2c0ca52e8fd34740c7d4b53ab3e7123e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:45 +0200
Subject: params: Sort headers

Sort the headers in alphabetic order in order to ease
the maintenance for this part.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/params.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index c3a029fe183d..eb55b32399b4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -3,18 +3,18 @@
    Copyright (C) 2001 Rusty Russell.
 
 */
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/kstrtox.h>
-#include <linux/string.h>
-#include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/err.h>
 #include <linux/overflow.h>
-#include <linux/slab.h>
-#include <linux/ctype.h>
 #include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 
 #ifdef CONFIG_SYSFS
 /* Protects all built-in parameters, modules use their own param_lock */
-- 
cgit v1.2.3


From b5e3f86a47d34f7b8af899f8cc70520f6daf8b53 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 20 Nov 2023 17:11:46 +0200
Subject: params: Fix multi-line comment style

The multi-line comment style in the file is rather arbitrary.
Make it follow the standard one.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20231120151419.1661807-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/params.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index eb55b32399b4..2e447f8ae183 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-/* Helpers for initial module or kernel cmdline parsing
-   Copyright (C) 2001 Rusty Russell.
-
-*/
+/*
+ * Helpers for initial module or kernel cmdline parsing
+ * Copyright (C) 2001 Rusty Russell.
+ */
 #include <linux/ctype.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -271,8 +271,10 @@ int param_set_charp(const char *val, const struct kernel_param *kp)
 
 	maybe_kfree_parameter(*(char **)kp->arg);
 
-	/* This is a hack.  We can't kmalloc in early boot, and we
-	 * don't need to; this mangled commandline is preserved. */
+	/*
+	 * This is a hack. We can't kmalloc() in early boot, and we
+	 * don't need to; this mangled commandline is preserved.
+	 */
 	if (slab_is_available()) {
 		*(char **)kp->arg = kmalloc_parameter(len + 1);
 		if (!*(char **)kp->arg)
@@ -743,8 +745,10 @@ void module_param_sysfs_remove(struct module *mod)
 {
 	if (mod->mkobj.mp) {
 		sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
-		/* We are positive that no one is using any param
-		 * attrs at this point.  Deallocate immediately. */
+		/*
+		 * We are positive that no one is using any param
+		 * attrs at this point. Deallocate immediately.
+		 */
 		free_module_param_attrs(&mod->mkobj);
 	}
 }
-- 
cgit v1.2.3


From 8a3750ecf8104de55c569ffbe844a85aa9d5deaa Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 30 Nov 2023 12:56:08 -0800
Subject: tracing/uprobe: Replace strlcpy() with strscpy()

strlcpy() reads the entire source buffer first. This read may exceed
the destination size limit. This is both inefficient and can lead
to linear read overflows if a source string is not NUL-terminated[1].
Additionally, it returns the size of the source string, not the
resulting size of the destination string. In an effort to remove strlcpy()
completely[2], replace strlcpy() here with strscpy().

The negative return value is already handled by this code so no new
handling is needed here.

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [1]
Link: https://github.com/KSPP/linux/issues/89 [2]
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: linux-trace-kernel@vger.kernel.org
Acked-by: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20231130205607.work.463-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/trace/trace_uprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 99c051de412a..a84b85d8aac1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
 		return -ENOMEM;
 
 	if (addr == FETCH_TOKEN_COMM)
-		ret = strlcpy(dst, current->comm, maxlen);
+		ret = strscpy(dst, current->comm, maxlen);
 	else
 		ret = strncpy_from_user(dst, src, maxlen);
 	if (ret >= 0) {
-- 
cgit v1.2.3


From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 30 Nov 2023 18:46:40 -0800
Subject: bpf: Fix a verifier bug due to incorrect branch offset comparison
 with cpu=v4

Bpf cpu=v4 support is introduced in [1] and Commit 4cd58e9af8b9
("bpf: Support new 32bit offset jmp instruction") added support for new
32bit offset jmp instruction. Unfortunately, in function
bpf_adj_delta_to_off(), for new branch insn with 32bit offset, the offset
(plus/minor a small delta) compares to 16-bit offset bound
[S16_MIN, S16_MAX], which caused the following verification failure:
  $ ./test_progs-cpuv4 -t verif_scale_pyperf180
  ...
  insn 10 cannot be patched due to 16-bit range
  ...
  libbpf: failed to load object 'pyperf180.bpf.o'
  scale_test:FAIL:expect_success unexpected error: -12 (errno 12)
  #405     verif_scale_pyperf180:FAIL

Note that due to recent llvm18 development, the patch [2] (already applied
in bpf-next) needs to be applied to bpf tree for testing purpose.

The fix is rather simple. For 32bit offset branch insn, the adjusted
offset compares to [S32_MIN, S32_MAX] and then verification succeeded.

  [1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev
  [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev

Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction")
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev
---
 kernel/bpf/core.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cd3afe57ece3..fe254ae035fe 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
 static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
 				s32 end_new, s32 curr, const bool probe_pass)
 {
-	const s32 off_min = S16_MIN, off_max = S16_MAX;
+	s64 off_min, off_max, off;
 	s32 delta = end_new - end_old;
-	s32 off;
 
-	if (insn->code == (BPF_JMP32 | BPF_JA))
+	if (insn->code == (BPF_JMP32 | BPF_JA)) {
 		off = insn->imm;
-	else
+		off_min = S32_MIN;
+		off_max = S32_MAX;
+	} else {
 		off = insn->off;
+		off_min = S16_MIN;
+		off_max = S16_MAX;
+	}
 
 	if (curr < pos && curr + off + 1 >= end_old)
 		off += delta;
-- 
cgit v1.2.3


From ac9c05e0e453cfcab2866f6d28f257590e4f66e5 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 29 Nov 2023 15:44:12 -0800
Subject: bpf: Add kfunc bpf_get_file_xattr

It is common practice for security solutions to store tags/labels in
xattrs. To implement similar functionalities in BPF LSM, add new kfunc
bpf_get_file_xattr().

The first use case of bpf_get_file_xattr() is to implement file
verifications with asymmetric keys. Specificially, security applications
could use fsverity for file hashes and use xattr to store file signatures.
(kfunc for fsverity hash will be added in a separate commit.)

Currently, only xattrs with "user." prefix can be read with kfunc
bpf_get_file_xattr(). As use cases evolve, we may add a dedicated prefix
for bpf_get_file_xattr().

To avoid recursion, bpf_get_file_xattr can be only called from LSM hooks.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20231129234417.856536-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c284a4ad0315..1648bde28f01 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,6 +24,7 @@
 #include <linux/key.h>
 #include <linux/verification.h>
 #include <linux/namei.h>
+#include <linux/fileattr.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -1431,6 +1432,72 @@ static int __init bpf_key_sig_kfuncs_init(void)
 late_initcall(bpf_key_sig_kfuncs_init);
 #endif /* CONFIG_KEYS */
 
+/* filesystem kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_file_xattr - get xattr of a file
+ * @file: file to get xattr from
+ * @name__str: name of the xattr
+ * @value_ptr: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *file* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
+				   struct bpf_dynptr_kern *value_ptr)
+{
+	struct dentry *dentry;
+	u32 value_len;
+	void *value;
+	int ret;
+
+	if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		return -EPERM;
+
+	value_len = __bpf_dynptr_size(value_ptr);
+	value = __bpf_dynptr_data_rw(value_ptr, value_len);
+	if (!value)
+		return -EINVAL;
+
+	dentry = file_dentry(file);
+	ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
+	if (ret)
+		return ret;
+	return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fs_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_SET8_END(fs_kfunc_set_ids)
+
+static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+	if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
+		return 0;
+
+	/* Only allow to attach from LSM hooks, to avoid recursion */
+	return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &fs_kfunc_set_ids,
+	.filter = bpf_get_file_xattr_filter,
+};
+
+static int __init bpf_fs_kfuncs_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
+}
+
+late_initcall(bpf_fs_kfuncs_init);
+
 static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-- 
cgit v1.2.3


From 5fad52bee30414270104525e3a0266327a6e9d11 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:56 -0800
Subject: bpf: provide correct register name for exception callback retval
 check

bpf_throw() is checking R1, so let's report R1 in the log.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                                 | 12 ++++++------
 tools/testing/selftests/bpf/progs/exceptions_assert.c |  2 +-
 tools/testing/selftests/bpf/progs/exceptions_fail.c   |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8e7b6072e3f4..25b9d470957e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11805,7 +11805,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int check_return_code(struct bpf_verifier_env *env, int regno);
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
 
 static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    int *insn_idx_p)
@@ -11942,7 +11942,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		 * to bpf_throw becomes the return value of the program.
 		 */
 		if (!env->exception_callback_subprog) {
-			err = check_return_code(env, BPF_REG_1);
+			err = check_return_code(env, BPF_REG_1, "R1");
 			if (err < 0)
 				return err;
 		}
@@ -14972,7 +14972,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
-static int check_return_code(struct bpf_verifier_env *env, int regno)
+static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
 {
 	struct tnum enforce_attach_type_range = tnum_unknown;
 	const struct bpf_prog *prog = env->prog;
@@ -15026,7 +15026,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
 		}
 
 		if (!tnum_in(const_0, reg->var_off)) {
-			verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0");
+			verbose_invalid_scalar(env, reg, &const_0, "async callback", reg_name);
 			return -EINVAL;
 		}
 		return 0;
@@ -15126,7 +15126,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno)
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
-		verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
+		verbose_invalid_scalar(env, reg, &range, "program exit", reg_name);
 		if (prog->expected_attach_type == BPF_LSM_CGROUP &&
 		    prog_type == BPF_PROG_TYPE_LSM &&
 		    !prog->aux->attach_func_proto->type)
@@ -17410,7 +17410,7 @@ process_bpf_exit_full:
 					continue;
 				}
 
-				err = check_return_code(env, BPF_REG_0);
+				err = check_return_code(env, BPF_REG_0, "R0");
 				if (err)
 					return err;
 process_bpf_exit:
diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c
index 49efaed143fc..575e7dd719c4 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_assert.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c
@@ -125,7 +125,7 @@ int check_assert_generic(struct __sk_buff *ctx)
 }
 
 SEC("?fentry/bpf_check")
-__failure __msg("At program exit the register R0 has value (0x40; 0x0)")
+__failure __msg("At program exit the register R1 has value (0x40; 0x0)")
 int check_assert_with_return(void *ctx)
 {
 	bpf_assert_with(!ctx, 64);
diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c
index 8c0ef2742208..81ead7512ba2 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_fail.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c
@@ -308,7 +308,7 @@ int reject_set_exception_cb_bad_ret1(void *ctx)
 }
 
 SEC("?fentry/bpf_check")
-__failure __msg("At program exit the register R0 has value (0x40; 0x0) should")
+__failure __msg("At program exit the register R1 has value (0x40; 0x0) should")
 int reject_set_exception_cb_bad_ret2(void *ctx)
 {
 	bpf_throw(64);
-- 
cgit v1.2.3


From 0acd03a5bd188b0c501d285d938439618bd855c4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:57 -0800
Subject: bpf: enforce precision of R0 on callback return

Given verifier checks actual value, r0 has to be precise, so we need to
propagate precision properly. r0 also has to be marked as read,
otherwise subsequent state comparisons will ignore such register as
unimportant and precision won't really help here.

Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper")
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 25b9d470957e..849fbf47b5f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9590,6 +9590,13 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 			verbose(env, "R0 not a scalar value\n");
 			return -EACCES;
 		}
+
+		/* we are going to rely on register's precise value */
+		err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64);
+		err = err ?: mark_chain_precision(env, BPF_REG_0);
+		if (err)
+			return err;
+
 		if (!tnum_in(range, r0->var_off)) {
 			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 			return -EINVAL;
-- 
cgit v1.2.3


From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:56:58 -0800
Subject: bpf: enforce exact retval range on subprog/callback exit

Instead of relying on potentially imprecise tnum representation of
expected return value range for callbacks and subprogs, validate that
smin/smax range satisfy exact expected range of return values.

E.g., if callback would need to return [0, 2] range, tnum can't
represent this precisely and instead will allow [0, 3] range. By
checking smin/smax range, we can make sure that subprog/callback indeed
returns only valid [0, 2] range.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  7 ++++++-
 kernel/bpf/verifier.c        | 33 ++++++++++++++++++++++-----------
 2 files changed, 28 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c0e1bccad45..3378cc753061 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -275,6 +275,11 @@ struct bpf_reference_state {
 	int callback_ref;
 };
 
+struct bpf_retval_range {
+	s32 minval;
+	s32 maxval;
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -297,7 +302,7 @@ struct bpf_func_state {
 	 * void foo(void) { bpf_timer_set_callback(,foo); }
 	 */
 	u32 async_entry_cnt;
-	struct tnum callback_ret_range;
+	struct bpf_retval_range callback_ret_range;
 	bool in_callback_fn;
 	bool in_async_callback_fn;
 	bool in_exception_callback_fn;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 849fbf47b5f3..f3d9d7de68da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2305,6 +2305,11 @@ static void init_reg_state(struct bpf_verifier_env *env,
 	regs[BPF_REG_FP].frameno = state->frameno;
 }
 
+static struct bpf_retval_range retval_range(s32 minval, s32 maxval)
+{
+	return (struct bpf_retval_range){ minval, maxval };
+}
+
 #define BPF_MAIN_FUNC (-1)
 static void init_func_state(struct bpf_verifier_env *env,
 			    struct bpf_func_state *state,
@@ -2313,7 +2318,7 @@ static void init_func_state(struct bpf_verifier_env *env,
 	state->callsite = callsite;
 	state->frameno = frameno;
 	state->subprogno = subprogno;
-	state->callback_ret_range = tnum_range(0, 0);
+	state->callback_ret_range = retval_range(0, 0);
 	init_reg_state(env, state);
 	mark_verifier_state_scratched(env);
 }
@@ -9396,7 +9401,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 		return err;
 
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9418,7 +9423,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9448,7 +9453,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_async_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9476,7 +9481,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9499,7 +9504,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9531,7 +9536,7 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_callback_fn = true;
-	callee->callback_ret_range = tnum_range(0, 1);
+	callee->callback_ret_range = retval_range(0, 1);
 	return 0;
 }
 
@@ -9560,6 +9565,11 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
 	return is_rbtree_lock_required_kfunc(kfunc_btf_id);
 }
 
+static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg)
+{
+	return range.minval <= reg->smin_value && reg->smax_value <= range.maxval;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state, *prev_st;
@@ -9583,9 +9593,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 
 	caller = state->frame[state->curframe - 1];
 	if (callee->in_callback_fn) {
-		/* enforce R0 return value range [0, 1]. */
-		struct tnum range = callee->callback_ret_range;
-
 		if (r0->type != SCALAR_VALUE) {
 			verbose(env, "R0 not a scalar value\n");
 			return -EACCES;
@@ -9597,7 +9604,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		if (err)
 			return err;
 
-		if (!tnum_in(range, r0->var_off)) {
+		/* enforce R0 return value range */
+		if (!retval_range_within(callee->callback_ret_range, r0)) {
+			struct tnum range = tnum_range(callee->callback_ret_range.minval,
+						       callee->callback_ret_range.maxval);
+
 			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From c871d0e00f0e8c207ce8ff89025e35cc49a8a3c3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:57:00 -0800
Subject: bpf: enforce precise retval range on program exit

Similarly to subprog/callback logic, enforce return value of BPF program
using more precise smin/smax range.

We need to adjust a bunch of tests due to a changed format of an error
message.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 56 +++++++++++-----------
 .../selftests/bpf/progs/exceptions_assert.c        |  2 +-
 .../testing/selftests/bpf/progs/exceptions_fail.c  |  2 +-
 .../selftests/bpf/progs/test_global_func15.c       |  2 +-
 tools/testing/selftests/bpf/progs/timer_failure.c  |  2 +-
 .../selftests/bpf/progs/user_ringbuf_fail.c        |  2 +-
 .../bpf/progs/verifier_cgroup_inv_retcode.c        |  8 ++--
 .../bpf/progs/verifier_netfilter_retcode.c         |  2 +-
 .../bpf/progs/verifier_subprog_precision.c         |  2 +-
 9 files changed, 40 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f3d9d7de68da..9411c1046268 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -362,20 +362,23 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 
 static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 				   struct bpf_reg_state *reg,
-				   struct tnum *range, const char *ctx,
+				   struct bpf_retval_range range, const char *ctx,
 				   const char *reg_name)
 {
-	char tn_buf[48];
+	bool unknown = true;
 
-	verbose(env, "At %s the register %s ", ctx, reg_name);
-	if (!tnum_is_unknown(reg->var_off)) {
-		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose(env, "has value %s", tn_buf);
-	} else {
-		verbose(env, "has unknown scalar value");
+	verbose(env, "At %s the register %s has", ctx, reg_name);
+	if (reg->smin_value > S64_MIN) {
+		verbose(env, " smin=%lld", reg->smin_value);
+		unknown = false;
 	}
-	tnum_strn(tn_buf, sizeof(tn_buf), *range);
-	verbose(env, " should have been in %s\n", tn_buf);
+	if (reg->smax_value < S64_MAX) {
+		verbose(env, " smax=%lld", reg->smax_value);
+		unknown = false;
+	}
+	if (unknown)
+		verbose(env, " unknown scalar value");
+	verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval);
 }
 
 static bool type_may_be_null(u32 type)
@@ -9606,10 +9609,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 
 		/* enforce R0 return value range */
 		if (!retval_range_within(callee->callback_ret_range, r0)) {
-			struct tnum range = tnum_range(callee->callback_ret_range.minval,
-						       callee->callback_ret_range.maxval);
-
-			verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
+			verbose_invalid_scalar(env, r0, callee->callback_ret_range,
+					       "callback return", "R0");
 			return -EINVAL;
 		}
 		if (!calls_callback(env, callee->callsite)) {
@@ -14995,7 +14996,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 	struct tnum enforce_attach_type_range = tnum_unknown;
 	const struct bpf_prog *prog = env->prog;
 	struct bpf_reg_state *reg;
-	struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0);
+	struct bpf_retval_range range = retval_range(0, 1);
+	struct bpf_retval_range const_0 = retval_range(0, 0);
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	int err;
 	struct bpf_func_state *frame = env->cur_state->frame[0];
@@ -15043,8 +15045,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 			return -EINVAL;
 		}
 
-		if (!tnum_in(const_0, reg->var_off)) {
-			verbose_invalid_scalar(env, reg, &const_0, "async callback", reg_name);
+		if (!retval_range_within(const_0, reg)) {
+			verbose_invalid_scalar(env, reg, const_0, "async callback", reg_name);
 			return -EINVAL;
 		}
 		return 0;
@@ -15070,14 +15072,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		    env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME ||
 		    env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME)
-			range = tnum_range(1, 1);
+			range = retval_range(1, 1);
 		if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND ||
 		    env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND)
-			range = tnum_range(0, 3);
+			range = retval_range(0, 3);
 		break;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
-			range = tnum_range(0, 3);
+			range = retval_range(0, 3);
 			enforce_attach_type_range = tnum_range(2, 3);
 		}
 		break;
@@ -15090,13 +15092,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 		if (!env->prog->aux->attach_btf_id)
 			return 0;
-		range = tnum_const(0);
+		range = retval_range(0, 0);
 		break;
 	case BPF_PROG_TYPE_TRACING:
 		switch (env->prog->expected_attach_type) {
 		case BPF_TRACE_FENTRY:
 		case BPF_TRACE_FEXIT:
-			range = tnum_const(0);
+			range = retval_range(0, 0);
 			break;
 		case BPF_TRACE_RAW_TP:
 		case BPF_MODIFY_RETURN:
@@ -15108,7 +15110,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		}
 		break;
 	case BPF_PROG_TYPE_SK_LOOKUP:
-		range = tnum_range(SK_DROP, SK_PASS);
+		range = retval_range(SK_DROP, SK_PASS);
 		break;
 
 	case BPF_PROG_TYPE_LSM:
@@ -15122,12 +15124,12 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 			/* Make sure programs that attach to void
 			 * hooks don't try to modify return value.
 			 */
-			range = tnum_range(1, 1);
+			range = retval_range(1, 1);
 		}
 		break;
 
 	case BPF_PROG_TYPE_NETFILTER:
-		range = tnum_range(NF_DROP, NF_ACCEPT);
+		range = retval_range(NF_DROP, NF_ACCEPT);
 		break;
 	case BPF_PROG_TYPE_EXT:
 		/* freplace program can return anything as its return value
@@ -15143,8 +15145,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		return -EINVAL;
 	}
 
-	if (!tnum_in(range, reg->var_off)) {
-		verbose_invalid_scalar(env, reg, &range, "program exit", reg_name);
+	if (!retval_range_within(range, reg)) {
+		verbose_invalid_scalar(env, reg, range, "program exit", reg_name);
 		if (prog->expected_attach_type == BPF_LSM_CGROUP &&
 		    prog_type == BPF_PROG_TYPE_LSM &&
 		    !prog->aux->attach_func_proto->type)
diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c
index 575e7dd719c4..0ef81040da59 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_assert.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c
@@ -125,7 +125,7 @@ int check_assert_generic(struct __sk_buff *ctx)
 }
 
 SEC("?fentry/bpf_check")
-__failure __msg("At program exit the register R1 has value (0x40; 0x0)")
+__failure __msg("At program exit the register R1 has smin=64 smax=64")
 int check_assert_with_return(void *ctx)
 {
 	bpf_assert_with(!ctx, 64);
diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c
index 81ead7512ba2..9cceb6521143 100644
--- a/tools/testing/selftests/bpf/progs/exceptions_fail.c
+++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c
@@ -308,7 +308,7 @@ int reject_set_exception_cb_bad_ret1(void *ctx)
 }
 
 SEC("?fentry/bpf_check")
-__failure __msg("At program exit the register R1 has value (0x40; 0x0) should")
+__failure __msg("At program exit the register R1 has smin=64 smax=64 should")
 int reject_set_exception_cb_bad_ret2(void *ctx)
 {
 	bpf_throw(64);
diff --git a/tools/testing/selftests/bpf/progs/test_global_func15.c b/tools/testing/selftests/bpf/progs/test_global_func15.c
index b512d6a6c75e..f80207480e8a 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func15.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func15.c
@@ -13,7 +13,7 @@ __noinline int foo(unsigned int *v)
 }
 
 SEC("cgroup_skb/ingress")
-__failure __msg("At program exit the register R0 has value")
+__failure __msg("At program exit the register R0 has ")
 int global_func15(struct __sk_buff *skb)
 {
 	unsigned int v = 1;
diff --git a/tools/testing/selftests/bpf/progs/timer_failure.c b/tools/testing/selftests/bpf/progs/timer_failure.c
index 226d33b5a05c..9000da1e2120 100644
--- a/tools/testing/selftests/bpf/progs/timer_failure.c
+++ b/tools/testing/selftests/bpf/progs/timer_failure.c
@@ -30,7 +30,7 @@ static int timer_cb_ret1(void *map, int *key, struct bpf_timer *timer)
 }
 
 SEC("fentry/bpf_fentry_test1")
-__failure __msg("should have been in (0x0; 0x0)")
+__failure __msg("should have been in [0, 0]")
 int BPF_PROG2(test_ret_1, int, a)
 {
 	int key = 0;
diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
index 03ee946c6bf7..11ab25c42c36 100644
--- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
+++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
@@ -184,7 +184,7 @@ invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context)
  * not be able to write to that pointer.
  */
 SEC("?raw_tp")
-__failure __msg("At callback return the register R0 has value")
+__failure __msg("At callback return the register R0 has ")
 int user_ringbuf_callback_invalid_return(void *ctx)
 {
 	bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0);
diff --git a/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c b/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c
index d6c4a7f3f790..6e0f349f8f15 100644
--- a/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c
+++ b/tools/testing/selftests/bpf/progs/verifier_cgroup_inv_retcode.c
@@ -7,7 +7,7 @@
 
 SEC("cgroup/sock")
 __description("bpf_exit with invalid return code. test1")
-__failure __msg("R0 has value (0x0; 0xffffffff)")
+__failure __msg("smin=0 smax=4294967295 should have been in [0, 1]")
 __naked void with_invalid_return_code_test1(void)
 {
 	asm volatile ("					\
@@ -30,7 +30,7 @@ __naked void with_invalid_return_code_test2(void)
 
 SEC("cgroup/sock")
 __description("bpf_exit with invalid return code. test3")
-__failure __msg("R0 has value (0x0; 0x3)")
+__failure __msg("smin=0 smax=3 should have been in [0, 1]")
 __naked void with_invalid_return_code_test3(void)
 {
 	asm volatile ("					\
@@ -53,7 +53,7 @@ __naked void with_invalid_return_code_test4(void)
 
 SEC("cgroup/sock")
 __description("bpf_exit with invalid return code. test5")
-__failure __msg("R0 has value (0x2; 0x0)")
+__failure __msg("smin=2 smax=2 should have been in [0, 1]")
 __naked void with_invalid_return_code_test5(void)
 {
 	asm volatile ("					\
@@ -75,7 +75,7 @@ __naked void with_invalid_return_code_test6(void)
 
 SEC("cgroup/sock")
 __description("bpf_exit with invalid return code. test7")
-__failure __msg("R0 has unknown scalar value")
+__failure __msg("R0 has unknown scalar value should have been in [0, 1]")
 __naked void with_invalid_return_code_test7(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c b/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c
index 353ae6da00e1..e1ffa5d32ff0 100644
--- a/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c
+++ b/tools/testing/selftests/bpf/progs/verifier_netfilter_retcode.c
@@ -39,7 +39,7 @@ __naked void with_valid_return_code_test3(void)
 
 SEC("netfilter")
 __description("bpf_exit with invalid return code. test4")
-__failure __msg("R0 has value (0x2; 0x0)")
+__failure __msg("R0 has smin=2 smax=2 should have been in [0, 1]")
 __naked void with_invalid_return_code_test4(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index d41d2a8bb97e..0dfe3f8b69ac 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -148,7 +148,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 11: (b7) r0 = 0")
 __msg("from 10 to 12: frame1: R0=scalar(smin=umin=1001")
 /* check that branch code path marks r0 as precise, before failing */
 __msg("mark_precise: frame1: regs=r0 stack= before 9: (85) call bpf_get_prandom_u32#7")
-__msg("At callback return the register R0 has value (0x0; 0x7fffffffffffffff) should have been in (0x0; 0x1)")
+__msg("At callback return the register R0 has smin=1001 should have been in [0, 1]")
 __naked int callback_precise_return_fail(void)
 {
 	asm volatile (
-- 
cgit v1.2.3


From 0ef24c8dfae24a4b8aa2e92eac20faecdc5502e5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:57:01 -0800
Subject: bpf: unify async callback and program retval checks

Use common logic to verify program return values and async callback
return values. This allows to avoid duplication of any extra steps
necessary, like precision marking, which will be added in the next
patch.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9411c1046268..c54944af1bcc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -367,7 +367,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 {
 	bool unknown = true;
 
-	verbose(env, "At %s the register %s has", ctx, reg_name);
+	verbose(env, "%s the register %s has", ctx, reg_name);
 	if (reg->smin_value > S64_MIN) {
 		verbose(env, " smin=%lld", reg->smin_value);
 		unknown = false;
@@ -9610,7 +9610,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 		/* enforce R0 return value range */
 		if (!retval_range_within(callee->callback_ret_range, r0)) {
 			verbose_invalid_scalar(env, r0, callee->callback_ret_range,
-					       "callback return", "R0");
+					       "At callback return", "R0");
 			return -EINVAL;
 		}
 		if (!calls_callback(env, callee->callsite)) {
@@ -14993,11 +14993,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name)
 {
+	const char *exit_ctx = "At program exit";
 	struct tnum enforce_attach_type_range = tnum_unknown;
 	const struct bpf_prog *prog = env->prog;
 	struct bpf_reg_state *reg;
 	struct bpf_retval_range range = retval_range(0, 1);
-	struct bpf_retval_range const_0 = retval_range(0, 0);
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	int err;
 	struct bpf_func_state *frame = env->cur_state->frame[0];
@@ -15039,17 +15039,9 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 
 	if (frame->in_async_callback_fn) {
 		/* enforce return zero from async callbacks like timer */
-		if (reg->type != SCALAR_VALUE) {
-			verbose(env, "In async callback the register R%d is not a known value (%s)\n",
-				regno, reg_type_str(env, reg->type));
-			return -EINVAL;
-		}
-
-		if (!retval_range_within(const_0, reg)) {
-			verbose_invalid_scalar(env, reg, const_0, "async callback", reg_name);
-			return -EINVAL;
-		}
-		return 0;
+		exit_ctx = "At async callback return";
+		range = retval_range(0, 0);
+		goto enforce_retval;
 	}
 
 	if (is_subprog && !frame->in_exception_callback_fn) {
@@ -15139,15 +15131,17 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		return 0;
 	}
 
+enforce_retval:
 	if (reg->type != SCALAR_VALUE) {
-		verbose(env, "At program exit the register R%d is not a known value (%s)\n",
-			regno, reg_type_str(env, reg->type));
+		verbose(env, "%s the register R%d is not a known value (%s)\n",
+			exit_ctx, regno, reg_type_str(env, reg->type));
 		return -EINVAL;
 	}
 
 	if (!retval_range_within(range, reg)) {
-		verbose_invalid_scalar(env, reg, range, "program exit", reg_name);
-		if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+		verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
+		if (!is_subprog &&
+		    prog->expected_attach_type == BPF_LSM_CGROUP &&
 		    prog_type == BPF_PROG_TYPE_LSM &&
 		    !prog->aux->attach_func_proto->type)
 			verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
-- 
cgit v1.2.3


From eabe518de533a4291996020977054a7a7b78c7d3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:57:02 -0800
Subject: bpf: enforce precision of R0 on program/async callback return

Given we enforce a valid range for program and async callback return
value, we must mark R0 as precise to avoid incorrect state pruning.

Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c54944af1bcc..2cd150d6d141 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15138,6 +15138,10 @@ enforce_retval:
 		return -EINVAL;
 	}
 
+	err = mark_chain_precision(env, regno);
+	if (err)
+		return err;
+
 	if (!retval_range_within(range, reg)) {
 		verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name);
 		if (!is_subprog &&
-- 
cgit v1.2.3


From 81eff2e36481c5cf4a2ac906ae56c3fbd3e6f305 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Sat, 2 Dec 2023 09:57:05 -0800
Subject: bpf: simplify tnum output if a fully known constant

Emit tnum representation as just a constant if all bits are known.
Use decimal-vs-hex logic to determine exact format of emitted
constant value, just like it's done for register range values.
For that move tnum_strn() to kernel/bpf/log.c to reuse decimal-vs-hex
determination logic and constants.

Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231202175705.885270-12-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c                                            | 13 +++++++++++++
 kernel/bpf/tnum.c                                           |  6 ------
 .../selftests/bpf/progs/verifier_direct_packet_access.c     |  2 +-
 tools/testing/selftests/bpf/progs/verifier_int_ptr.c        |  2 +-
 tools/testing/selftests/bpf/progs/verifier_stack_ptr.c      |  4 ++--
 5 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 3505f3e5ae96..55d019f30e91 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -539,6 +539,19 @@ static void verbose_snum(struct bpf_verifier_env *env, s64 num)
 		verbose(env, "%#llx", num);
 }
 
+int tnum_strn(char *str, size_t size, struct tnum a)
+{
+	/* print as a constant, if tnum is fully known */
+	if (a.mask == 0) {
+		if (is_unum_decimal(a.value))
+			return snprintf(str, size, "%llu", a.value);
+		else
+			return snprintf(str, size, "%#llx", a.value);
+	}
+	return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
+}
+EXPORT_SYMBOL_GPL(tnum_strn);
+
 static void print_scalar_ranges(struct bpf_verifier_env *env,
 				const struct bpf_reg_state *reg,
 				const char **sep)
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index f4c91c9b27d7..9dbc31b25e3d 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -172,12 +172,6 @@ bool tnum_in(struct tnum a, struct tnum b)
 	return a.value == b.value;
 }
 
-int tnum_strn(char *str, size_t size, struct tnum a)
-{
-	return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask);
-}
-EXPORT_SYMBOL_GPL(tnum_strn);
-
 int tnum_sbin(char *str, size_t size, struct tnum a)
 {
 	size_t n;
diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c
index 99a23dea8233..be95570ab382 100644
--- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c
@@ -411,7 +411,7 @@ l0_%=:	r0 = 0;						\
 
 SEC("tc")
 __description("direct packet access: test17 (pruning, alignment)")
-__failure __msg("misaligned packet access off 2+(0x0; 0x0)+15+-4 size 4")
+__failure __msg("misaligned packet access off 2+0+15+-4 size 4")
 __flag(BPF_F_STRICT_ALIGNMENT)
 __naked void packet_access_test17_pruning_alignment(void)
 {
diff --git a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c
index b054f9c48143..74d9cad469d9 100644
--- a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c
+++ b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c
@@ -67,7 +67,7 @@ __naked void ptr_to_long_half_uninitialized(void)
 
 SEC("cgroup/sysctl")
 __description("ARG_PTR_TO_LONG misaligned")
-__failure __msg("misaligned stack access off (0x0; 0x0)+-20+0 size 8")
+__failure __msg("misaligned stack access off 0+-20+0 size 8")
 __naked void arg_ptr_to_long_misaligned(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c
index e0f77e3e7869..417c61cd4b19 100644
--- a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c
+++ b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c
@@ -37,7 +37,7 @@ __naked void ptr_to_stack_store_load(void)
 
 SEC("socket")
 __description("PTR_TO_STACK store/load - bad alignment on off")
-__failure __msg("misaligned stack access off (0x0; 0x0)+-8+2 size 8")
+__failure __msg("misaligned stack access off 0+-8+2 size 8")
 __failure_unpriv
 __naked void load_bad_alignment_on_off(void)
 {
@@ -53,7 +53,7 @@ __naked void load_bad_alignment_on_off(void)
 
 SEC("socket")
 __description("PTR_TO_STACK store/load - bad alignment on reg")
-__failure __msg("misaligned stack access off (0x0; 0x0)+-10+8 size 8")
+__failure __msg("misaligned stack access off 0+-10+8 size 8")
 __failure_unpriv
 __naked void load_bad_alignment_on_reg(void)
 {
-- 
cgit v1.2.3


From 5bd90cdc65ef9ef5e13c9ff23620079db5c608a0 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Sun, 3 Dec 2023 20:12:48 -0500
Subject: bpf: Minor logging improvement

One place where we were logging a register was only logging the variable
part, not also the fixed part.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231204011248.2040084-1-andreimatei1@gmail.com
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cd150d6d141..cdb4f5f0ba79 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6585,8 +6585,8 @@ static int check_stack_access_within_bounds(
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n",
-				err_extra, regno, tn_buf, access_size);
+			verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
+				err_extra, regno, tn_buf, off, access_size);
 		}
 	}
 	return err;
-- 
cgit v1.2.3


From 659aa050a53817157b7459529538598a6449c1d3 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Mon, 13 Nov 2023 14:13:24 -0800
Subject: kernel/resource: Increment by align value in get_free_mem_region()

Currently get_free_mem_region() searches for available capacity
in increments equal to the region size being requested. This can
cause the search to take giant steps through the resource leaving
needless gaps and missing available space.

Specifically 'cxl create-region' fails with ERANGE even though capacity
of the given size and CXL's expected 256M x InterleaveWays alignment can
be satisfied.

Replace the total-request-size increment with a next alignment increment
so that the next possible address is always examined for availability.

Fixes: 14b80582c43e ("resource: Introduce alloc_free_mem_region()")
Reported-by: Dmytro Adamenko <dmytro.adamenko@intel.com>
Reported-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/20231113221324.1118092-1-alison.schofield@intel.com
Cc: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 kernel/resource.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 866ef3663a0b..91be1bc50b60 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1844,8 +1844,8 @@ get_free_mem_region(struct device *dev, struct resource *base,
 
 	write_lock(&resource_lock);
 	for (addr = gfr_start(base, size, align, flags);
-	     gfr_continue(base, addr, size, flags);
-	     addr = gfr_next(addr, size, flags)) {
+	     gfr_continue(base, addr, align, flags);
+	     addr = gfr_next(addr, align, flags)) {
 		if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
 		    REGION_DISJOINT)
 			continue;
-- 
cgit v1.2.3


From 169410eba271afc9f0fb476d996795aa26770c6d Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:19 +0800
Subject: bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers

These three bpf_map_{lookup,update,delete}_elem() helpers are also
available for sleepable bpf program, so add the corresponding lock
assertion for sleepable bpf program, otherwise the following warning
will be reported when a sleepable bpf program manipulates bpf map under
interpreter mode (aka bpf_jit_enable=0):

  WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ......
  CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ......
  RIP: 0010:bpf_map_lookup_elem+0x54/0x60
  ......
  Call Trace:
   <TASK>
   ? __warn+0xa5/0x240
   ? bpf_map_lookup_elem+0x54/0x60
   ? report_bug+0x1ba/0x1f0
   ? handle_bug+0x40/0x80
   ? exc_invalid_op+0x18/0x50
   ? asm_exc_invalid_op+0x1b/0x20
   ? __pfx_bpf_map_lookup_elem+0x10/0x10
   ? rcu_lockdep_current_cpu_online+0x65/0xb0
   ? rcu_is_watching+0x23/0x50
   ? bpf_map_lookup_elem+0x54/0x60
   ? __pfx_bpf_map_lookup_elem+0x10/0x10
   ___bpf_prog_run+0x513/0x3b70
   __bpf_prog_run32+0x9d/0xd0
   ? __bpf_prog_enter_sleepable_recur+0xad/0x120
   ? __bpf_prog_enter_sleepable_recur+0x3e/0x120
   bpf_trampoline_6442580665+0x4d/0x1000
   __x64_sys_getpgid+0x5/0x30
   ? do_syscall_64+0x36/0xb0
   entry_SYSCALL_64_after_hwframe+0x6e/0x76
   </TASK>

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b45a8381f9bd..ee9bdf29246a 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -32,12 +32,13 @@
  *
  * Different map implementations will rely on rcu in map methods
  * lookup/update/delete, therefore eBPF programs must run under rcu lock
- * if program is allowed to access maps, so check rcu_read_lock_held in
- * all three functions.
+ * if program is allowed to access maps, so check rcu_read_lock_held() or
+ * rcu_read_lock_trace_held() in all three functions.
  */
 BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+		     !rcu_read_lock_bh_held());
 	return (unsigned long) map->ops->map_lookup_elem(map, key);
 }
 
@@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 	   void *, value, u64, flags)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+		     !rcu_read_lock_bh_held());
 	return map->ops->map_update_elem(map, key, value, flags);
 }
 
@@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
 
 BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+		     !rcu_read_lock_bh_held());
 	return map->ops->map_delete_elem(map, key);
 }
 
-- 
cgit v1.2.3


From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:20 +0800
Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr()

map is the pointer of outer map, and need_defer needs some explanation.
need_defer tells the implementation to defer the reference release of
the passed element and ensure that the element is still alive before
the bpf program, which may manipulate it, exits.

The following three cases will invoke map_fd_put_ptr() and different
need_defer values will be passed to these callers:

1) release the reference of the old element in the map during map update
   or map deletion. The release must be deferred, otherwise the bpf
   program may incur use-after-free problem, so need_defer needs to be
   true.
2) release the reference of the to-be-added element in the error path of
   map update. The to-be-added element is not visible to any bpf
   program, so it is OK to pass false for need_defer parameter.
3) release the references of all elements in the map during map release.
   Any bpf program which has access to the map must have been exited and
   released, so need_defer=false will be OK.

These two parameters will be used by the following patches to fix the
potential use-after-free problem for map-in-map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  6 +++++-
 kernel/bpf/arraymap.c   | 12 +++++++-----
 kernel/bpf/hashtab.c    |  6 +++---
 kernel/bpf/map_in_map.c |  2 +-
 kernel/bpf/map_in_map.h |  2 +-
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index eb447b0a9423..d273348cfb2f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -106,7 +106,11 @@ struct bpf_map_ops {
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
 				int fd);
-	void (*map_fd_put_ptr)(void *ptr);
+	/* If need_defer is true, the implementation should guarantee that
+	 * the to-be-put element is still alive before the bpf program, which
+	 * may manipulate it, exists.
+	 */
+	void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer);
 	int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd..f9aed5909d6e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	}
 
 	if (old_ptr)
-		map->ops->map_fd_put_ptr(old_ptr);
+		map->ops->map_fd_put_ptr(map, old_ptr, true);
 	return 0;
 }
 
@@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 
 	if (old_ptr) {
-		map->ops->map_fd_put_ptr(old_ptr);
+		map->ops->map_fd_put_ptr(map, old_ptr, true);
 		return 0;
 	} else {
 		return -ENOENT;
@@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
 	return prog;
 }
 
-static void prog_fd_array_put_ptr(void *ptr)
+static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	/* bpf_prog is freed after one RCU or tasks trace grace period */
 	bpf_prog_put(ptr);
 }
 
@@ -1239,8 +1240,9 @@ err_out:
 	return ee;
 }
 
-static void perf_event_fd_array_put_ptr(void *ptr)
+static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
+	/* bpf_perf_event is freed after one RCU grace period */
 	bpf_event_entry_free_rcu(ptr);
 }
 
@@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 	return cgroup_get_from_fd(fd);
 }
 
-static void cgroup_fd_array_put_ptr(void *ptr)
+static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	/* cgroup_put free cgrp after a rcu grace period */
 	cgroup_put(ptr);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fd8d4b0addfc..5b9146fa825f 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 
 	if (map->ops->map_fd_put_ptr) {
 		ptr = fd_htab_map_get_ptr(map, l);
-		map->ops->map_fd_put_ptr(ptr);
+		map->ops->map_fd_put_ptr(map, ptr, true);
 	}
 }
 
@@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map)
 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
 			void *ptr = fd_htab_map_get_ptr(map, l);
 
-			map->ops->map_fd_put_ptr(ptr);
+			map->ops->map_fd_put_ptr(map, ptr, false);
 		}
 	}
 
@@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 
 	ret = htab_map_update_elem(map, key, &ptr, map_flags);
 	if (ret)
-		map->ops->map_fd_put_ptr(ptr);
+		map->ops->map_fd_put_ptr(map, ptr, false);
 
 	return ret;
 }
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index cd5eafaba97e..2dfeb5835e16 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 	return inner_map;
 }
 
-void bpf_map_fd_put_ptr(void *ptr)
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	/* ptr->ops->map_free() has to go through one
 	 * rcu grace period by itself.
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index bcb7534afb3c..7d61602354de 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd);
 void bpf_map_meta_free(struct bpf_map *map_meta);
 void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
 			 int ufd);
-void bpf_map_fd_put_ptr(void *ptr);
+void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer);
 u32 bpf_map_fd_sys_lookup_elem(void *ptr);
 
 #endif
-- 
cgit v1.2.3


From 79d93b3c6ffd79abcd8e43345980aa1e904879c4 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:21 +0800
Subject: bpf: Set need_defer as false when clearing fd array during map free

Both map deletion operation, map release and map free operation use
fd_array_map_delete_elem() to remove the element from fd array and
need_defer is always true in fd_array_map_delete_elem(). For the map
deletion operation and map release operation, need_defer=true is
necessary, because the bpf program, which accesses the element in fd
array, may still alive. However for map free operation, it is certain
that the bpf program which owns the fd array has already been exited, so
setting need_defer as false is appropriate for map free operation.

So fix it by adding need_defer parameter to bpf_fd_array_map_clear() and
adding a new helper __fd_array_map_delete_elem() to handle the map
deletion, map release and map free operations correspondingly.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f9aed5909d6e..4a4a67956e21 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	return 0;
 }
 
-static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *old_ptr;
@@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	}
 
 	if (old_ptr) {
-		map->ops->map_fd_put_ptr(map, old_ptr, true);
+		map->ops->map_fd_put_ptr(map, old_ptr, need_defer);
 		return 0;
 	} else {
 		return -ENOENT;
 	}
 }
 
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return __fd_array_map_delete_elem(map, key, true);
+}
+
 static void *prog_fd_array_get_ptr(struct bpf_map *map,
 				   struct file *map_file, int fd)
 {
@@ -925,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
-static void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
 	for (i = 0; i < array->map.max_entries; i++)
-		fd_array_map_delete_elem(map, &i);
+		__fd_array_map_delete_elem(map, &i, need_defer);
 }
 
 static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -1110,7 +1115,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_array_aux,
 					   work)->map;
-	bpf_fd_array_map_clear(map);
+	bpf_fd_array_map_clear(map, true);
 	bpf_map_put(map);
 }
 
@@ -1260,7 +1265,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 	for (i = 0; i < array->map.max_entries; i++) {
 		ee = READ_ONCE(array->ptrs[i]);
 		if (ee && ee->map_file == map_file)
-			fd_array_map_delete_elem(map, &i);
+			__fd_array_map_delete_elem(map, &i, true);
 	}
 	rcu_read_unlock();
 }
@@ -1268,7 +1273,7 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 static void perf_event_fd_array_map_free(struct bpf_map *map)
 {
 	if (map->map_flags & BPF_F_PRESERVE_ELEMS)
-		bpf_fd_array_map_clear(map);
+		bpf_fd_array_map_clear(map, false);
 	fd_array_map_free(map);
 }
 
@@ -1304,7 +1309,7 @@ static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_de
 
 static void cgroup_fd_array_free(struct bpf_map *map)
 {
-	bpf_fd_array_map_clear(map);
+	bpf_fd_array_map_clear(map, false);
 	fd_array_map_free(map);
 }
 
@@ -1349,7 +1354,7 @@ static void array_of_map_free(struct bpf_map *map)
 	 * is protected by fdget/fdput.
 	 */
 	bpf_map_meta_free(map->inner_map_meta);
-	bpf_fd_array_map_clear(map);
+	bpf_fd_array_map_clear(map, false);
 	fd_array_map_free(map);
 }
 
-- 
cgit v1.2.3


From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:22 +0800
Subject: bpf: Defer the free of inner map when necessary

When updating or deleting an inner map in map array or map htab, the map
may still be accessed by non-sleepable program or sleepable program.
However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map
directly through bpf_map_put(), if the ref-counter is the last one
(which is true for most cases), the inner map will be freed by
ops->map_free() in a kworker. But for now, most .map_free() callbacks
don't use synchronize_rcu() or its variants to wait for the elapse of a
RCU grace period, so after the invocation of ops->map_free completes,
the bpf program which is accessing the inner map may incur
use-after-free problem.

Fix the free of inner map by invoking bpf_map_free_deferred() after both
one RCU grace period and one tasks trace RCU grace period if the inner
map has been removed from the outer map before. The deferment is
accomplished by using call_rcu() or call_rcu_tasks_trace() when
releasing the last ref-counter of bpf map. The newly-added rcu_head
field in bpf_map shares the same storage space with work field to
reduce the size of bpf_map.

Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.")
Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  7 ++++++-
 kernel/bpf/map_in_map.c | 11 ++++++++---
 kernel/bpf/syscall.c    | 32 +++++++++++++++++++++++++++-----
 3 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d273348cfb2f..de3bd03cbeea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -276,7 +276,11 @@ struct bpf_map {
 	 */
 	atomic64_t refcnt ____cacheline_aligned;
 	atomic64_t usercnt;
-	struct work_struct work;
+	/* rcu is used before freeing and work is only used during freeing */
+	union {
+		struct work_struct work;
+		struct rcu_head rcu;
+	};
 	struct mutex freeze_mutex;
 	atomic64_t writecnt;
 	/* 'Ownership' of program-containing map is claimed by the first program
@@ -292,6 +296,7 @@ struct bpf_map {
 	} owner;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
+	bool free_after_mult_rcu_gp;
 	s64 __percpu *elem_count;
 };
 
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 2dfeb5835e16..3248ff5d8161 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 
 void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
-	/* ptr->ops->map_free() has to go through one
-	 * rcu grace period by itself.
+	struct bpf_map *inner_map = ptr;
+
+	/* The inner map may still be used by both non-sleepable and sleepable
+	 * bpf program, so free it after one RCU grace period and one tasks
+	 * trace RCU grace period.
 	 */
-	bpf_map_put(ptr);
+	if (need_defer)
+		WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+	bpf_map_put(inner_map);
 }
 
 u32 bpf_map_fd_sys_lookup_elem(void *ptr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5e43ddd1b83f..dd515f6b9741 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -719,6 +719,28 @@ static void bpf_map_put_uref(struct bpf_map *map)
 	}
 }
 
+static void bpf_map_free_in_work(struct bpf_map *map)
+{
+	INIT_WORK(&map->work, bpf_map_free_deferred);
+	/* Avoid spawning kworkers, since they all might contend
+	 * for the same mutex like slab_mutex.
+	 */
+	queue_work(system_unbound_wq, &map->work);
+}
+
+static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
+{
+	bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
+}
+
+static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
+{
+	if (rcu_trace_implies_rcu_gp())
+		bpf_map_free_rcu_gp(rcu);
+	else
+		call_rcu(rcu, bpf_map_free_rcu_gp);
+}
+
 /* decrement map refcnt and schedule it for freeing via workqueue
  * (underlying map implementation ops->map_free() might sleep)
  */
@@ -728,11 +750,11 @@ void bpf_map_put(struct bpf_map *map)
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map);
 		btf_put(map->btf);
-		INIT_WORK(&map->work, bpf_map_free_deferred);
-		/* Avoid spawning kworkers, since they all might contend
-		 * for the same mutex like slab_mutex.
-		 */
-		queue_work(system_unbound_wq, &map->work);
+
+		if (READ_ONCE(map->free_after_mult_rcu_gp))
+			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+		else
+			bpf_map_free_in_work(map);
 	}
 }
 EXPORT_SYMBOL_GPL(bpf_map_put);
-- 
cgit v1.2.3


From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 4 Dec 2023 22:04:23 +0800
Subject: bpf: Optimize the free of inner map

When removing the inner map from the outer map, the inner map will be
freed after one RCU grace period and one RCU tasks trace grace
period, so it is certain that the bpf program, which may access the
inner map, has exited before the inner map is freed.

However there is no need to wait for one RCU tasks trace grace period if
the outer map is only accessed by non-sleepable program. So adding
sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding
the outer map into env->used_maps for sleepable program. Although the
max number of bpf program is INT_MAX - 1, the number of bpf programs
which are being loaded may be greater than INT_MAX, so using atomic64_t
instead of atomic_t for sleepable_refcnt. When removing the inner map
from the outer map, using sleepable_refcnt to decide whether or not a
RCU tasks trace grace period is needed before freeing the inner map.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  2 ++
 kernel/bpf/core.c       |  4 ++++
 kernel/bpf/map_in_map.c | 14 +++++++++-----
 kernel/bpf/syscall.c    |  8 ++++++++
 kernel/bpf/verifier.c   |  4 +++-
 5 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de3bd03cbeea..10e5e4d8a00f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -297,6 +297,8 @@ struct bpf_map {
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
 	bool free_after_mult_rcu_gp;
+	bool free_after_rcu_gp;
+	atomic64_t sleepable_refcnt;
 	s64 __percpu *elem_count;
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cd3afe57ece3..4b813da8d6c0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2664,12 +2664,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
 			  struct bpf_map **used_maps, u32 len)
 {
 	struct bpf_map *map;
+	bool sleepable;
 	u32 i;
 
+	sleepable = aux->sleepable;
 	for (i = 0; i < len; i++) {
 		map = used_maps[i];
 		if (map->ops->map_poke_untrack)
 			map->ops->map_poke_untrack(map, aux);
+		if (sleepable)
+			atomic64_dec(&map->sleepable_refcnt);
 		bpf_map_put(map);
 	}
 }
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3248ff5d8161..8ef269e66ba5 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -131,12 +131,16 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
 {
 	struct bpf_map *inner_map = ptr;
 
-	/* The inner map may still be used by both non-sleepable and sleepable
-	 * bpf program, so free it after one RCU grace period and one tasks
-	 * trace RCU grace period.
+	/* Defer the freeing of inner map according to the sleepable attribute
+	 * of bpf program which owns the outer map, so unnecessary waiting for
+	 * RCU tasks trace grace period can be avoided.
 	 */
-	if (need_defer)
-		WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+	if (need_defer) {
+		if (atomic64_read(&map->sleepable_refcnt))
+			WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
+		else
+			WRITE_ONCE(inner_map->free_after_rcu_gp, true);
+	}
 	bpf_map_put(inner_map);
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dd515f6b9741..ebaccf77d56e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -751,8 +751,11 @@ void bpf_map_put(struct bpf_map *map)
 		bpf_map_free_id(map);
 		btf_put(map->btf);
 
+		WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
 		if (READ_ONCE(map->free_after_mult_rcu_gp))
 			call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+		else if (READ_ONCE(map->free_after_rcu_gp))
+			call_rcu(&map->rcu, bpf_map_free_rcu_gp);
 		else
 			bpf_map_free_in_work(map);
 	}
@@ -5345,6 +5348,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
 		goto out_unlock;
 	}
 
+	/* The bpf program will not access the bpf map, but for the sake of
+	 * simplicity, increase sleepable_refcnt for sleepable program as well.
+	 */
+	if (prog->aux->sleepable)
+		atomic64_inc(&map->sleepable_refcnt);
 	memcpy(used_maps_new, used_maps_old,
 	       sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
 	used_maps_new[prog->aux->used_map_cnt] = map;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdb4f5f0ba79..1ed39665f802 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17889,10 +17889,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 				return -E2BIG;
 			}
 
+			if (env->prog->aux->sleepable)
+				atomic64_inc(&map->sleepable_refcnt);
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
-			 * and all maps are released in free_used_maps()
+			 * and all maps are released in bpf_free_used_maps()
 			 */
 			bpf_map_inc(map);
 
-- 
cgit v1.2.3


From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Oct 2023 16:36:55 +0200
Subject: cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks

The CPUHP_SLAB_PREPARE hooks are only used by SLAB which is removed.
SLUB defines them as NULL, so we can remove those altogether.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/cpuhotplug.h | 1 -
 include/linux/slab.h       | 8 --------
 kernel/cpu.c               | 5 -----
 3 files changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index d305db70674b..07cb8f7030b6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -108,7 +108,6 @@ enum cpuhp_state {
 	CPUHP_X2APIC_PREPARE,
 	CPUHP_SMPCFD_PREPARE,
 	CPUHP_RELAY_PREPARE,
-	CPUHP_SLAB_PREPARE,
 	CPUHP_MD_RAID5_PREPARE,
 	CPUHP_RCUTREE_PREP,
 	CPUHP_CPUIDLE_COUPLED_PREPARE,
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d6d6ffeeb9a2..34e43cddc520 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size);
 
 void __init kmem_cache_init_late(void);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_SLAB)
-int slab_prepare_cpu(unsigned int cpu);
-int slab_dead_cpu(unsigned int cpu);
-#else
-#define slab_prepare_cpu	NULL
-#define slab_dead_cpu		NULL
-#endif
-
 #endif	/* _LINUX_SLAB_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9e4c6780adde..530b026d95a1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2125,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.startup.single		= relay_prepare_cpu,
 		.teardown.single	= NULL,
 	},
-	[CPUHP_SLAB_PREPARE] = {
-		.name			= "slab:prepare",
-		.startup.single		= slab_prepare_cpu,
-		.teardown.single	= slab_dead_cpu,
-	},
 	[CPUHP_RCUTREE_PREP] = {
 		.name			= "RCU/tree:prepare",
 		.startup.single		= rcutree_prepare_cpu,
-- 
cgit v1.2.3


From 6ac805d13870925c787a28e3fe5cc73610cacd03 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Dec 2023 10:47:49 -0700
Subject: iov_iter: remove unused 'iov' argument from import_single_range()

It is entirely unused, just get rid of it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20231204174827.1258875-2-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/block/ublk_drv.c         | 6 ++----
 drivers/char/random.c            | 6 ++----
 fs/aio.c                         | 2 +-
 include/linux/uio.h              | 2 +-
 kernel/trace/trace_events_user.c | 3 +--
 lib/iov_iter.c                   | 2 +-
 net/ipv4/tcp.c                   | 6 ++----
 net/socket.c                     | 6 ++----
 security/keys/keyctl.c           | 3 +--
 9 files changed, 13 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 83600b45e12a..5656b0a1233d 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -893,11 +893,10 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 	 */
 	if (ublk_need_map_req(req)) {
 		struct iov_iter iter;
-		struct iovec iov;
 		const int dir = ITER_DEST;
 
 		import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
-				&iov, &iter);
+					&iter);
 
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
@@ -915,13 +914,12 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
 
 	if (ublk_need_unmap_req(req)) {
 		struct iov_iter iter;
-		struct iovec iov;
 		const int dir = ITER_SOURCE;
 
 		WARN_ON_ONCE(io->res > rq_bytes);
 
 		import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
-				&iov, &iter);
+					&iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4a9c79391dee..e79ae238b30d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1364,7 +1364,6 @@ static void __cold try_to_generate_entropy(void)
 SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
 {
 	struct iov_iter iter;
-	struct iovec iov;
 	int ret;
 
 	if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE))
@@ -1385,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags
 			return ret;
 	}
 
-	ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter);
+	ret = import_single_range(ITER_DEST, ubuf, len, &iter);
 	if (unlikely(ret))
 		return ret;
 	return get_random_bytes_user(&iter);
@@ -1491,7 +1490,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		return 0;
 	case RNDADDENTROPY: {
 		struct iov_iter iter;
-		struct iovec iov;
 		ssize_t ret;
 		int len;
 
@@ -1503,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 		if (get_user(len, p++))
 			return -EFAULT;
-		ret = import_single_range(ITER_SOURCE, p, len, &iov, &iter);
+		ret = import_single_range(ITER_SOURCE, p, len, &iter);
 		if (unlikely(ret))
 			return ret;
 		ret = write_pool_user(&iter);
diff --git a/fs/aio.c b/fs/aio.c
index f8589caef9c1..251eeaef7fbf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	size_t len = iocb->aio_nbytes;
 
 	if (!vectored) {
-		ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
+		ssize_t ret = import_single_range(rw, buf, len, iter);
 		*iovec = NULL;
 		return ret;
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index b6214cbf2a43..bfafd3542fa7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -348,7 +348,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
 int import_single_range(int type, void __user *buf, size_t len,
-		 struct iovec *iov, struct iov_iter *i);
+		 struct iov_iter *i);
 int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
 
 static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..4efc75d90a0d 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2177,14 +2177,13 @@ static int user_events_open(struct inode *node, struct file *file)
 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 				 size_t count, loff_t *ppos)
 {
-	struct iovec iov;
 	struct iov_iter i;
 
 	if (unlikely(*ppos != 0))
 		return -EFAULT;
 
 	if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
-					 count, &iov, &i)))
+					 count, &i)))
 		return -EFAULT;
 
 	return user_events_write_core(file, &i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index de7d11cf4c63..d60c73354e1f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1370,7 +1370,7 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 EXPORT_SYMBOL(import_iovec);
 
 int import_single_range(int rw, void __user *buf, size_t len,
-		 struct iovec *iov, struct iov_iter *i)
+			struct iov_iter *i)
 {
 	if (len > MAX_RW_COUNT)
 		len = MAX_RW_COUNT;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53bcc17c91e4..57cf3adb191a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1849,7 +1849,6 @@ static int receive_fallback_to_copy(struct sock *sk,
 {
 	unsigned long copy_address = (unsigned long)zc->copybuf_address;
 	struct msghdr msg = {};
-	struct iovec iov;
 	int err;
 
 	zc->length = 0;
@@ -1859,7 +1858,7 @@ static int receive_fallback_to_copy(struct sock *sk,
 		return -EINVAL;
 
 	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  inq, &iov, &msg.msg_iter);
+				  inq, &msg.msg_iter);
 	if (err)
 		return err;
 
@@ -1886,14 +1885,13 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
 {
 	unsigned long copy_address = (unsigned long)zc->copybuf_address;
 	struct msghdr msg = {};
-	struct iovec iov;
 	int err;
 
 	if (copy_address != zc->copybuf_address)
 		return -EINVAL;
 
 	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  copylen, &iov, &msg.msg_iter);
+				  copylen, &msg.msg_iter);
 	if (err)
 		return err;
 	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
diff --git a/net/socket.c b/net/socket.c
index 3379c64217a4..1f0d0e8d0a50 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2161,10 +2161,9 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	struct sockaddr_storage address;
 	int err;
 	struct msghdr msg;
-	struct iovec iov;
 	int fput_needed;
 
-	err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter);
+	err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
@@ -2226,11 +2225,10 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
 		.msg_name = addr ? (struct sockaddr *)&address : NULL,
 	};
 	struct socket *sock;
-	struct iovec iov;
 	int err, err2;
 	int fput_needed;
 
-	err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter);
+	err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 19be69fa4d05..193df7ca3ca8 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1252,12 +1252,11 @@ long keyctl_instantiate_key(key_serial_t id,
 			    key_serial_t ringid)
 {
 	if (_payload && plen) {
-		struct iovec iov;
 		struct iov_iter from;
 		int ret;
 
 		ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen,
-					  &iov, &from);
+					  &from);
 		if (unlikely(ret))
 			return ret;
 
-- 
cgit v1.2.3


From 9fd7874c0e5c89d7da0b4442271696ec0f8edcba Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 4 Dec 2023 10:47:50 -0700
Subject: iov_iter: replace import_single_range() with import_ubuf()

With the removal of the 'iov' argument to import_single_range(), the two
functions are now fully identical. Convert the import_single_range()
callers to import_ubuf(), and remove the former fully.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/r/20231204174827.1258875-3-axboe@kernel.dk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/block/ublk_drv.c         |  7 ++-----
 drivers/char/random.c            |  4 ++--
 fs/aio.c                         |  2 +-
 include/linux/uio.h              |  2 --
 kernel/trace/trace_events_user.c |  3 +--
 lib/iov_iter.c                   | 13 -------------
 net/ipv4/tcp.c                   |  8 ++++----
 net/socket.c                     |  4 ++--
 security/keys/keyctl.c           |  4 ++--
 9 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5656b0a1233d..3eaf02ebeebe 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -895,9 +895,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 		struct iov_iter iter;
 		const int dir = ITER_DEST;
 
-		import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
-					&iter);
-
+		import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
@@ -918,8 +916,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
 
 		WARN_ON_ONCE(io->res > rq_bytes);
 
-		import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
-					&iter);
+		import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
 		return ublk_copy_user_pages(req, 0, &iter, dir);
 	}
 	return rq_bytes;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index e79ae238b30d..456be28ba67c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1384,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags
 			return ret;
 	}
 
-	ret = import_single_range(ITER_DEST, ubuf, len, &iter);
+	ret = import_ubuf(ITER_DEST, ubuf, len, &iter);
 	if (unlikely(ret))
 		return ret;
 	return get_random_bytes_user(&iter);
@@ -1501,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 			return -EINVAL;
 		if (get_user(len, p++))
 			return -EFAULT;
-		ret = import_single_range(ITER_SOURCE, p, len, &iter);
+		ret = import_ubuf(ITER_SOURCE, p, len, &iter);
 		if (unlikely(ret))
 			return ret;
 		ret = write_pool_user(&iter);
diff --git a/fs/aio.c b/fs/aio.c
index 251eeaef7fbf..4ea639509d41 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	size_t len = iocb->aio_nbytes;
 
 	if (!vectored) {
-		ssize_t ret = import_single_range(rw, buf, len, iter);
+		ssize_t ret = import_ubuf(rw, buf, len, iter);
 		*iovec = NULL;
 		return ret;
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index bfafd3542fa7..bea9c89922d9 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
-int import_single_range(int type, void __user *buf, size_t len,
-		 struct iov_iter *i);
 int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
 
 static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 4efc75d90a0d..e76f5e1efdf2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2182,8 +2182,7 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 	if (unlikely(*ppos != 0))
 		return -EFAULT;
 
-	if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
-					 count, &i)))
+	if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
 		return -EFAULT;
 
 	return user_events_write_core(file, &i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d60c73354e1f..009875bc95aa 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1369,19 +1369,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec,
 }
 EXPORT_SYMBOL(import_iovec);
 
-int import_single_range(int rw, void __user *buf, size_t len,
-			struct iov_iter *i)
-{
-	if (len > MAX_RW_COUNT)
-		len = MAX_RW_COUNT;
-	if (unlikely(!access_ok(buf, len)))
-		return -EFAULT;
-
-	iov_iter_ubuf(i, rw, buf, len);
-	return 0;
-}
-EXPORT_SYMBOL(import_single_range);
-
 int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
 {
 	if (len > MAX_RW_COUNT)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 57cf3adb191a..54d3c762d400 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1857,8 +1857,8 @@ static int receive_fallback_to_copy(struct sock *sk,
 	if (copy_address != zc->copybuf_address)
 		return -EINVAL;
 
-	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  inq, &msg.msg_iter);
+	err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq,
+			  &msg.msg_iter);
 	if (err)
 		return err;
 
@@ -1890,8 +1890,8 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
 	if (copy_address != zc->copybuf_address)
 		return -EINVAL;
 
-	err = import_single_range(ITER_DEST, (void __user *)copy_address,
-				  copylen, &msg.msg_iter);
+	err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen,
+			  &msg.msg_iter);
 	if (err)
 		return err;
 	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
diff --git a/net/socket.c b/net/socket.c
index 1f0d0e8d0a50..5d49ae0c1b79 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2163,7 +2163,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
 	struct msghdr msg;
 	int fput_needed;
 
-	err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter);
+	err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
@@ -2228,7 +2228,7 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
 	int err, err2;
 	int fput_needed;
 
-	err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter);
+	err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
 	if (unlikely(err))
 		return err;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 193df7ca3ca8..10ba439968f7 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1255,8 +1255,8 @@ long keyctl_instantiate_key(key_serial_t id,
 		struct iov_iter from;
 		int ret;
 
-		ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen,
-					  &from);
+		ret = import_ubuf(ITER_SOURCE, (void __user *)_payload, plen,
+				  &from);
 		if (unlikely(ret))
 			return ret;
 
-- 
cgit v1.2.3


From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:39 -0800
Subject: bpf: support non-r10 register spill/fill to/from stack in precision
 tracking

Use instruction (jump) history to record instructions that performed
register spill/fill to/from stack, regardless if this was done through
read-only r10 register, or any other register after copying r10 into it
*and* potentially adjusting offset.

To make this work reliably, we push extra per-instruction flags into
instruction history, encoding stack slot index (spi) and stack frame
number in extra 10 bit flags we take away from prev_idx in instruction
history. We don't touch idx field for maximum performance, as it's
checked most frequently during backtracking.

This change removes basically the last remaining practical limitation of
precision backtracking logic in BPF verifier. It fixes known
deficiencies, but also opens up new opportunities to reduce number of
verified states, explored in the subsequent patches.

There are only three differences in selftests' BPF object files
according to veristat, all in the positive direction (less states).

File                                    Program        Insns (A)  Insns (B)  Insns  (DIFF)  States (A)  States (B)  States (DIFF)
--------------------------------------  -------------  ---------  ---------  -------------  ----------  ----------  -------------
test_cls_redirect_dynptr.bpf.linked3.o  cls_redirect        2987       2864  -123 (-4.12%)         240         231    -9 (-3.75%)
xdp_synproxy_kern.bpf.linked3.o         syncookie_tc       82848      82661  -187 (-0.23%)        5107        5073   -34 (-0.67%)
xdp_synproxy_kern.bpf.linked3.o         syncookie_xdp      85116      84964  -152 (-0.18%)        5162        5130   -32 (-0.62%)

Note, I avoided renaming jmp_history to more generic insn_hist to
minimize number of lines changed and potential merge conflicts between
bpf and bpf-next trees.

Notice also cur_hist_entry pointer reset to NULL at the beginning of
instruction verification loop. This pointer avoids the problem of
relying on last jump history entry's insn_idx to determine whether we
already have entry for current instruction or not. It can happen that we
added jump history entry because current instruction is_jmp_point(), but
also we need to add instruction flags for stack access. In this case, we
don't want to entries, so we need to reuse last added entry, if it is
present.

Relying on insn_idx comparison has the same ambiguity problem as the one
that was fixed recently in [0], so we avoid that.

  [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reported-by: Tao Lyu <tao.lyu@epfl.ch>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |  31 +++-
 kernel/bpf/verifier.c                              | 175 ++++++++++++---------
 .../bpf/progs/verifier_subprog_precision.c         |  23 ++-
 tools/testing/selftests/bpf/verifier/precise.c     |  38 +++--
 4 files changed, 169 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3378cc753061..bada59812e00 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -325,12 +325,34 @@ struct bpf_func_state {
 	int allocated_stack;
 };
 
-struct bpf_idx_pair {
-	u32 prev_idx;
+#define MAX_CALL_FRAMES 8
+
+/* instruction history flags, used in bpf_jmp_history_entry.flags field */
+enum {
+	/* instruction references stack slot through PTR_TO_STACK register;
+	 * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
+	 * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512,
+	 * 8 bytes per slot, so slot index (spi) is [0, 63])
+	 */
+	INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */
+
+	INSN_F_SPI_MASK = 0x3f, /* 6 bits */
+	INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */
+
+	INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */
+};
+
+static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
+static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
+
+struct bpf_jmp_history_entry {
 	u32 idx;
+	/* insn idx can't be bigger than 1 million */
+	u32 prev_idx : 22;
+	/* special flags, e.g., whether insn is doing register stack spill/load */
+	u32 flags : 10;
 };
 
-#define MAX_CALL_FRAMES 8
 /* Maximum number of register states that can exist at once */
 #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES)
 struct bpf_verifier_state {
@@ -413,7 +435,7 @@ struct bpf_verifier_state {
 	 * For most states jmp_history_cnt is [0-3].
 	 * For loops can go up to ~40.
 	 */
-	struct bpf_idx_pair *jmp_history;
+	struct bpf_jmp_history_entry *jmp_history;
 	u32 jmp_history_cnt;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
@@ -656,6 +678,7 @@ struct bpf_verifier_env {
 		int cur_stack;
 	} cfg;
 	struct backtrack_state bt;
+	struct bpf_jmp_history_entry *cur_hist_ent;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1ed39665f802..9bc16dc66465 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1355,8 +1355,8 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	int i, err;
 
 	dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
-					    src->jmp_history_cnt, sizeof(struct bpf_idx_pair),
-					    GFP_USER);
+					  src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+					  GFP_USER);
 	if (!dst_state->jmp_history)
 		return -ENOMEM;
 	dst_state->jmp_history_cnt = src->jmp_history_cnt;
@@ -3221,6 +3221,21 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	return __check_reg_arg(env, state->regs, regno, t);
 }
 
+static int insn_stack_access_flags(int frameno, int spi)
+{
+	return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno;
+}
+
+static int insn_stack_access_spi(int insn_flags)
+{
+	return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK;
+}
+
+static int insn_stack_access_frameno(int insn_flags)
+{
+	return insn_flags & INSN_F_FRAMENO_MASK;
+}
+
 static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
 {
 	env->insn_aux_data[idx].jmp_point = true;
@@ -3232,28 +3247,51 @@ static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
 }
 
 /* for any branch, call, exit record the history of jmps in the given state */
-static int push_jmp_history(struct bpf_verifier_env *env,
-			    struct bpf_verifier_state *cur)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+			    int insn_flags)
 {
 	u32 cnt = cur->jmp_history_cnt;
-	struct bpf_idx_pair *p;
+	struct bpf_jmp_history_entry *p;
 	size_t alloc_size;
 
-	if (!is_jmp_point(env, env->insn_idx))
+	/* combine instruction flags if we already recorded this instruction */
+	if (env->cur_hist_ent) {
+		/* atomic instructions push insn_flags twice, for READ and
+		 * WRITE sides, but they should agree on stack slot
+		 */
+		WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
+			  (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+			  "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
+			  env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+		env->cur_hist_ent->flags |= insn_flags;
 		return 0;
+	}
 
 	cnt++;
 	alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
 	p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
 	if (!p)
 		return -ENOMEM;
-	p[cnt - 1].idx = env->insn_idx;
-	p[cnt - 1].prev_idx = env->prev_insn_idx;
 	cur->jmp_history = p;
+
+	p = &cur->jmp_history[cnt - 1];
+	p->idx = env->insn_idx;
+	p->prev_idx = env->prev_insn_idx;
+	p->flags = insn_flags;
 	cur->jmp_history_cnt = cnt;
+	env->cur_hist_ent = p;
+
 	return 0;
 }
 
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+						        u32 hist_end, int insn_idx)
+{
+	if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+		return &st->jmp_history[hist_end - 1];
+	return NULL;
+}
+
 /* Backtrack one insn at a time. If idx is not at the top of recorded
  * history then previous instruction came from straight line execution.
  * Return -ENOENT if we exhausted all instructions within given state.
@@ -3415,9 +3453,14 @@ static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg)
 	return bt->reg_masks[bt->frame] & (1 << reg);
 }
 
+static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot)
+{
+	return bt->stack_masks[frame] & (1ull << slot);
+}
+
 static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
 {
-	return bt->stack_masks[bt->frame] & (1ull << slot);
+	return bt_is_frame_slot_set(bt, bt->frame, slot);
 }
 
 /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
@@ -3471,7 +3514,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
  *   - *was* processed previously during backtracking.
  */
 static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
-			  struct backtrack_state *bt)
+			  struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
 {
 	const struct bpf_insn_cbs cbs = {
 		.cb_call	= disasm_kfunc_name,
@@ -3484,7 +3527,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 	u8 mode = BPF_MODE(insn->code);
 	u32 dreg = insn->dst_reg;
 	u32 sreg = insn->src_reg;
-	u32 spi, i;
+	u32 spi, i, fr;
 
 	if (insn->code == 0)
 		return 0;
@@ -3545,20 +3588,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 		 * by 'precise' mark in corresponding register of this state.
 		 * No further tracking necessary.
 		 */
-		if (insn->src_reg != BPF_REG_FP)
+		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
 			return 0;
-
 		/* dreg = *(u64 *)[fp - off] was a fill from the stack.
 		 * that [fp - off] slot contains scalar that needs to be
 		 * tracked with precision
 		 */
-		spi = (-insn->off - 1) / BPF_REG_SIZE;
-		if (spi >= 64) {
-			verbose(env, "BUG spi %d\n", spi);
-			WARN_ONCE(1, "verifier backtracking bug");
-			return -EFAULT;
-		}
-		bt_set_slot(bt, spi);
+		spi = insn_stack_access_spi(hist->flags);
+		fr = insn_stack_access_frameno(hist->flags);
+		bt_set_frame_slot(bt, fr, spi);
 	} else if (class == BPF_STX || class == BPF_ST) {
 		if (bt_is_reg_set(bt, dreg))
 			/* stx & st shouldn't be using _scalar_ dst_reg
@@ -3567,17 +3605,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 			 */
 			return -ENOTSUPP;
 		/* scalars can only be spilled into stack */
-		if (insn->dst_reg != BPF_REG_FP)
+		if (!hist || !(hist->flags & INSN_F_STACK_ACCESS))
 			return 0;
-		spi = (-insn->off - 1) / BPF_REG_SIZE;
-		if (spi >= 64) {
-			verbose(env, "BUG spi %d\n", spi);
-			WARN_ONCE(1, "verifier backtracking bug");
-			return -EFAULT;
-		}
-		if (!bt_is_slot_set(bt, spi))
+		spi = insn_stack_access_spi(hist->flags);
+		fr = insn_stack_access_frameno(hist->flags);
+		if (!bt_is_frame_slot_set(bt, fr, spi))
 			return 0;
-		bt_clear_slot(bt, spi);
+		bt_clear_frame_slot(bt, fr, spi);
 		if (class == BPF_STX)
 			bt_set_reg(bt, sreg);
 	} else if (class == BPF_JMP || class == BPF_JMP32) {
@@ -3621,10 +3655,14 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 					WARN_ONCE(1, "verifier backtracking bug");
 					return -EFAULT;
 				}
-				/* we don't track register spills perfectly,
-				 * so fallback to force-precise instead of failing */
-				if (bt_stack_mask(bt) != 0)
-					return -ENOTSUPP;
+				/* we are now tracking register spills correctly,
+				 * so any instance of leftover slots is a bug
+				 */
+				if (bt_stack_mask(bt) != 0) {
+					verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+					WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+					return -EFAULT;
+				}
 				/* propagate r1-r5 to the caller */
 				for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
 					if (bt_is_reg_set(bt, i)) {
@@ -3649,8 +3687,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
 				WARN_ONCE(1, "verifier backtracking bug");
 				return -EFAULT;
 			}
-			if (bt_stack_mask(bt) != 0)
-				return -ENOTSUPP;
+			if (bt_stack_mask(bt) != 0) {
+				verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
+				WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+				return -EFAULT;
+			}
 			/* clear r1-r5 in callback subprog's mask */
 			for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 				bt_clear_reg(bt, i);
@@ -4087,6 +4128,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 	for (;;) {
 		DECLARE_BITMAP(mask, 64);
 		u32 history = st->jmp_history_cnt;
+		struct bpf_jmp_history_entry *hist;
 
 		if (env->log.level & BPF_LOG_LEVEL2) {
 			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4150,7 +4192,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				err = 0;
 				skip_first = false;
 			} else {
-				err = backtrack_insn(env, i, subseq_idx, bt);
+				hist = get_jmp_hist_entry(st, history, i);
+				err = backtrack_insn(env, i, subseq_idx, hist, bt);
 			}
 			if (err == -ENOTSUPP) {
 				mark_all_scalars_precise(env, env->cur_state);
@@ -4203,22 +4246,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 			bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
 			for_each_set_bit(i, mask, 64) {
 				if (i >= func->allocated_stack / BPF_REG_SIZE) {
-					/* the sequence of instructions:
-					 * 2: (bf) r3 = r10
-					 * 3: (7b) *(u64 *)(r3 -8) = r0
-					 * 4: (79) r4 = *(u64 *)(r10 -8)
-					 * doesn't contain jmps. It's backtracked
-					 * as a single block.
-					 * During backtracking insn 3 is not recognized as
-					 * stack access, so at the end of backtracking
-					 * stack slot fp-8 is still marked in stack_mask.
-					 * However the parent state may not have accessed
-					 * fp-8 and it's "unallocated" stack space.
-					 * In such case fallback to conservative.
-					 */
-					mark_all_scalars_precise(env, env->cur_state);
-					bt_reset(bt);
-					return 0;
+					verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
+						i, func->allocated_stack / BPF_REG_SIZE);
+					WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+					return -EFAULT;
 				}
 
 				if (!is_spilled_scalar_reg(&func->stack[i])) {
@@ -4391,7 +4422,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
 	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
 	struct bpf_reg_state *reg = NULL;
-	u32 dst_reg = insn->dst_reg;
+	int insn_flags = insn_stack_access_flags(state->frameno, spi);
 
 	err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
 	if (err)
@@ -4432,17 +4463,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
-		if (dst_reg != BPF_REG_FP) {
-			/* The backtracking logic can only recognize explicit
-			 * stack slot address like [fp - 8]. Other spill of
-			 * scalar via different register has to be conservative.
-			 * Backtrack from here and mark all registers as precise
-			 * that contributed into 'reg' being a constant.
-			 */
-			err = mark_chain_precision(env, value_regno);
-			if (err)
-				return err;
-		}
 		save_register_state(state, spi, reg, size);
 		/* Break the relation on a narrowing spill. */
 		if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
@@ -4454,6 +4474,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		__mark_reg_known(&fake_reg, insn->imm);
 		fake_reg.type = SCALAR_VALUE;
 		save_register_state(state, spi, &fake_reg, size);
+		insn_flags = 0; /* not a register spill */
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
@@ -4499,9 +4520,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		/* Mark slots affected by this stack write. */
 		for (i = 0; i < size; i++)
-			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
-				type;
+			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = type;
+		insn_flags = 0; /* not a register spill */
 	}
+
+	if (insn_flags)
+		return push_jmp_history(env, env->cur_state, insn_flags);
 	return 0;
 }
 
@@ -4694,6 +4718,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
 	struct bpf_reg_state *reg;
 	u8 *stype, type;
+	int insn_flags = insn_stack_access_flags(reg_state->frameno, spi);
 
 	stype = reg_state->stack[spi].slot_type;
 	reg = &reg_state->stack[spi].spilled_ptr;
@@ -4739,12 +4764,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 					return -EACCES;
 				}
 				mark_reg_unknown(env, state->regs, dst_regno);
+				insn_flags = 0; /* not restoring original register state */
 			}
 			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
-			return 0;
-		}
-
-		if (dst_regno >= 0) {
+		} else if (dst_regno >= 0) {
 			/* restore register state from stack */
 			copy_register_state(&state->regs[dst_regno], reg);
 			/* mark reg as written since spilled pointer state likely
@@ -4780,7 +4803,10 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 		mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
 		if (dst_regno >= 0)
 			mark_reg_stack_read(env, reg_state, off, off + size, dst_regno);
+		insn_flags = 0; /* we are not restoring spilled register */
 	}
+	if (insn_flags)
+		return push_jmp_history(env, env->cur_state, insn_flags);
 	return 0;
 }
 
@@ -6940,7 +6966,6 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 			       BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
 	if (err)
 		return err;
-
 	return 0;
 }
 
@@ -16910,7 +16935,8 @@ hit:
 			 * the precision needs to be propagated back in
 			 * the current state.
 			 */
-			err = err ? : push_jmp_history(env, cur);
+			if (is_jmp_point(env, env->insn_idx))
+				err = err ? : push_jmp_history(env, cur, 0);
 			err = err ? : propagate_precision(env, &sl->state);
 			if (err)
 				return err;
@@ -17135,6 +17161,9 @@ static int do_check(struct bpf_verifier_env *env)
 		u8 class;
 		int err;
 
+		/* reset current history entry on each new instruction */
+		env->cur_hist_ent = NULL;
+
 		env->prev_insn_idx = prev_insn_idx;
 		if (env->insn_idx >= insn_cnt) {
 			verbose(env, "invalid insn idx %d insn_cnt %d\n",
@@ -17174,7 +17203,7 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (is_jmp_point(env, env->insn_idx)) {
-			err = push_jmp_history(env, state);
+			err = push_jmp_history(env, state, 0);
 			if (err)
 				return err;
 		}
diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
index 0dfe3f8b69ac..eba98fab2f54 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c
@@ -589,11 +589,24 @@ static __u64 subprog_spill_reg_precise(void)
 
 SEC("?raw_tp")
 __success __log_level(2)
-/* precision backtracking can't currently handle stack access not through r10,
- * so we won't be able to mark stack slot fp-8 as precise, and so will
- * fallback to forcing all as precise
- */
-__msg("mark_precise: frame0: falling back to forcing all scalars precise")
+__msg("10: (0f) r1 += r7")
+__msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1")
+__msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8")
+__msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4")
+__msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)")
+__msg("mark_precise: frame0: parent state regs= stack=-8:  R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1")
+__msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7")
+__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit")
+__msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2")
+__msg("mark_precise: frame1: regs= stack= before 16: (79) r2 = *(u64 *)(r1 +0)")
+__msg("mark_precise: frame1: regs= stack= before 15: (79) r0 = *(u64 *)(r10 -16)")
+__msg("mark_precise: frame1: regs= stack= before 14: (7b) *(u64 *)(r10 -16) = r2")
+__msg("mark_precise: frame1: regs= stack= before 13: (7b) *(u64 *)(r1 +0) = r2")
+__msg("mark_precise: frame1: regs=r2 stack= before 6: (85) call pc+6")
+__msg("mark_precise: frame0: regs=r2 stack= before 5: (bf) r2 = r6")
+__msg("mark_precise: frame0: regs=r6 stack= before 4: (07) r1 += -8")
+__msg("mark_precise: frame0: regs=r6 stack= before 3: (bf) r1 = r10")
+__msg("mark_precise: frame0: regs=r6 stack= before 2: (b7) r6 = 1")
 __naked int subprog_spill_into_parent_stack_slot_precise(void)
 {
 	asm volatile (
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 0d84dd1f38b6..8a2ff81d8350 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -140,10 +140,11 @@
 	.result = REJECT,
 },
 {
-	"precise: ST insn causing spi > allocated_stack",
+	"precise: ST zero to stack insn is supported",
 	.insns = {
 	BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+	/* not a register spill, so we stop precision propagation for R4 here */
 	BPF_ST_MEM(BPF_DW, BPF_REG_3, -8, 0),
 	BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
 	BPF_MOV64_IMM(BPF_REG_0, -1),
@@ -157,11 +158,11 @@
 	mark_precise: frame0: last_idx 4 first_idx 2\
 	mark_precise: frame0: regs=r4 stack= before 4\
 	mark_precise: frame0: regs=r4 stack= before 3\
-	mark_precise: frame0: regs= stack=-8 before 2\
-	mark_precise: frame0: falling back to forcing all scalars precise\
-	force_precise: frame0: forcing r0 to be precise\
 	mark_precise: frame0: last_idx 5 first_idx 5\
-	mark_precise: frame0: parent state regs= stack=:",
+	mark_precise: frame0: parent state regs=r0 stack=:\
+	mark_precise: frame0: last_idx 4 first_idx 2\
+	mark_precise: frame0: regs=r0 stack= before 4\
+	5: R0=-1 R4=0",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
 },
@@ -169,6 +170,8 @@
 	"precise: STX insn causing spi > allocated_stack",
 	.insns = {
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+	/* make later reg spill more interesting by having somewhat known scalar */
+	BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xff),
 	BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
 	BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, -8),
@@ -179,18 +182,21 @@
 	},
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.flags = BPF_F_TEST_STATE_FREQ,
-	.errstr = "mark_precise: frame0: last_idx 6 first_idx 6\
+	.errstr = "mark_precise: frame0: last_idx 7 first_idx 7\
 	mark_precise: frame0: parent state regs=r4 stack=:\
-	mark_precise: frame0: last_idx 5 first_idx 3\
-	mark_precise: frame0: regs=r4 stack= before 5\
-	mark_precise: frame0: regs=r4 stack= before 4\
-	mark_precise: frame0: regs= stack=-8 before 3\
-	mark_precise: frame0: falling back to forcing all scalars precise\
-	force_precise: frame0: forcing r0 to be precise\
-	force_precise: frame0: forcing r0 to be precise\
-	force_precise: frame0: forcing r0 to be precise\
-	force_precise: frame0: forcing r0 to be precise\
-	mark_precise: frame0: last_idx 6 first_idx 6\
+	mark_precise: frame0: last_idx 6 first_idx 4\
+	mark_precise: frame0: regs=r4 stack= before 6: (b7) r0 = -1\
+	mark_precise: frame0: regs=r4 stack= before 5: (79) r4 = *(u64 *)(r10 -8)\
+	mark_precise: frame0: regs= stack=-8 before 4: (7b) *(u64 *)(r3 -8) = r0\
+	mark_precise: frame0: parent state regs=r0 stack=:\
+	mark_precise: frame0: last_idx 3 first_idx 3\
+	mark_precise: frame0: regs=r0 stack= before 3: (55) if r3 != 0x7b goto pc+0\
+	mark_precise: frame0: regs=r0 stack= before 2: (bf) r3 = r10\
+	mark_precise: frame0: regs=r0 stack= before 1: (57) r0 &= 255\
+	mark_precise: frame0: parent state regs=r0 stack=:\
+	mark_precise: frame0: last_idx 0 first_idx 0\
+	mark_precise: frame0: regs=r0 stack= before 0: (85) call bpf_get_prandom_u32#7\
+	mark_precise: frame0: last_idx 7 first_idx 7\
 	mark_precise: frame0: parent state regs= stack=:",
 	.result = VERBOSE_ACCEPT,
 	.retval = -1,
-- 
cgit v1.2.3


From ab125ed3ec1c10ccc36bc98c7a4256ad114a3dae Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:41 -0800
Subject: bpf: fix check for attempt to corrupt spilled pointer

When register is spilled onto a stack as a 1/2/4-byte register, we set
slot_type[BPF_REG_SIZE - 1] (plus potentially few more below it,
depending on actual spill size). So to check if some stack slot has
spilled register we need to consult slot_type[7], not slot_type[0].

To avoid the need to remember and double-check this in the future, just
use is_spilled_reg() helper.

Fixes: 27113c59b6d0 ("bpf: Check the other end of slot_type for STACK_SPILL")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9bc16dc66465..3edca06de9fd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4431,7 +4431,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	 * so it's aligned access and [off, off + size) are within stack limits
 	 */
 	if (!env->allow_ptr_leaks &&
-	    state->stack[spi].slot_type[0] == STACK_SPILL &&
+	    is_spilled_reg(&state->stack[spi]) &&
 	    size != BPF_REG_SIZE) {
 		verbose(env, "attempt to corrupt spilled pointer on stack\n");
 		return -EACCES;
-- 
cgit v1.2.3


From eaf18febd6ebc381aeb61543705148b3e28c7c47 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:42 -0800
Subject: bpf: preserve STACK_ZERO slots on partial reg spills

Instead of always forcing STACK_ZERO slots to STACK_MISC, preserve it in
situations where this is possible. E.g., when spilling register as
1/2/4-byte subslots on the stack, all the remaining bytes in the stack
slot do not automatically become unknown. If we knew they contained
zeroes, we can preserve those STACK_ZERO markers.

Add a helper mark_stack_slot_misc(), similar to scrub_spilled_slot(),
but that doesn't overwrite either STACK_INVALID nor STACK_ZERO. Note
that we need to take into account possibility of being in unprivileged
mode, in which case STACK_INVALID is forced to STACK_MISC for correctness,
as treating STACK_INVALID as equivalent STACK_MISC is only enabled in
privileged mode.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3edca06de9fd..93de39a6e36e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1144,6 +1144,21 @@ static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack)
 	       stack->spilled_ptr.type == SCALAR_VALUE;
 }
 
+/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which
+ * case they are equivalent, or it's STACK_ZERO, in which case we preserve
+ * more precise STACK_ZERO.
+ * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take
+ * env->allow_ptr_leaks into account and force STACK_MISC, if necessary.
+ */
+static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype)
+{
+	if (*stype == STACK_ZERO)
+		return;
+	if (env->allow_ptr_leaks && *stype == STACK_INVALID)
+		return;
+	*stype = STACK_MISC;
+}
+
 static void scrub_spilled_slot(u8 *stype)
 {
 	if (*stype != STACK_INVALID)
@@ -4386,7 +4401,8 @@ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_
 	dst->live = live;
 }
 
-static void save_register_state(struct bpf_func_state *state,
+static void save_register_state(struct bpf_verifier_env *env,
+				struct bpf_func_state *state,
 				int spi, struct bpf_reg_state *reg,
 				int size)
 {
@@ -4401,7 +4417,7 @@ static void save_register_state(struct bpf_func_state *state,
 
 	/* size < 8 bytes spill */
 	for (; i; i--)
-		scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);
+		mark_stack_slot_misc(env, &state->stack[spi].slot_type[i - 1]);
 }
 
 static bool is_bpf_st_mem(struct bpf_insn *insn)
@@ -4463,7 +4479,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	mark_stack_slot_scratched(env, spi);
 	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
-		save_register_state(state, spi, reg, size);
+		save_register_state(env, state, spi, reg, size);
 		/* Break the relation on a narrowing spill. */
 		if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
 			state->stack[spi].spilled_ptr.id = 0;
@@ -4473,7 +4489,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		__mark_reg_known(&fake_reg, insn->imm);
 		fake_reg.type = SCALAR_VALUE;
-		save_register_state(state, spi, &fake_reg, size);
+		save_register_state(env, state, spi, &fake_reg, size);
 		insn_flags = 0; /* not a register spill */
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
@@ -4486,7 +4502,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 			verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
 			return -EINVAL;
 		}
-		save_register_state(state, spi, reg, size);
+		save_register_state(env, state, spi, reg, size);
 	} else {
 		u8 type = STACK_MISC;
 
@@ -4757,6 +4773,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 						continue;
 					if (type == STACK_MISC)
 						continue;
+					if (type == STACK_ZERO)
+						continue;
 					if (type == STACK_INVALID && env->allow_uninit_stack)
 						continue;
 					verbose(env, "invalid read from stack off %d+%d size %d\n",
-- 
cgit v1.2.3


From e322f0bcb8d371f4606eaf141c7f967e1a79bcb7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:44 -0800
Subject: bpf: preserve constant zero when doing partial register restore

Similar to special handling of STACK_ZERO, when reading 1/2/4 bytes from
stack from slot that has register spilled into it and that register has
a constant value zero, preserve that zero and mark spilled register as
precise for that. This makes spilled const zero register and STACK_ZERO
cases equivalent in their behavior.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 93de39a6e36e..1ebe76c98451 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4767,22 +4767,39 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 				copy_register_state(&state->regs[dst_regno], reg);
 				state->regs[dst_regno].subreg_def = subreg_def;
 			} else {
+				int spill_cnt = 0, zero_cnt = 0;
+
 				for (i = 0; i < size; i++) {
 					type = stype[(slot - i) % BPF_REG_SIZE];
-					if (type == STACK_SPILL)
+					if (type == STACK_SPILL) {
+						spill_cnt++;
 						continue;
+					}
 					if (type == STACK_MISC)
 						continue;
-					if (type == STACK_ZERO)
+					if (type == STACK_ZERO) {
+						zero_cnt++;
 						continue;
+					}
 					if (type == STACK_INVALID && env->allow_uninit_stack)
 						continue;
 					verbose(env, "invalid read from stack off %d+%d size %d\n",
 						off, i, size);
 					return -EACCES;
 				}
-				mark_reg_unknown(env, state->regs, dst_regno);
-				insn_flags = 0; /* not restoring original register state */
+
+				if (spill_cnt == size &&
+				    tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
+					__mark_reg_const_zero(&state->regs[dst_regno]);
+					/* this IS register fill, so keep insn_flags */
+				} else if (zero_cnt == size) {
+					/* similarly to mark_reg_stack_read(), preserve zeroes */
+					__mark_reg_const_zero(&state->regs[dst_regno]);
+					insn_flags = 0; /* not restoring original register state */
+				} else {
+					mark_reg_unknown(env, state->regs, dst_regno);
+					insn_flags = 0; /* not restoring original register state */
+				}
 			}
 			state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
 		} else if (dst_regno >= 0) {
-- 
cgit v1.2.3


From 18a433b62061e3d787bfc3e670fa711fecbd7cb4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 5 Dec 2023 10:42:46 -0800
Subject: bpf: track aligned STACK_ZERO cases as imprecise spilled registers

Now that precision backtracing is supporting register spill/fill to/from
stack, there is another oportunity to be exploited here: minimizing
precise STACK_ZERO cases. With a simple code change we can rely on
initially imprecise register spill tracking for cases when register
spilled to stack was a known zero.

This is a very common case for initializing on the stack variables,
including rather large structures. Often times zero has no special
meaning for the subsequent BPF program logic and is often overwritten
with non-zero values soon afterwards. But due to STACK_ZERO vs
STACK_MISC tracking, such initial zero initialization actually causes
duplication of verifier states as STACK_ZERO is clearly different than
STACK_MISC or spilled SCALAR_VALUE register.

The effect of this (now) trivial change is huge, as can be seen below.
These are differences between BPF selftests, Cilium, and Meta-internal
BPF object files relative to previous patch in this series. You can see
improvements ranging from single-digit percentage improvement for
instructions and states, all the way to 50-60% reduction for some of
Meta-internal host agent programs, and even some Cilium programs.

For Meta-internal ones I left only the differences for largest BPF
object files by states/instructions, as there were too many differences
in the overall output. All the differences were improvements, reducting
number of states and thus instructions validated.

Note, Meta-internal BPF object file names are not printed below.
Many copies of balancer_ingress are actually many different
configurations of Katran, so they are different BPF programs, which
explains state reduction going from -16% all the way to 31%, depending
on BPF program logic complexity.

I also tooked a closer look at a few small-ish BPF programs to validate
the behavior. Let's take bpf_iter_netrlink.bpf.o (first row below).
While it's just 8 vs 5 states, verifier log is still pretty long to
include it here. But the reduction in states is due to the following
piece of C code:

        unsigned long ino;

	...

        sk = s->sk_socket;
        if (!sk) {
                ino = 0;
        } else {
                inode = SOCK_INODE(sk);
                bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
        }
        BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino);
	return 0;

You can see that in some situations `ino` is zero-initialized, while in
others it's unknown value filled out by bpf_probe_read_kernel(). Before
this change code after if/else branches have to be validated twice. Once
with (precise) ino == 0, due to eager STACK_ZERO logic, and then again
for when ino is just STACK_MISC. But BPF_SEQ_PRINTF() doesn't care about
precise value of ino, so with the change in this patch verifier is able
to prune states from after one of the branches, reducing number of total
states (and instructions) required for successful validation.

Similar principle applies to bigger real-world applications, just at
a much larger scale.

SELFTESTS
=========
File                                     Program                  Insns (A)  Insns (B)  Insns    (DIFF)  States (A)  States (B)  States (DIFF)
---------------------------------------  -----------------------  ---------  ---------  ---------------  ----------  ----------  -------------
bpf_iter_netlink.bpf.linked3.o           dump_netlink                   148        104    -44 (-29.73%)           8           5   -3 (-37.50%)
bpf_iter_unix.bpf.linked3.o              dump_unix                     8474       8404     -70 (-0.83%)         151         147    -4 (-2.65%)
bpf_loop.bpf.linked3.o                   stack_check                    560        324   -236 (-42.14%)          42          24  -18 (-42.86%)
local_storage_bench.bpf.linked3.o        get_local                      120         77    -43 (-35.83%)           9           6   -3 (-33.33%)
loop6.bpf.linked3.o                      trace_virtqueue_add_sgs      10167       9868    -299 (-2.94%)         226         206   -20 (-8.85%)
pyperf600_bpf_loop.bpf.linked3.o         on_event                      4872       3423  -1449 (-29.74%)         322         229  -93 (-28.88%)
strobemeta.bpf.linked3.o                 on_event                    180697     176036   -4661 (-2.58%)        4780        4734   -46 (-0.96%)
test_cls_redirect.bpf.linked3.o          cls_redirect                 65594      65401    -193 (-0.29%)        4230        4212   -18 (-0.43%)
test_global_func_args.bpf.linked3.o      test_cls                       145        136      -9 (-6.21%)          10           9   -1 (-10.00%)
test_l4lb.bpf.linked3.o                  balancer_ingress              4760       2612  -2148 (-45.13%)         113         102   -11 (-9.73%)
test_l4lb_noinline.bpf.linked3.o         balancer_ingress              4845       4877     +32 (+0.66%)         219         221    +2 (+0.91%)
test_l4lb_noinline_dynptr.bpf.linked3.o  balancer_ingress              2072       2087     +15 (+0.72%)          97          98    +1 (+1.03%)
test_seg6_loop.bpf.linked3.o             __add_egr_x                  12440       9975  -2465 (-19.82%)         364         353   -11 (-3.02%)
test_tcp_hdr_options.bpf.linked3.o       estab                         2558       2572     +14 (+0.55%)         179         180    +1 (+0.56%)
test_xdp_dynptr.bpf.linked3.o            _xdp_tx_iptunnel               645        596     -49 (-7.60%)          26          24    -2 (-7.69%)
test_xdp_noinline.bpf.linked3.o          balancer_ingress_v6           3520       3516      -4 (-0.11%)         216         216    +0 (+0.00%)
xdp_synproxy_kern.bpf.linked3.o          syncookie_tc                 82661      81241   -1420 (-1.72%)        5073        5155   +82 (+1.62%)
xdp_synproxy_kern.bpf.linked3.o          syncookie_xdp                84964      82297   -2667 (-3.14%)        5130        5157   +27 (+0.53%)

META-INTERNAL
=============
Program                                 Insns (A)  Insns (B)  Insns      (DIFF)  States (A)  States (B)  States   (DIFF)
--------------------------------------  ---------  ---------  -----------------  ----------  ----------  ---------------
balancer_ingress                            27925      23608    -4317 (-15.46%)        1488        1482      -6 (-0.40%)
balancer_ingress                            31824      27546    -4278 (-13.44%)        1658        1652      -6 (-0.36%)
balancer_ingress                            32213      27935    -4278 (-13.28%)        1689        1683      -6 (-0.36%)
balancer_ingress                            32213      27935    -4278 (-13.28%)        1689        1683      -6 (-0.36%)
balancer_ingress                            31824      27546    -4278 (-13.44%)        1658        1652      -6 (-0.36%)
balancer_ingress                            38647      29562    -9085 (-23.51%)        2069        1835   -234 (-11.31%)
balancer_ingress                            38647      29562    -9085 (-23.51%)        2069        1835   -234 (-11.31%)
balancer_ingress                            40339      30792    -9547 (-23.67%)        2193        1934   -259 (-11.81%)
balancer_ingress                            37321      29055    -8266 (-22.15%)        1972        1795    -177 (-8.98%)
balancer_ingress                            38176      29753    -8423 (-22.06%)        2008        1831    -177 (-8.81%)
balancer_ingress                            29193      20910    -8283 (-28.37%)        1599        1422   -177 (-11.07%)
balancer_ingress                            30013      21452    -8561 (-28.52%)        1645        1447   -198 (-12.04%)
balancer_ingress                            28691      24290    -4401 (-15.34%)        1545        1531     -14 (-0.91%)
balancer_ingress                            34223      28965    -5258 (-15.36%)        1984        1875    -109 (-5.49%)
balancer_ingress                            35481      26158    -9323 (-26.28%)        2095        1806   -289 (-13.79%)
balancer_ingress                            35481      26158    -9323 (-26.28%)        2095        1806   -289 (-13.79%)
balancer_ingress                            35868      26455    -9413 (-26.24%)        2140        1827   -313 (-14.63%)
balancer_ingress                            35868      26455    -9413 (-26.24%)        2140        1827   -313 (-14.63%)
balancer_ingress                            35481      26158    -9323 (-26.28%)        2095        1806   -289 (-13.79%)
balancer_ingress                            35481      26158    -9323 (-26.28%)        2095        1806   -289 (-13.79%)
balancer_ingress                            34844      29485    -5359 (-15.38%)        2036        1918    -118 (-5.80%)
fbflow_egress                                3256       2652     -604 (-18.55%)         218         192    -26 (-11.93%)
fbflow_ingress                               1026        944       -82 (-7.99%)          70          63     -7 (-10.00%)
sslwall_tc_egress                            8424       7360    -1064 (-12.63%)         498         458     -40 (-8.03%)
syar_accept_protect                         15040       9539    -5501 (-36.58%)         364         220   -144 (-39.56%)
syar_connect_tcp_v6                         15036       9535    -5501 (-36.59%)         360         216   -144 (-40.00%)
syar_connect_udp_v4                         15039       9538    -5501 (-36.58%)         361         217   -144 (-39.89%)
syar_connect_connect4_protect4              24805      15833    -8972 (-36.17%)         756         480   -276 (-36.51%)
syar_lsm_file_open                         167772     151813    -15959 (-9.51%)        1836        1667    -169 (-9.20%)
syar_namespace_create_new                   14805       9304    -5501 (-37.16%)         353         209   -144 (-40.79%)
syar_python3_detect                         17531      12030    -5501 (-31.38%)         391         247   -144 (-36.83%)
syar_ssh_post_fork                          16412      10911    -5501 (-33.52%)         405         261   -144 (-35.56%)
syar_enter_execve                           14728       9227    -5501 (-37.35%)         345         201   -144 (-41.74%)
syar_enter_execveat                         14728       9227    -5501 (-37.35%)         345         201   -144 (-41.74%)
syar_exit_execve                            16622      11121    -5501 (-33.09%)         376         232   -144 (-38.30%)
syar_exit_execveat                          16622      11121    -5501 (-33.09%)         376         232   -144 (-38.30%)
syar_syscalls_kill                          15288       9787    -5501 (-35.98%)         398         254   -144 (-36.18%)
syar_task_enter_pivot_root                  14898       9397    -5501 (-36.92%)         357         213   -144 (-40.34%)
syar_syscalls_setreuid                      16678      11177    -5501 (-32.98%)         429         285   -144 (-33.57%)
syar_syscalls_setuid                        16678      11177    -5501 (-32.98%)         429         285   -144 (-33.57%)
syar_syscalls_process_vm_readv              14959       9458    -5501 (-36.77%)         364         220   -144 (-39.56%)
syar_syscalls_process_vm_writev             15757      10256    -5501 (-34.91%)         390         246   -144 (-36.92%)
do_uprobe                                   15519      10018    -5501 (-35.45%)         373         229   -144 (-38.61%)
edgewall                                   179715      55783  -123932 (-68.96%)       12607        3999  -8608 (-68.28%)
bictcp_state                                 7570       4131    -3439 (-45.43%)         496         269   -227 (-45.77%)
cubictcp_state                               7570       4131    -3439 (-45.43%)         496         269   -227 (-45.77%)
tcp_rate_skb_delivered                        447        272     -175 (-39.15%)          29          18    -11 (-37.93%)
kprobe__bbr_set_state                        4566       2615    -1951 (-42.73%)         209         124    -85 (-40.67%)
kprobe__bictcp_state                         4566       2615    -1951 (-42.73%)         209         124    -85 (-40.67%)
inet_sock_set_state                          1501       1337     -164 (-10.93%)          93          85      -8 (-8.60%)
tcp_retransmit_skb                           1145        981     -164 (-14.32%)          67          59     -8 (-11.94%)
tcp_retransmit_synack                        1183        951     -232 (-19.61%)          67          55    -12 (-17.91%)
bpf_tcptuner                                 1459       1187     -272 (-18.64%)          99          80    -19 (-19.19%)
tw_egress                                     801        776       -25 (-3.12%)          69          66      -3 (-4.35%)
tw_ingress                                    795        770       -25 (-3.14%)          69          66      -3 (-4.35%)
ttls_tc_ingress                             19025      19383      +358 (+1.88%)         470         465      -5 (-1.06%)
ttls_nat_egress                               490        299     -191 (-38.98%)          33          20    -13 (-39.39%)
ttls_nat_ingress                              448        285     -163 (-36.38%)          32          21    -11 (-34.38%)
tw_twfw_egress                             511127     212071  -299056 (-58.51%)       16733        8504  -8229 (-49.18%)
tw_twfw_ingress                            500095     212069  -288026 (-57.59%)       16223        8504  -7719 (-47.58%)
tw_twfw_tc_eg                              511113     212064  -299049 (-58.51%)       16732        8504  -8228 (-49.18%)
tw_twfw_tc_in                              500095     212069  -288026 (-57.59%)       16223        8504  -7719 (-47.58%)
tw_twfw_egress                              12632      12435      -197 (-1.56%)         276         260     -16 (-5.80%)
tw_twfw_ingress                             12631      12454      -177 (-1.40%)         278         261     -17 (-6.12%)
tw_twfw_tc_eg                               12595      12435      -160 (-1.27%)         274         259     -15 (-5.47%)
tw_twfw_tc_in                               12631      12454      -177 (-1.40%)         278         261     -17 (-6.12%)
tw_xdp_dump                                   266        209      -57 (-21.43%)           9           8     -1 (-11.11%)

CILIUM
=========
File           Program                           Insns (A)  Insns (B)  Insns     (DIFF)  States (A)  States (B)  States  (DIFF)
-------------  --------------------------------  ---------  ---------  ----------------  ----------  ----------  --------------
bpf_host.o     cil_to_netdev                          6047       4578   -1469 (-24.29%)         362         249  -113 (-31.22%)
bpf_host.o     handle_lxc_traffic                     2227       1585    -642 (-28.83%)         156         103   -53 (-33.97%)
bpf_host.o     tail_handle_ipv4_from_netdev           2244       1458    -786 (-35.03%)         163         106   -57 (-34.97%)
bpf_host.o     tail_handle_nat_fwd_ipv4              21022      10479  -10543 (-50.15%)        1289         670  -619 (-48.02%)
bpf_host.o     tail_handle_nat_fwd_ipv6              15433      11375   -4058 (-26.29%)         905         643  -262 (-28.95%)
bpf_host.o     tail_ipv4_host_policy_ingress          2219       1367    -852 (-38.40%)         161          96   -65 (-40.37%)
bpf_host.o     tail_nodeport_nat_egress_ipv4         22460      19862   -2598 (-11.57%)        1469        1293  -176 (-11.98%)
bpf_host.o     tail_nodeport_nat_ingress_ipv4         5526       3534   -1992 (-36.05%)         366         243  -123 (-33.61%)
bpf_host.o     tail_nodeport_nat_ingress_ipv6         5132       4256    -876 (-17.07%)         241         219    -22 (-9.13%)
bpf_host.o     tail_nodeport_nat_ipv6_egress          3702       3542     -160 (-4.32%)         215         205    -10 (-4.65%)
bpf_lxc.o      tail_handle_nat_fwd_ipv4              21022      10479  -10543 (-50.15%)        1289         670  -619 (-48.02%)
bpf_lxc.o      tail_handle_nat_fwd_ipv6              15433      11375   -4058 (-26.29%)         905         643  -262 (-28.95%)
bpf_lxc.o      tail_ipv4_ct_egress                    5073       3374   -1699 (-33.49%)         262         172   -90 (-34.35%)
bpf_lxc.o      tail_ipv4_ct_ingress                   5093       3385   -1708 (-33.54%)         262         172   -90 (-34.35%)
bpf_lxc.o      tail_ipv4_ct_ingress_policy_only       5093       3385   -1708 (-33.54%)         262         172   -90 (-34.35%)
bpf_lxc.o      tail_ipv6_ct_egress                    4593       3878    -715 (-15.57%)         194         151   -43 (-22.16%)
bpf_lxc.o      tail_ipv6_ct_ingress                   4606       3891    -715 (-15.52%)         194         151   -43 (-22.16%)
bpf_lxc.o      tail_ipv6_ct_ingress_policy_only       4606       3891    -715 (-15.52%)         194         151   -43 (-22.16%)
bpf_lxc.o      tail_nodeport_nat_ingress_ipv4         5526       3534   -1992 (-36.05%)         366         243  -123 (-33.61%)
bpf_lxc.o      tail_nodeport_nat_ingress_ipv6         5132       4256    -876 (-17.07%)         241         219    -22 (-9.13%)
bpf_overlay.o  tail_handle_nat_fwd_ipv4              20524      10114  -10410 (-50.72%)        1271         638  -633 (-49.80%)
bpf_overlay.o  tail_nodeport_nat_egress_ipv4         22718      19490   -3228 (-14.21%)        1475        1275  -200 (-13.56%)
bpf_overlay.o  tail_nodeport_nat_ingress_ipv4         5526       3534   -1992 (-36.05%)         366         243  -123 (-33.61%)
bpf_overlay.o  tail_nodeport_nat_ingress_ipv6         5132       4256    -876 (-17.07%)         241         219    -22 (-9.13%)
bpf_overlay.o  tail_nodeport_nat_ipv6_egress          3638       3548      -90 (-2.47%)         209         203     -6 (-2.87%)
bpf_overlay.o  tail_rev_nodeport_lb4                  4368       3820    -548 (-12.55%)         248         215   -33 (-13.31%)
bpf_overlay.o  tail_rev_nodeport_lb6                  2867       2428    -439 (-15.31%)         167         140   -27 (-16.17%)
bpf_sock.o     cil_sock6_connect                      1718       1703      -15 (-0.87%)         100          99     -1 (-1.00%)
bpf_xdp.o      tail_handle_nat_fwd_ipv4              12917      12443     -474 (-3.67%)         875         849    -26 (-2.97%)
bpf_xdp.o      tail_handle_nat_fwd_ipv6              13515      13264     -251 (-1.86%)         715         702    -13 (-1.82%)
bpf_xdp.o      tail_lb_ipv4                          39492      36367    -3125 (-7.91%)        2430        2251   -179 (-7.37%)
bpf_xdp.o      tail_lb_ipv6                          80441      78058    -2383 (-2.96%)        3647        3523   -124 (-3.40%)
bpf_xdp.o      tail_nodeport_ipv6_dsr                 1038        901    -137 (-13.20%)          61          55     -6 (-9.84%)
bpf_xdp.o      tail_nodeport_nat_egress_ipv4         13027      12096     -931 (-7.15%)         868         809    -59 (-6.80%)
bpf_xdp.o      tail_nodeport_nat_ingress_ipv4         7617       5900   -1717 (-22.54%)         522         413  -109 (-20.88%)
bpf_xdp.o      tail_nodeport_nat_ingress_ipv6         7575       7395     -180 (-2.38%)         383         374     -9 (-2.35%)
bpf_xdp.o      tail_rev_nodeport_lb4                  6808       6739      -69 (-1.01%)         403         396     -7 (-1.74%)
bpf_xdp.o      tail_rev_nodeport_lb6                 16173      15847     -326 (-2.02%)        1010         990    -20 (-1.98%)

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231205184248.1502704-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1ebe76c98451..e5ce530641ba 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4477,8 +4477,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		return err;
 
 	mark_stack_slot_scratched(env, spi);
-	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
-	    !register_is_null(reg) && env->bpf_capable) {
+	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && env->bpf_capable) {
 		save_register_state(env, state, spi, reg, size);
 		/* Break the relation on a narrowing spill. */
 		if (fls64(reg->umax_value) > BITS_PER_BYTE * size)
@@ -4527,7 +4526,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		/* when we zero initialize stack slots mark them as such */
 		if ((reg && register_is_null(reg)) ||
 		    (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) {
-			/* backtracking doesn't work for STACK_ZERO yet. */
+			/* STACK_ZERO case happened because register spill
+			 * wasn't properly aligned at the stack slot boundary,
+			 * so it's not a register spill anymore; force
+			 * originating register to be precise to make
+			 * STACK_ZERO correct for subsequent states
+			 */
 			err = mark_chain_precision(env, value_regno);
 			if (err)
 				return err;
-- 
cgit v1.2.3


From 7be76461f302ec05cbd62b90b2a05c64299ca01f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Dec 2023 16:52:09 -0500
Subject: tracing: Always update snapshot buffer size

It use to be that only the top level instance had a snapshot buffer (for
latency tracers like wakeup and irqsoff). The update of the ring buffer
size would check if the instance was the top level and if so, it would
also update the snapshot buffer as it needs to be the same as the main
buffer.

Now that lower level instances also has a snapshot buffer, they too need
to update their snapshot buffer sizes when the main buffer is changed,
otherwise the following can be triggered:

 # cd /sys/kernel/tracing
 # echo 1500 > buffer_size_kb
 # mkdir instances/foo
 # echo irqsoff > instances/foo/current_tracer
 # echo 1000 > instances/foo/buffer_size_kb

Produces:

 WARNING: CPU: 2 PID: 856 at kernel/trace/trace.c:1938 update_max_tr_single.part.0+0x27d/0x320

Which is:

	ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);

	if (ret == -EBUSY) {
		[..]
	}

	WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);  <== here

That's because ring_buffer_swap_cpu() has:

	int ret = -EINVAL;

	[..]

	/* At least make sure the two buffers are somewhat the same */
	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
		goto out;

	[..]
 out:
	return ret;
 }

Instead, update all instances' snapshot buffer sizes when their main
buffer size is updated.

Link: https://lkml.kernel.org/r/20231205220010.454662151@goodmis.org

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9aebf904ff97..231c173ec04f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6392,8 +6392,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 		return ret;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
-	    !tr->current_trace->use_max_tr)
+	if (!tr->current_trace->use_max_tr)
 		goto out;
 
 	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
-- 
cgit v1.2.3


From d78ab792705c7be1b91243b2544d1a79406a2ad7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Dec 2023 16:52:10 -0500
Subject: tracing: Stop current tracer when resizing buffer

When the ring buffer is being resized, it can cause side effects to the
running tracer. For instance, there's a race with irqsoff tracer that
swaps individual per cpu buffers between the main buffer and the snapshot
buffer. The resize operation modifies the main buffer and then the
snapshot buffer. If a swap happens in between those two operations it will
break the tracer.

Simply stop the running tracer before resizing the buffers and enable it
again when finished.

Link: https://lkml.kernel.org/r/20231205220010.748996423@goodmis.org

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 3928a8a2d9808 ("ftrace: make work with new ring buffer")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 231c173ec04f..e978868b1a22 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6387,9 +6387,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	if (!tr->array_buffer.buffer)
 		return 0;
 
+	/* Do not allow tracing while resizng ring buffer */
+	tracing_stop_tr(tr);
+
 	ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
 	if (ret < 0)
-		return ret;
+		goto out_start;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 	if (!tr->current_trace->use_max_tr)
@@ -6417,7 +6420,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 			WARN_ON(1);
 			tracing_disabled = 1;
 		}
-		return ret;
+		goto out_start;
 	}
 
 	update_buffer_entries(&tr->max_buffer, cpu);
@@ -6426,7 +6429,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 #endif /* CONFIG_TRACER_MAX_TRACE */
 
 	update_buffer_entries(&tr->array_buffer, cpu);
-
+ out_start:
+	tracing_start_tr(tr);
 	return ret;
 }
 
-- 
cgit v1.2.3


From b538bf7d0ec11ca49f536dfda742a5f6db90a798 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 5 Dec 2023 16:52:11 -0500
Subject: tracing: Disable snapshot buffer when stopping instance tracers

It use to be that only the top level instance had a snapshot buffer (for
latency tracers like wakeup and irqsoff). When stopping a tracer in an
instance would not disable the snapshot buffer. This could have some
unintended consequences if the irqsoff tracer is enabled.

Consolidate the tracing_start/stop() with tracing_start/stop_tr() so that
all instances behave the same. The tracing_start/stop() functions will
just call their respective tracing_start/stop_tr() with the global_array
passed in.

Link: https://lkml.kernel.org/r/20231205220011.041220035@goodmis.org

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 6d9b3fa5e7f6 ("tracing: Move tracing_max_latency into trace_array")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 110 ++++++++++++++++-----------------------------------
 1 file changed, 34 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e978868b1a22..2492c6c76850 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2360,13 +2360,7 @@ int is_tracing_stopped(void)
 	return global_trace.stop_count;
 }
 
-/**
- * tracing_start - quick start of the tracer
- *
- * If tracing is enabled but was stopped by tracing_stop,
- * this will start the tracer back up.
- */
-void tracing_start(void)
+static void tracing_start_tr(struct trace_array *tr)
 {
 	struct trace_buffer *buffer;
 	unsigned long flags;
@@ -2374,119 +2368,83 @@ void tracing_start(void)
 	if (tracing_disabled)
 		return;
 
-	raw_spin_lock_irqsave(&global_trace.start_lock, flags);
-	if (--global_trace.stop_count) {
-		if (global_trace.stop_count < 0) {
+	raw_spin_lock_irqsave(&tr->start_lock, flags);
+	if (--tr->stop_count) {
+		if (WARN_ON_ONCE(tr->stop_count < 0)) {
 			/* Someone screwed up their debugging */
-			WARN_ON_ONCE(1);
-			global_trace.stop_count = 0;
+			tr->stop_count = 0;
 		}
 		goto out;
 	}
 
 	/* Prevent the buffers from switching */
-	arch_spin_lock(&global_trace.max_lock);
+	arch_spin_lock(&tr->max_lock);
 
-	buffer = global_trace.array_buffer.buffer;
+	buffer = tr->array_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_enable(buffer);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	buffer = global_trace.max_buffer.buffer;
+	buffer = tr->max_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_enable(buffer);
 #endif
 
-	arch_spin_unlock(&global_trace.max_lock);
-
- out:
-	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
-}
-
-static void tracing_start_tr(struct trace_array *tr)
-{
-	struct trace_buffer *buffer;
-	unsigned long flags;
-
-	if (tracing_disabled)
-		return;
-
-	/* If global, we need to also start the max tracer */
-	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
-		return tracing_start();
-
-	raw_spin_lock_irqsave(&tr->start_lock, flags);
-
-	if (--tr->stop_count) {
-		if (tr->stop_count < 0) {
-			/* Someone screwed up their debugging */
-			WARN_ON_ONCE(1);
-			tr->stop_count = 0;
-		}
-		goto out;
-	}
-
-	buffer = tr->array_buffer.buffer;
-	if (buffer)
-		ring_buffer_record_enable(buffer);
+	arch_spin_unlock(&tr->max_lock);
 
  out:
 	raw_spin_unlock_irqrestore(&tr->start_lock, flags);
 }
 
 /**
- * tracing_stop - quick stop of the tracer
+ * tracing_start - quick start of the tracer
  *
- * Light weight way to stop tracing. Use in conjunction with
- * tracing_start.
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
  */
-void tracing_stop(void)
+void tracing_start(void)
+
+{
+	return tracing_start_tr(&global_trace);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
 {
 	struct trace_buffer *buffer;
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&global_trace.start_lock, flags);
-	if (global_trace.stop_count++)
+	raw_spin_lock_irqsave(&tr->start_lock, flags);
+	if (tr->stop_count++)
 		goto out;
 
 	/* Prevent the buffers from switching */
-	arch_spin_lock(&global_trace.max_lock);
+	arch_spin_lock(&tr->max_lock);
 
-	buffer = global_trace.array_buffer.buffer;
+	buffer = tr->array_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_disable(buffer);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	buffer = global_trace.max_buffer.buffer;
+	buffer = tr->max_buffer.buffer;
 	if (buffer)
 		ring_buffer_record_disable(buffer);
 #endif
 
-	arch_spin_unlock(&global_trace.max_lock);
+	arch_spin_unlock(&tr->max_lock);
 
  out:
-	raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+	raw_spin_unlock_irqrestore(&tr->start_lock, flags);
 }
 
-static void tracing_stop_tr(struct trace_array *tr)
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
 {
-	struct trace_buffer *buffer;
-	unsigned long flags;
-
-	/* If global, we need to also stop the max tracer */
-	if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
-		return tracing_stop();
-
-	raw_spin_lock_irqsave(&tr->start_lock, flags);
-	if (tr->stop_count++)
-		goto out;
-
-	buffer = tr->array_buffer.buffer;
-	if (buffer)
-		ring_buffer_record_disable(buffer);
-
- out:
-	raw_spin_unlock_irqrestore(&tr->start_lock, flags);
+	return tracing_stop_tr(&global_trace);
 }
 
 static int trace_save_cmdline(struct task_struct *tsk)
-- 
cgit v1.2.3


From 7fed14f7ac9cf5e38c693836fe4a874720141845 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Tue, 5 Dec 2023 17:17:34 +0100
Subject: tracing: Fix incomplete locking when disabling buffered events

The following warning appears when using buffered events:

[  203.556451] WARNING: CPU: 53 PID: 10220 at kernel/trace/ring_buffer.c:3912 ring_buffer_discard_commit+0x2eb/0x420
[...]
[  203.670690] CPU: 53 PID: 10220 Comm: stress-ng-sysin Tainted: G            E      6.7.0-rc2-default #4 56e6d0fcf5581e6e51eaaecbdaec2a2338c80f3a
[  203.670704] Hardware name: Intel Corp. GROVEPORT/GROVEPORT, BIOS GVPRCRB1.86B.0016.D04.1705030402 05/03/2017
[  203.670709] RIP: 0010:ring_buffer_discard_commit+0x2eb/0x420
[  203.735721] Code: 4c 8b 4a 50 48 8b 42 48 49 39 c1 0f 84 b3 00 00 00 49 83 e8 01 75 b1 48 8b 42 10 f0 ff 40 08 0f 0b e9 fc fe ff ff f0 ff 47 08 <0f> 0b e9 77 fd ff ff 48 8b 42 10 f0 ff 40 08 0f 0b e9 f5 fe ff ff
[  203.735734] RSP: 0018:ffffb4ae4f7b7d80 EFLAGS: 00010202
[  203.735745] RAX: 0000000000000000 RBX: ffffb4ae4f7b7de0 RCX: ffff8ac10662c000
[  203.735754] RDX: ffff8ac0c750be00 RSI: ffff8ac10662c000 RDI: ffff8ac0c004d400
[  203.781832] RBP: ffff8ac0c039cea0 R08: 0000000000000000 R09: 0000000000000000
[  203.781839] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[  203.781842] R13: ffff8ac10662c000 R14: ffff8ac0c004d400 R15: ffff8ac10662c008
[  203.781846] FS:  00007f4cd8a67740(0000) GS:ffff8ad798880000(0000) knlGS:0000000000000000
[  203.781851] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  203.781855] CR2: 0000559766a74028 CR3: 00000001804c4000 CR4: 00000000001506f0
[  203.781862] Call Trace:
[  203.781870]  <TASK>
[  203.851949]  trace_event_buffer_commit+0x1ea/0x250
[  203.851967]  trace_event_raw_event_sys_enter+0x83/0xe0
[  203.851983]  syscall_trace_enter.isra.0+0x182/0x1a0
[  203.851990]  do_syscall_64+0x3a/0xe0
[  203.852075]  entry_SYSCALL_64_after_hwframe+0x6e/0x76
[  203.852090] RIP: 0033:0x7f4cd870fa77
[  203.982920] Code: 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 90 b8 89 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e9 43 0e 00 f7 d8 64 89 01 48
[  203.982932] RSP: 002b:00007fff99717dd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000089
[  203.982942] RAX: ffffffffffffffda RBX: 0000558ea1d7b6f0 RCX: 00007f4cd870fa77
[  203.982948] RDX: 0000000000000000 RSI: 00007fff99717de0 RDI: 0000558ea1d7b6f0
[  203.982957] RBP: 00007fff99717de0 R08: 00007fff997180e0 R09: 00007fff997180e0
[  203.982962] R10: 00007fff997180e0 R11: 0000000000000246 R12: 00007fff99717f40
[  204.049239] R13: 00007fff99718590 R14: 0000558e9f2127a8 R15: 00007fff997180b0
[  204.049256]  </TASK>

For instance, it can be triggered by running these two commands in
parallel:

 $ while true; do
    echo hist:key=id.syscall:val=hitcount > \
      /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger;
  done
 $ stress-ng --sysinfo $(nproc)

The warning indicates that the current ring_buffer_per_cpu is not in the
committing state. It happens because the active ring_buffer_event
doesn't actually come from the ring_buffer_per_cpu but is allocated from
trace_buffered_event.

The bug is in function trace_buffered_event_disable() where the
following normally happens:

* The code invokes disable_trace_buffered_event() via
  smp_call_function_many() and follows it by synchronize_rcu(). This
  increments the per-CPU variable trace_buffered_event_cnt on each
  target CPU and grants trace_buffered_event_disable() the exclusive
  access to the per-CPU variable trace_buffered_event.

* Maintenance is performed on trace_buffered_event, all per-CPU event
  buffers get freed.

* The code invokes enable_trace_buffered_event() via
  smp_call_function_many(). This decrements trace_buffered_event_cnt and
  releases the access to trace_buffered_event.

A problem is that smp_call_function_many() runs a given function on all
target CPUs except on the current one. The following can then occur:

* Task X executing trace_buffered_event_disable() runs on CPU 0.

* The control reaches synchronize_rcu() and the task gets rescheduled on
  another CPU 1.

* The RCU synchronization finishes. At this point,
  trace_buffered_event_disable() has the exclusive access to all
  trace_buffered_event variables except trace_buffered_event[CPU0]
  because trace_buffered_event_cnt[CPU0] is never incremented and if the
  buffer is currently unused, remains set to 0.

* A different task Y is scheduled on CPU 0 and hits a trace event. The
  code in trace_event_buffer_lock_reserve() sees that
  trace_buffered_event_cnt[CPU0] is set to 0 and decides the use the
  buffer provided by trace_buffered_event[CPU0].

* Task X continues its execution in trace_buffered_event_disable(). The
  code incorrectly frees the event buffer pointed by
  trace_buffered_event[CPU0] and resets the variable to NULL.

* Task Y writes event data to the now freed buffer and later detects the
  created inconsistency.

The issue is observable since commit dea499781a11 ("tracing: Fix warning
in trace_buffered_event_disable()") which moved the call of
trace_buffered_event_disable() in __ftrace_event_enable_disable()
earlier, prior to invoking call->class->reg(.. TRACE_REG_UNREGISTER ..).
The underlying problem in trace_buffered_event_disable() is however
present since the original implementation in commit 0fc1b09ff1ff
("tracing: Use temp buffer when filtering events").

Fix the problem by replacing the two smp_call_function_many() calls with
on_each_cpu_mask() which invokes a given callback on all CPUs.

Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/
Link: https://lkml.kernel.org/r/20231205161736.19663-2-petr.pavlu@suse.com

Cc: stable@vger.kernel.org
Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events")
Fixes: dea499781a11 ("tracing: Fix warning in trace_buffered_event_disable()")
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2492c6c76850..6aeffa4a6994 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2781,11 +2781,9 @@ void trace_buffered_event_disable(void)
 	if (--trace_buffered_event_ref)
 		return;
 
-	preempt_disable();
 	/* For each CPU, set the buffer as used. */
-	smp_call_function_many(tracing_buffer_mask,
-			       disable_trace_buffered_event, NULL, 1);
-	preempt_enable();
+	on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
+			 NULL, true);
 
 	/* Wait for all current users to finish */
 	synchronize_rcu();
@@ -2800,11 +2798,9 @@ void trace_buffered_event_disable(void)
 	 */
 	smp_wmb();
 
-	preempt_disable();
 	/* Do the work on each cpu */
-	smp_call_function_many(tracing_buffer_mask,
-			       enable_trace_buffered_event, NULL, 1);
-	preempt_enable();
+	on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
+			 true);
 }
 
 static struct trace_buffer *temp_buffer;
-- 
cgit v1.2.3


From 34209fe83ef8404353f91ab4ea4035dbc9922d04 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Tue, 5 Dec 2023 17:17:35 +0100
Subject: tracing: Fix a warning when allocating buffered events fails

Function trace_buffered_event_disable() produces an unexpected warning
when the previous call to trace_buffered_event_enable() fails to
allocate pages for buffered events.

The situation can occur as follows:

* The counter trace_buffered_event_ref is at 0.

* The soft mode gets enabled for some event and
  trace_buffered_event_enable() is called. The function increments
  trace_buffered_event_ref to 1 and starts allocating event pages.

* The allocation fails for some page and trace_buffered_event_disable()
  is called for cleanup.

* Function trace_buffered_event_disable() decrements
  trace_buffered_event_ref back to 0, recognizes that it was the last
  use of buffered events and frees all allocated pages.

* The control goes back to trace_buffered_event_enable() which returns.
  The caller of trace_buffered_event_enable() has no information that
  the function actually failed.

* Some time later, the soft mode is disabled for the same event.
  Function trace_buffered_event_disable() is called. It warns on
  "WARN_ON_ONCE(!trace_buffered_event_ref)" and returns.

Buffered events are just an optimization and can handle failures. Make
trace_buffered_event_enable() exit on the first failure and left any
cleanup later to when trace_buffered_event_disable() is called.

Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/
Link: https://lkml.kernel.org/r/20231205161736.19663-3-petr.pavlu@suse.com

Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events")
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6aeffa4a6994..ef72354f61ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2728,8 +2728,11 @@ void trace_buffered_event_enable(void)
 	for_each_tracing_cpu(cpu) {
 		page = alloc_pages_node(cpu_to_node(cpu),
 					GFP_KERNEL | __GFP_NORETRY, 0);
-		if (!page)
-			goto failed;
+		/* This is just an optimization and can handle failures */
+		if (!page) {
+			pr_err("Failed to allocate event buffer\n");
+			break;
+		}
 
 		event = page_address(page);
 		memset(event, 0, sizeof(*event));
@@ -2743,10 +2746,6 @@ void trace_buffered_event_enable(void)
 			WARN_ON_ONCE(1);
 		preempt_enable();
 	}
-
-	return;
- failed:
-	trace_buffered_event_disable();
 }
 
 static void enable_trace_buffered_event(void *data)
-- 
cgit v1.2.3


From c0591b1cccf708a47bc465c62436d669a4213323 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Tue, 5 Dec 2023 17:17:36 +0100
Subject: tracing: Fix a possible race when disabling buffered events

Function trace_buffered_event_disable() is responsible for freeing pages
backing buffered events and this process can run concurrently with
trace_event_buffer_lock_reserve().

The following race is currently possible:

* Function trace_buffered_event_disable() is called on CPU 0. It
  increments trace_buffered_event_cnt on each CPU and waits via
  synchronize_rcu() for each user of trace_buffered_event to complete.

* After synchronize_rcu() is finished, function
  trace_buffered_event_disable() has the exclusive access to
  trace_buffered_event. All counters trace_buffered_event_cnt are at 1
  and all pointers trace_buffered_event are still valid.

* At this point, on a different CPU 1, the execution reaches
  trace_event_buffer_lock_reserve(). The function calls
  preempt_disable_notrace() and only now enters an RCU read-side
  critical section. The function proceeds and reads a still valid
  pointer from trace_buffered_event[CPU1] into the local variable
  "entry". However, it doesn't yet read trace_buffered_event_cnt[CPU1]
  which happens later.

* Function trace_buffered_event_disable() continues. It frees
  trace_buffered_event[CPU1] and decrements
  trace_buffered_event_cnt[CPU1] back to 0.

* Function trace_event_buffer_lock_reserve() continues. It reads and
  increments trace_buffered_event_cnt[CPU1] from 0 to 1. This makes it
  believe that it can use the "entry" that it already obtained but the
  pointer is now invalid and any access results in a use-after-free.

Fix the problem by making a second synchronize_rcu() call after all
trace_buffered_event values are set to NULL. This waits on all potential
users in trace_event_buffer_lock_reserve() that still read a previous
pointer from trace_buffered_event.

Link: https://lore.kernel.org/all/20231127151248.7232-2-petr.pavlu@suse.com/
Link: https://lkml.kernel.org/r/20231205161736.19663-4-petr.pavlu@suse.com

Cc: stable@vger.kernel.org
Fixes: 0fc1b09ff1ff ("tracing: Use temp buffer when filtering events")
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ef72354f61ce..fbcd3bafb93e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2791,13 +2791,17 @@ void trace_buffered_event_disable(void)
 		free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
 		per_cpu(trace_buffered_event, cpu) = NULL;
 	}
+
 	/*
-	 * Make sure trace_buffered_event is NULL before clearing
-	 * trace_buffered_event_cnt.
+	 * Wait for all CPUs that potentially started checking if they can use
+	 * their event buffer only after the previous synchronize_rcu() call and
+	 * they still read a valid pointer from trace_buffered_event. It must be
+	 * ensured they don't see cleared trace_buffered_event_cnt else they
+	 * could wrongly decide to use the pointed-to buffer which is now freed.
 	 */
-	smp_wmb();
+	synchronize_rcu();
 
-	/* Do the work on each cpu */
+	/* For each CPU, relinquish the buffer */
 	on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
 			 true);
 }
-- 
cgit v1.2.3


From 909fa05dd3c181e5b403912889057f7cdbf3906c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:13 -0800
Subject: bpf: align CAP_NET_ADMIN checks with bpf_capable() approach

Within BPF syscall handling code CAP_NET_ADMIN checks stand out a bit
compared to CAP_BPF and CAP_PERFMON checks. For the latter, CAP_BPF or
CAP_PERFMON are checked first, but if they are not set, CAP_SYS_ADMIN
takes over and grants whatever part of BPF syscall is required.

Similar kind of checks that involve CAP_NET_ADMIN are not so consistent.
One out of four uses does follow CAP_BPF/CAP_PERFMON model: during
BPF_PROG_LOAD, if the type of BPF program is "network-related" either
CAP_NET_ADMIN or CAP_SYS_ADMIN is required to proceed.

But in three other cases CAP_NET_ADMIN is required even if CAP_SYS_ADMIN
is set:
  - when creating DEVMAP/XDKMAP/CPU_MAP maps;
  - when attaching CGROUP_SKB programs;
  - when handling BPF_PROG_QUERY command.

This patch is changing the latter three cases to follow BPF_PROG_LOAD
model, that is allowing to proceed under either CAP_NET_ADMIN or
CAP_SYS_ADMIN.

This also makes it cleaner in subsequent BPF token patches to switch
wholesomely to a generic bpf_token_capable(int cap) check, that always
falls back to CAP_SYS_ADMIN if requested capability is missing.

Cc: Jakub Kicinski <kuba@kernel.org>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ebaccf77d56e..ee33a52abf18 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1121,6 +1121,11 @@ free_map_tab:
 	return ret;
 }
 
+static bool bpf_net_capable(void)
+{
+	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD map_extra
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -1224,7 +1229,7 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
 	case BPF_MAP_TYPE_XSKMAP:
-		if (!capable(CAP_NET_ADMIN))
+		if (!bpf_net_capable())
 			return -EPERM;
 		break;
 	default:
@@ -2625,7 +2630,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	    !bpf_capable())
 		return -EPERM;
 
-	if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
+	if (is_net_admin_prog_type(type) && !bpf_net_capable())
 		return -EPERM;
 	if (is_perfmon_prog_type(type) && !perfmon_capable())
 		return -EPERM;
@@ -3777,7 +3782,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!capable(CAP_NET_ADMIN))
+		if (!bpf_net_capable())
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
 			 */
@@ -3980,7 +3985,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	if (!capable(CAP_NET_ADMIN))
+	if (!bpf_net_capable())
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_QUERY))
 		return -EINVAL;
-- 
cgit v1.2.3


From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:14 -0800
Subject: bpf: add BPF token delegation mount options to BPF FS

Add few new mount options to BPF FS that allow to specify that a given
BPF FS instance allows creation of BPF token (added in the next patch),
and what sort of operations are allowed under BPF token. As such, we get
4 new mount options, each is a bit mask
  - `delegate_cmds` allow to specify which bpf() syscall commands are
    allowed with BPF token derived from this BPF FS instance;
  - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies
    a set of allowable BPF map types that could be created with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies
    a set of allowable BPF program types that could be loaded with BPF token;
  - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies
    a set of allowable BPF program attach types that could be loaded with
    BPF token; delegate_progs and delegate_attachs are meant to be used
    together, as full BPF program type is, in general, determined
    through both program type and program attach type.

Currently, these mount options accept the following forms of values:
  - a special value "any", that enables all possible values of a given
  bit set;
  - numeric value (decimal or hexadecimal, determined by kernel
  automatically) that specifies a bit mask value directly;
  - all the values for a given mount option are combined, if specified
  multiple times. E.g., `mount -t bpf nodev /path/to/mount -o
  delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3
  mask.

Ideally, more convenient (for humans) symbolic form derived from
corresponding UAPI enums would be accepted (e.g., `-o
delegate_progs=kprobe|tracepoint`) and I intend to implement this, but
it requires a bunch of UAPI header churn, so I postponed it until this
feature lands upstream or at least there is a definite consensus that
this feature is acceptable and is going to make it, just to minimize
amount of wasted effort and not increase amount of non-essential code to
be reviewed.

Attentive reader will notice that BPF FS is now marked as
FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init
user namespace as long as the process has sufficient *namespaced*
capabilities within that user namespace. But in reality we still
restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in
init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added
to allow creating BPF FS context object (i.e., fsopen("bpf")) from
inside unprivileged process inside non-init userns, to capture that
userns as the owning userns. It will still be required to pass this
context object back to privileged process to instantiate and mount it.

This manipulation is important, because capturing non-init userns as the
owning userns of BPF FS instance (super block) allows to use that userns
to constraint BPF token to that userns later on (see next patch). So
creating BPF FS with delegation inside unprivileged userns will restrict
derived BPF token objects to only "work" inside that intended userns,
making it scoped to a intended "container". Also, setting these
delegation options requires capable(CAP_SYS_ADMIN), so unprivileged
process cannot set this up without involvement of a privileged process.

There is a set of selftests at the end of the patch set that simulates
this sequence of steps and validates that everything works as intended.
But careful review is requested to make sure there are no missed gaps in
the implementation and testing.

This somewhat subtle set of aspects is the result of previous
discussions ([0]) about various user namespace implications and
interactions with BPF token functionality and is necessary to contain
BPF token inside intended user namespace.

  [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/

Acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 10 ++++++
 kernel/bpf/inode.c  | 88 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 88 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 10e5e4d8a00f..d3c9acc593ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1581,6 +1581,16 @@ struct bpf_link_primer {
 	u32 id;
 };
 
+struct bpf_mount_opts {
+	umode_t mode;
+
+	/* BPF token-related delegation options */
+	u64 delegate_cmds;
+	u64 delegate_maps;
+	u64 delegate_progs;
+	u64 delegate_attachs;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1aafb2ff2e95..220fe0f99095 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,6 +20,7 @@
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/kstrtox.h>
 #include "preload/bpf_preload.h"
 
 enum bpf_type {
@@ -599,10 +600,31 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
  */
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
+	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
 	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
 
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
+
+	if (opts->delegate_cmds == ~0ULL)
+		seq_printf(m, ",delegate_cmds=any");
+	else if (opts->delegate_cmds)
+		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
+
+	if (opts->delegate_maps == ~0ULL)
+		seq_printf(m, ",delegate_maps=any");
+	else if (opts->delegate_maps)
+		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
+
+	if (opts->delegate_progs == ~0ULL)
+		seq_printf(m, ",delegate_progs=any");
+	else if (opts->delegate_progs)
+		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
+
+	if (opts->delegate_attachs == ~0ULL)
+		seq_printf(m, ",delegate_attachs=any");
+	else if (opts->delegate_attachs)
+		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
 	return 0;
 }
 
@@ -626,22 +648,27 @@ static const struct super_operations bpf_super_ops = {
 
 enum {
 	OPT_MODE,
+	OPT_DELEGATE_CMDS,
+	OPT_DELEGATE_MAPS,
+	OPT_DELEGATE_PROGS,
+	OPT_DELEGATE_ATTACHS,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
 	fsparam_u32oct	("mode",			OPT_MODE),
+	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
+	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
+	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
+	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
 	{}
 };
 
-struct bpf_mount_opts {
-	umode_t mode;
-};
-
 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = fc->s_fs_info;
 	struct fs_parse_result result;
-	int opt;
+	int opt, err;
+	u64 msk;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
 	if (opt < 0) {
@@ -665,6 +692,28 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
+	case OPT_DELEGATE_CMDS:
+	case OPT_DELEGATE_MAPS:
+	case OPT_DELEGATE_PROGS:
+	case OPT_DELEGATE_ATTACHS:
+		if (strcmp(param->string, "any") == 0) {
+			msk = ~0ULL;
+		} else {
+			err = kstrtou64(param->string, 0, &msk);
+			if (err)
+				return err;
+		}
+		/* Setting delegation mount options requires privileges */
+		if (msk && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		switch (opt) {
+		case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break;
+		case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break;
+		case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break;
+		case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break;
+		default: return -EINVAL;
+		}
+		break;
 	}
 
 	return 0;
@@ -739,10 +788,14 @@ out:
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts *opts = fc->fs_private;
+	struct bpf_mount_opts *opts = sb->s_fs_info;
 	struct inode *inode;
 	int ret;
 
+	/* Mounting an instance of BPF FS requires privileges */
+	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -764,7 +817,7 @@ static int bpf_get_tree(struct fs_context *fc)
 
 static void bpf_free_fc(struct fs_context *fc)
 {
-	kfree(fc->fs_private);
+	kfree(fc->s_fs_info);
 }
 
 static const struct fs_context_operations bpf_context_ops = {
@@ -786,17 +839,32 @@ static int bpf_init_fs_context(struct fs_context *fc)
 
 	opts->mode = S_IRWXUGO;
 
-	fc->fs_private = opts;
+	/* start out with no BPF token delegation enabled */
+	opts->delegate_cmds = 0;
+	opts->delegate_maps = 0;
+	opts->delegate_progs = 0;
+	opts->delegate_attachs = 0;
+
+	fc->s_fs_info = opts;
 	fc->ops = &bpf_context_ops;
 	return 0;
 }
 
+static void bpf_kill_super(struct super_block *sb)
+{
+	struct bpf_mount_opts *opts = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	kfree(opts);
+}
+
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
 	.init_fs_context = bpf_init_fs_context,
 	.parameters	= bpf_fs_parameters,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= bpf_kill_super,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 static int __init bpf_init(void)
-- 
cgit v1.2.3


From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:15 -0800
Subject: bpf: introduce BPF token object

Add new kind of BPF kernel object, BPF token. BPF token is meant to
allow delegating privileged BPF functionality, like loading a BPF
program or creating a BPF map, from privileged process to a *trusted*
unprivileged process, all while having a good amount of control over which
privileged operations could be performed using provided BPF token.

This is achieved through mounting BPF FS instance with extra delegation
mount options, which determine what operations are delegatable, and also
constraining it to the owning user namespace (as mentioned in the
previous patch).

BPF token itself is just a derivative from BPF FS and can be created
through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF
FS FD, which can be attained through open() API by opening BPF FS mount
point. Currently, BPF token "inherits" delegated command, map types,
prog type, and attach type bit sets from BPF FS as is. In the future,
having an BPF token as a separate object with its own FD, we can allow
to further restrict BPF token's allowable set of things either at the
creation time or after the fact, allowing the process to guard itself
further from unintentionally trying to load undesired kind of BPF
programs. But for now we keep things simple and just copy bit sets as is.

When BPF token is created from BPF FS mount, we take reference to the
BPF super block's owning user namespace, and then use that namespace for
checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN}
capabilities that are normally only checked against init userns (using
capable()), but now we check them using ns_capable() instead (if BPF
token is provided). See bpf_token_capable() for details.

Such setup means that BPF token in itself is not sufficient to grant BPF
functionality. User namespaced process has to *also* have necessary
combination of capabilities inside that user namespace. So while
previously CAP_BPF was useless when granted within user namespace, now
it gains a meaning and allows container managers and sys admins to have
a flexible control over which processes can and need to use BPF
functionality within the user namespace (i.e., container in practice).
And BPF FS delegation mount options and derived BPF tokens serve as
a per-container "flag" to grant overall ability to use bpf() (plus further
restrict on which parts of bpf() syscalls are treated as namespaced).

Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF)
within the BPF FS owning user namespace, rounding up the ns_capable()
story of BPF token.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  41 ++++++++
 include/uapi/linux/bpf.h       |  37 +++++++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/inode.c             |  12 ++-
 kernel/bpf/syscall.c           |  17 ++++
 kernel/bpf/token.c             | 214 +++++++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  37 +++++++
 7 files changed, 354 insertions(+), 6 deletions(-)
 create mode 100644 kernel/bpf/token.c

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3c9acc593ea..aa9cf8e5fab1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -51,6 +51,10 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
+struct bpf_token;
+struct user_namespace;
+struct super_block;
+struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1591,6 +1595,13 @@ struct bpf_mount_opts {
 	u64 delegate_attachs;
 };
 
+struct bpf_token {
+	struct work_struct work;
+	atomic64_t refcnt;
+	struct user_namespace *userns;
+	u64 allowed_cmds;
+};
+
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
+extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
+bool bpf_token_capable(const struct bpf_token *token, int cap);
+
 static inline bool bpf_allow_ptr_leaks(void)
 {
 	return perfmon_capable();
@@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
+void bpf_token_inc(struct bpf_token *token);
+void bpf_token_put(struct bpf_token *token);
+int bpf_token_create(union bpf_attr *attr);
+struct bpf_token *bpf_token_get_from_fd(u32 ufd);
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
+struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
+			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
+static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+static inline void bpf_token_inc(struct bpf_token *token)
+{
+}
+
+static inline void bpf_token_put(struct bpf_token *token)
+{
+}
+
+static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline void __dev_flush(void)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e88746ba7d21..d4a567e5bc3c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -847,6 +847,36 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *	Description
+ *		Create BPF token with embedded information about what
+ *		BPF-related functionality it allows:
+ *		- a set of allowed bpf() syscall commands;
+ *		- a set of allowed BPF map types to be created with
+ *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *		- a set of allowed BPF program types and BPF program attach
+ *		types to be loaded with BPF_PROG_LOAD command, if
+ *		BPF_PROG_LOAD itself is allowed.
+ *
+ *		BPF token is created (derived) from an instance of BPF FS,
+ *		assuming it has necessary delegation mount options specified.
+ *		This BPF token can be passed as an extra parameter to various
+ *		bpf() syscall commands to grant BPF subsystem functionality to
+ *		unprivileged processes.
+ *
+ *		When created, BPF token is "associated" with the owning
+ *		user namespace of BPF FS instance (super block) that it was
+ *		derived from, and subsequent BPF operations performed with
+ *		BPF token would be performing capabilities checks (i.e.,
+ *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *		that user namespace. Without BPF token, such capabilities
+ *		have to be granted in init user namespace, making bpf()
+ *		syscall incompatible with user namespace, for the most part.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -901,6 +931,8 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TOKEN_CREATE,
+	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -1712,6 +1744,11 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TOKEN_CREATE command */
+		__u32		flags;
+		__u32		bpffs_fd;
+	} token_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index f526b7573e97..4ce95acfcaa7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 220fe0f99095..6ce3f9696e72 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };
 
-static struct inode *bpf_get_inode(struct super_block *sb,
-				   const struct inode *dir,
-				   umode_t mode)
+struct inode *bpf_get_inode(struct super_block *sb,
+			    const struct inode *dir,
+			    umode_t mode)
 {
 	struct inode *inode;
 
@@ -602,11 +602,13 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
 	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+	u64 mask;
 
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 
-	if (opts->delegate_cmds == ~0ULL)
+	mask = (1ULL << __MAX_BPF_CMD) - 1;
+	if ((opts->delegate_cmds & mask) == mask)
 		seq_printf(m, ",delegate_cmds=any");
 	else if (opts->delegate_cmds)
 		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
@@ -639,7 +641,7 @@ static void bpf_free_inode(struct inode *inode)
 	free_inode_nonrcu(inode);
 }
 
-static const struct super_operations bpf_super_ops = {
+const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.show_options	= bpf_show_options,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ee33a52abf18..a156d549b356 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5377,6 +5377,20 @@ out_prog_put:
 	return ret;
 }
 
+#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
+
+static int token_create(union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_TOKEN_CREATE))
+		return -EINVAL;
+
+	/* no flags are supported yet */
+	if (attr->token_create.flags)
+		return -EINVAL;
+
+	return bpf_token_create(attr);
+}
+
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -5510,6 +5524,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_TOKEN_CREATE:
+		err = token_create(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
new file mode 100644
index 000000000000..e18aaecc67e9
--- /dev/null
+++ b/kernel/bpf/token.c
@@ -0,0 +1,214 @@
+#include <linux/bpf.h>
+#include <linux/vmalloc.h>
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/namei.h>
+#include <linux/user_namespace.h>
+
+bool bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	/* BPF token allows ns_capable() level of capabilities, but only if
+	 * token's userns is *exactly* the same as current user's userns
+	 */
+	if (token && current_user_ns() == token->userns) {
+		if (ns_capable(token->userns, cap))
+			return true;
+		if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))
+			return true;
+	}
+	/* otherwise fallback to capable() checks */
+	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
+}
+
+void bpf_token_inc(struct bpf_token *token)
+{
+	atomic64_inc(&token->refcnt);
+}
+
+static void bpf_token_free(struct bpf_token *token)
+{
+	put_user_ns(token->userns);
+	kvfree(token);
+}
+
+static void bpf_token_put_deferred(struct work_struct *work)
+{
+	struct bpf_token *token = container_of(work, struct bpf_token, work);
+
+	bpf_token_free(token);
+}
+
+void bpf_token_put(struct bpf_token *token)
+{
+	if (!token)
+		return;
+
+	if (!atomic64_dec_and_test(&token->refcnt))
+		return;
+
+	INIT_WORK(&token->work, bpf_token_put_deferred);
+	schedule_work(&token->work);
+}
+
+static int bpf_token_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_token *token = filp->private_data;
+
+	bpf_token_put(token);
+	return 0;
+}
+
+static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	struct bpf_token *token = filp->private_data;
+	u64 mask;
+
+	BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
+	mask = (1ULL << __MAX_BPF_CMD) - 1;
+	if ((token->allowed_cmds & mask) == mask)
+		seq_printf(m, "allowed_cmds:\tany\n");
+	else
+		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+}
+
+#define BPF_TOKEN_INODE_NAME "bpf-token"
+
+static const struct inode_operations bpf_token_iops = { };
+
+static const struct file_operations bpf_token_fops = {
+	.release	= bpf_token_release,
+	.show_fdinfo	= bpf_token_show_fdinfo,
+};
+
+int bpf_token_create(union bpf_attr *attr)
+{
+	struct bpf_mount_opts *mnt_opts;
+	struct bpf_token *token = NULL;
+	struct user_namespace *userns;
+	struct inode *inode;
+	struct file *file;
+	struct path path;
+	struct fd f;
+	umode_t mode;
+	int err, fd;
+
+	f = fdget(attr->token_create.bpffs_fd);
+	if (!f.file)
+		return -EBADF;
+
+	path = f.file->f_path;
+	path_get(&path);
+	fdput(f);
+
+	if (path.dentry != path.mnt->mnt_sb->s_root) {
+		err = -EINVAL;
+		goto out_path;
+	}
+	if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
+		err = -EINVAL;
+		goto out_path;
+	}
+	err = path_permission(&path, MAY_ACCESS);
+	if (err)
+		goto out_path;
+
+	userns = path.dentry->d_sb->s_user_ns;
+	/*
+	 * Enforce that creators of BPF tokens are in the same user
+	 * namespace as the BPF FS instance. This makes reasoning about
+	 * permissions a lot easier and we can always relax this later.
+	 */
+	if (current_user_ns() != userns) {
+		err = -EPERM;
+		goto out_path;
+	}
+	if (!ns_capable(userns, CAP_BPF)) {
+		err = -EPERM;
+		goto out_path;
+	}
+
+	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+	inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_path;
+	}
+
+	inode->i_op = &bpf_token_iops;
+	inode->i_fop = &bpf_token_fops;
+	clear_nlink(inode); /* make sure it is unlinked */
+
+	file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
+	if (IS_ERR(file)) {
+		iput(inode);
+		err = PTR_ERR(file);
+		goto out_path;
+	}
+
+	token = kvzalloc(sizeof(*token), GFP_USER);
+	if (!token) {
+		err = -ENOMEM;
+		goto out_file;
+	}
+
+	atomic64_set(&token->refcnt, 1);
+
+	/* remember bpffs owning userns for future ns_capable() checks */
+	token->userns = get_user_ns(userns);
+
+	mnt_opts = path.dentry->d_sb->s_fs_info;
+	token->allowed_cmds = mnt_opts->delegate_cmds;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0) {
+		err = fd;
+		goto out_token;
+	}
+
+	file->private_data = token;
+	fd_install(fd, file);
+
+	path_put(&path);
+	return fd;
+
+out_token:
+	bpf_token_free(token);
+out_file:
+	fput(file);
+out_path:
+	path_put(&path);
+	return err;
+}
+
+struct bpf_token *bpf_token_get_from_fd(u32 ufd)
+{
+	struct fd f = fdget(ufd);
+	struct bpf_token *token;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+	if (f.file->f_op != &bpf_token_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	token = f.file->private_data;
+	bpf_token_inc(token);
+	fdput(f);
+
+	return token;
+}
+
+bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	/* BPF token can be used only within exactly the same userns in which
+	 * it was created
+	 */
+	if (!token || current_user_ns() != token->userns)
+		return false;
+
+	return token->allowed_cmds & (1ULL << cmd);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e88746ba7d21..d4a567e5bc3c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -847,6 +847,36 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
+ * BPF_TOKEN_CREATE
+ *	Description
+ *		Create BPF token with embedded information about what
+ *		BPF-related functionality it allows:
+ *		- a set of allowed bpf() syscall commands;
+ *		- a set of allowed BPF map types to be created with
+ *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
+ *		- a set of allowed BPF program types and BPF program attach
+ *		types to be loaded with BPF_PROG_LOAD command, if
+ *		BPF_PROG_LOAD itself is allowed.
+ *
+ *		BPF token is created (derived) from an instance of BPF FS,
+ *		assuming it has necessary delegation mount options specified.
+ *		This BPF token can be passed as an extra parameter to various
+ *		bpf() syscall commands to grant BPF subsystem functionality to
+ *		unprivileged processes.
+ *
+ *		When created, BPF token is "associated" with the owning
+ *		user namespace of BPF FS instance (super block) that it was
+ *		derived from, and subsequent BPF operations performed with
+ *		BPF token would be performing capabilities checks (i.e.,
+ *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
+ *		that user namespace. Without BPF token, such capabilities
+ *		have to be granted in init user namespace, making bpf()
+ *		syscall incompatible with user namespace, for the most part.
+ *
+ *	Return
+ *		A new file descriptor (a nonnegative integer), or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -901,6 +931,8 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_TOKEN_CREATE,
+	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -1712,6 +1744,11 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
+	struct { /* struct used by BPF_TOKEN_CREATE command */
+		__u32		flags;
+		__u32		bpffs_fd;
+	} token_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
cgit v1.2.3


From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:16 -0800
Subject: bpf: add BPF token support to BPF_MAP_CREATE command

Allow providing token_fd for BPF_MAP_CREATE command to allow controlled
BPF map creation from unprivileged process through delegated BPF token.

Wire through a set of allowed BPF map types to BPF token, derived from
BPF FS at BPF token creation time. This, in combination with allowed_cmds
allows to create a narrowly-focused BPF token (controlled by privileged
agent) with a restrictive set of BPF maps that application can attempt
to create.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  2 +
 include/uapi/linux/bpf.h                           |  2 +
 kernel/bpf/inode.c                                 |  3 +-
 kernel/bpf/syscall.c                               | 52 ++++++++++++++++------
 kernel/bpf/token.c                                 | 16 +++++++
 tools/include/uapi/linux/bpf.h                     |  2 +
 .../selftests/bpf/prog_tests/libbpf_probes.c       |  2 +
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |  3 ++
 8 files changed, 67 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index aa9cf8e5fab1..e08e8436df38 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1600,6 +1600,7 @@ struct bpf_token {
 	atomic64_t refcnt;
 	struct user_namespace *userns;
 	u64 allowed_cmds;
+	u64 allowed_maps;
 };
 
 struct bpf_struct_ops_value;
@@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr);
 struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d4a567e5bc3c..0bba3392b17a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -983,6 +983,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1433,6 +1434,7 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
+		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 6ce3f9696e72..9c7865d1c53d 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -613,7 +613,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	else if (opts->delegate_cmds)
 		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
 
-	if (opts->delegate_maps == ~0ULL)
+	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+	if ((opts->delegate_maps & mask) == mask)
 		seq_printf(m, ",delegate_maps=any");
 	else if (opts->delegate_maps)
 		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a156d549b356..22e14124cd61 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1009,8 +1009,8 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(struct bpf_map *map, const struct btf *btf,
-			 u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
+			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
 	u32 key_size, value_size;
@@ -1038,7 +1038,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
 
-		if (!bpf_capable()) {
+		if (!bpf_token_capable(token, CAP_BPF)) {
 			ret = -EPERM;
 			goto free_map_tab;
 		}
@@ -1126,11 +1126,12 @@ static bool bpf_net_capable(void)
 	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_extra
+#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
+	struct bpf_token *token = NULL;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 map_type = attr->map_type;
 	struct bpf_map *map;
@@ -1181,14 +1182,32 @@ static int map_create(union bpf_attr *attr)
 	if (!ops->map_mem_usage)
 		return -EINVAL;
 
+	if (attr->map_token_fd) {
+		token = bpf_token_get_from_fd(attr->map_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+
+		/* if current token doesn't grant map creation permissions,
+		 * then we can't use this token, so ignore it and rely on
+		 * system-wide capabilities checks
+		 */
+		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
+		    !bpf_token_allow_map_type(token, attr->map_type)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	err = -EPERM;
+
 	/* Intent here is for unprivileged_bpf_disabled to block BPF map
 	 * creation for unprivileged users; other actions depend
 	 * on fd availability and access to bpffs, so are dependent on
 	 * object creation success. Even with unprivileged BPF disabled,
 	 * capability checks are still carried out.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
-		return -EPERM;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
+		goto put_token;
 
 	/* check privileged map type permissions */
 	switch (map_type) {
@@ -1221,25 +1240,27 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
-		if (!bpf_capable())
-			return -EPERM;
+		if (!bpf_token_capable(token, CAP_BPF))
+			goto put_token;
 		break;
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
 	case BPF_MAP_TYPE_XSKMAP:
-		if (!bpf_net_capable())
-			return -EPERM;
+		if (!bpf_token_capable(token, CAP_NET_ADMIN))
+			goto put_token;
 		break;
 	default:
 		WARN(1, "unsupported map type %d", map_type);
-		return -EPERM;
+		goto put_token;
 	}
 
 	map = ops->map_alloc(attr);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
+	if (IS_ERR(map)) {
+		err = PTR_ERR(map);
+		goto put_token;
+	}
 	map->ops = ops;
 	map->map_type = map_type;
 
@@ -1276,7 +1297,7 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 
 		if (attr->btf_value_type_id) {
-			err = map_check_btf(map, btf, attr->btf_key_type_id,
+			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
 					    attr->btf_value_type_id);
 			if (err)
 				goto free_map;
@@ -1297,6 +1318,7 @@ static int map_create(union bpf_attr *attr)
 		goto free_map_sec;
 
 	bpf_map_save_memcg(map);
+	bpf_token_put(token);
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -1317,6 +1339,8 @@ free_map_sec:
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
+put_token:
+	bpf_token_put(token);
 	return err;
 }
 
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index e18aaecc67e9..06c34dae658e 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -72,6 +72,13 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 		seq_printf(m, "allowed_cmds:\tany\n");
 	else
 		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
+
+	BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+	if ((token->allowed_maps & mask) == mask)
+		seq_printf(m, "allowed_maps:\tany\n");
+	else
+		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
 }
 
 #define BPF_TOKEN_INODE_NAME "bpf-token"
@@ -161,6 +168,7 @@ int bpf_token_create(union bpf_attr *attr)
 
 	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
+	token->allowed_maps = mnt_opts->delegate_maps;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
@@ -212,3 +220,11 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
 
 	return token->allowed_cmds & (1ULL << cmd);
 }
+
+bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
+{
+	if (!token || type >= __MAX_BPF_MAP_TYPE)
+		return false;
+
+	return token->allowed_maps & (1ULL << type);
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d4a567e5bc3c..0bba3392b17a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -983,6 +983,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
+	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1433,6 +1434,7 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
+		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 9f766ddd946a..573249a2814d 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -68,6 +68,8 @@ void test_libbpf_probe_map_types(void)
 
 		if (map_type == BPF_MAP_TYPE_UNSPEC)
 			continue;
+		if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0)
+			continue;
 
 		if (!test__start_subtest(map_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index c440ea3311ed..2a0633f43c73 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -132,6 +132,9 @@ static void test_libbpf_bpf_map_type_str(void)
 		const char *map_type_str;
 		char buf[256];
 
+		if (map_type == __MAX_BPF_MAP_TYPE)
+			continue;
+
 		map_type_name = btf__str_by_offset(btf, e->name_off);
 		map_type_str = libbpf_bpf_map_type_str(map_type);
 		ASSERT_OK_PTR(map_type_str, map_type_name);
-- 
cgit v1.2.3


From ee54b1a910e4d49c9a104f31ae3f5b979131adf8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:17 -0800
Subject: bpf: add BPF token support to BPF_BTF_LOAD command

Accept BPF token FD in BPF_BTF_LOAD command to allow BTF data loading
through delegated BPF token. BTF loading is a pretty straightforward
operation, so as long as BPF token is created with allow_cmds granting
BPF_BTF_LOAD command, kernel proceeds to parsing BTF data and creating
BTF object.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 20 ++++++++++++++++++--
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0bba3392b17a..9f9989e0d062 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1616,6 +1616,7 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
+		__u32		btf_token_fd;
 	};
 
 	struct {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 22e14124cd61..d87c5c27cde3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4777,15 +4777,31 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
+#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
 
 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
+	struct bpf_token *token = NULL;
+
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (!bpf_capable())
+	if (attr->btf_token_fd) {
+		token = bpf_token_get_from_fd(attr->btf_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+		if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	if (!bpf_token_capable(token, CAP_BPF)) {
+		bpf_token_put(token);
 		return -EPERM;
+	}
+
+	bpf_token_put(token);
 
 	return btf_new_fd(attr, uattr, uattr_size);
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0bba3392b17a..9f9989e0d062 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1616,6 +1616,7 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
+		__u32		btf_token_fd;
 	};
 
 	struct {
-- 
cgit v1.2.3


From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:18 -0800
Subject: bpf: add BPF token support to BPF_PROG_LOAD command

Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of
allowed BPF program types and attach types, derived from BPF FS at BPF
token creation time. Then make sure we perform bpf_token_capable()
checks everywhere where it's relevant.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  6 ++
 include/uapi/linux/bpf.h                           |  2 +
 kernel/bpf/core.c                                  |  1 +
 kernel/bpf/inode.c                                 |  6 +-
 kernel/bpf/syscall.c                               | 87 ++++++++++++++++------
 kernel/bpf/token.c                                 | 27 +++++++
 tools/include/uapi/linux/bpf.h                     |  2 +
 .../selftests/bpf/prog_tests/libbpf_probes.c       |  2 +
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |  3 +
 9 files changed, 110 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e08e8436df38..20af87b59d70 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1461,6 +1461,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1601,6 +1602,8 @@ struct bpf_token {
 	struct user_namespace *userns;
 	u64 allowed_cmds;
 	u64 allowed_maps;
+	u64 allowed_progs;
+	u64 allowed_attachs;
 };
 
 struct bpf_struct_ops_value;
@@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type);
 
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9f9989e0d062..4df2d025c784 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1028,6 +1028,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1504,6 +1505,7 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
+		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4b813da8d6c0..47085839af8d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2751,6 +2751,7 @@ void bpf_prog_free(struct bpf_prog *fp)
 
 	if (aux->dst_prog)
 		bpf_prog_put(aux->dst_prog);
+	bpf_token_put(aux->token);
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9c7865d1c53d..5359a0929c35 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -619,12 +619,14 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	else if (opts->delegate_maps)
 		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
 
-	if (opts->delegate_progs == ~0ULL)
+	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+	if ((opts->delegate_progs & mask) == mask)
 		seq_printf(m, ",delegate_progs=any");
 	else if (opts->delegate_progs)
 		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
 
-	if (opts->delegate_attachs == ~0ULL)
+	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+	if ((opts->delegate_attachs & mask) == mask)
 		seq_printf(m, ",delegate_attachs=any");
 	else if (opts->delegate_attachs)
 		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d87c5c27cde3..2c8393c21b8c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2608,13 +2608,15 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
+#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
 	struct btf *attach_btf = NULL;
+	struct bpf_token *token = NULL;
+	bool bpf_cap;
 	int err;
 	char license[128];
 
@@ -2631,10 +2633,31 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_REG_INVARIANTS))
 		return -EINVAL;
 
+	bpf_prog_load_fixup_attach_type(attr);
+
+	if (attr->prog_token_fd) {
+		token = bpf_token_get_from_fd(attr->prog_token_fd);
+		if (IS_ERR(token))
+			return PTR_ERR(token);
+		/* if current token doesn't grant prog loading permissions,
+		 * then we can't use this token, so ignore it and rely on
+		 * system-wide capabilities checks
+		 */
+		if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
+		    !bpf_token_allow_prog_type(token, attr->prog_type,
+					       attr->expected_attach_type)) {
+			bpf_token_put(token);
+			token = NULL;
+		}
+	}
+
+	bpf_cap = bpf_token_capable(token, CAP_BPF);
+	err = -EPERM;
+
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;
 
 	/* Intent here is for unprivileged_bpf_disabled to block BPF program
 	 * creation for unprivileged users; other actions depend
@@ -2643,21 +2666,23 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	 * capability checks are still carried out for these
 	 * and other operations.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
-		return -EPERM;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
+		goto put_token;
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
-		return -E2BIG;
+	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
+		err = -E2BIG;
+		goto put_token;
+	}
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !bpf_capable())
-		return -EPERM;
+	    !bpf_cap)
+		goto put_token;
 
-	if (is_net_admin_prog_type(type) && !bpf_net_capable())
-		return -EPERM;
-	if (is_perfmon_prog_type(type) && !perfmon_capable())
-		return -EPERM;
+	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
+		goto put_token;
+	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
+		goto put_token;
 
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
@@ -2667,27 +2692,33 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (IS_ERR(dst_prog)) {
 			dst_prog = NULL;
 			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
-			if (IS_ERR(attach_btf))
-				return -EINVAL;
+			if (IS_ERR(attach_btf)) {
+				err = -EINVAL;
+				goto put_token;
+			}
 			if (!btf_is_kernel(attach_btf)) {
 				/* attaching through specifying bpf_prog's BTF
 				 * objects directly might be supported eventually
 				 */
 				btf_put(attach_btf);
-				return -ENOTSUPP;
+				err = -ENOTSUPP;
+				goto put_token;
 			}
 		}
 	} else if (attr->attach_btf_id) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
-		if (IS_ERR(attach_btf))
-			return PTR_ERR(attach_btf);
-		if (!attach_btf)
-			return -EINVAL;
+		if (IS_ERR(attach_btf)) {
+			err = PTR_ERR(attach_btf);
+			goto put_token;
+		}
+		if (!attach_btf) {
+			err = -EINVAL;
+			goto put_token;
+		}
 		btf_get(attach_btf);
 	}
 
-	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
 				       dst_prog)) {
@@ -2695,7 +2726,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -EINVAL;
+		err = -EINVAL;
+		goto put_token;
 	}
 
 	/* plain bpf_prog allocation */
@@ -2705,7 +2737,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		return -ENOMEM;
+		err = -EINVAL;
+		goto put_token;
 	}
 
 	prog->expected_attach_type = attr->expected_attach_type;
@@ -2716,6 +2749,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
+	/* move token into prog->aux, reuse taken refcnt */
+	prog->aux->token = token;
+	token = NULL;
+
 	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog;
@@ -2817,6 +2854,8 @@ free_prog:
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
+put_token:
+	bpf_token_put(token);
 	return err;
 }
 
@@ -3806,7 +3845,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!bpf_net_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
 			 */
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 06c34dae658e..5a51e6b8f6bf 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -79,6 +79,20 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 		seq_printf(m, "allowed_maps:\tany\n");
 	else
 		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
+
+	BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+	if ((token->allowed_progs & mask) == mask)
+		seq_printf(m, "allowed_progs:\tany\n");
+	else
+		seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
+
+	BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
+	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+	if ((token->allowed_attachs & mask) == mask)
+		seq_printf(m, "allowed_attachs:\tany\n");
+	else
+		seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
 }
 
 #define BPF_TOKEN_INODE_NAME "bpf-token"
@@ -169,6 +183,8 @@ int bpf_token_create(union bpf_attr *attr)
 	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
 	token->allowed_maps = mnt_opts->delegate_maps;
+	token->allowed_progs = mnt_opts->delegate_progs;
+	token->allowed_attachs = mnt_opts->delegate_attachs;
 
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
@@ -228,3 +244,14 @@ bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type t
 
 	return token->allowed_maps & (1ULL << type);
 }
+
+bool bpf_token_allow_prog_type(const struct bpf_token *token,
+			       enum bpf_prog_type prog_type,
+			       enum bpf_attach_type attach_type)
+{
+	if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
+		return false;
+
+	return (token->allowed_progs & (1ULL << prog_type)) &&
+	       (token->allowed_attachs & (1ULL << attach_type));
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9f9989e0d062..4df2d025c784 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1028,6 +1028,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1504,6 +1505,7 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
+		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 573249a2814d..4ed46ed58a7b 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -30,6 +30,8 @@ void test_libbpf_probe_prog_types(void)
 
 		if (prog_type == BPF_PROG_TYPE_UNSPEC)
 			continue;
+		if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0)
+			continue;
 
 		if (!test__start_subtest(prog_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index 2a0633f43c73..384bc1f7a65e 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -189,6 +189,9 @@ static void test_libbpf_bpf_prog_type_str(void)
 		const char *prog_type_str;
 		char buf[256];
 
+		if (prog_type == __MAX_BPF_PROG_TYPE)
+			continue;
+
 		prog_type_name = btf__str_by_offset(btf, e->name_off);
 		prog_type_str = libbpf_bpf_prog_type_str(prog_type);
 		ASSERT_OK_PTR(prog_type_str, prog_type_name);
-- 
cgit v1.2.3


From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:19 -0800
Subject: bpf: take into account BPF token when fetching helper protos

Instead of performing unconditional system-wide bpf_capable() and
perfmon_capable() calls inside bpf_base_func_proto() function (and other
similar ones) to determine eligibility of a given BPF helper for a given
program, use previously recorded BPF token during BPF_PROG_LOAD command
handling to inform the decision.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/media/rc/bpf-lirc.c |  2 +-
 include/linux/bpf.h         |  5 +++--
 kernel/bpf/cgroup.c         |  6 +++---
 kernel/bpf/helpers.c        |  6 +++---
 kernel/bpf/syscall.c        |  5 +++--
 kernel/trace/bpf_trace.c    |  2 +-
 net/core/filter.c           | 32 ++++++++++++++++----------------
 net/ipv4/bpf_tcp_ca.c       |  2 +-
 net/netfilter/nf_bpf_link.c |  2 +-
 9 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index fe17c7f98e81..6d07693c6b9f 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_trace_printk:
-		if (perfmon_capable())
+		if (bpf_token_capable(prog->aux->token, CAP_PERFMON))
 			return bpf_get_trace_printk_proto();
 		fallthrough;
 	default:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 20af87b59d70..2a3ab4f3dd8c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
+						 const struct bpf_prog *prog);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	return NULL;
 }
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 491d20038cbe..98e0e3835b28 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index ee9bdf29246a..b3be5742d6f1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
 const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!bpf_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 		return NULL;
 
 	switch (func_id) {
@@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!perfmon_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
 		return NULL;
 
 	switch (func_id) {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c8393c21b8c..1cc03f08c9cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5712,7 +5712,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
 const struct bpf_func_proto * __weak
 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	return bpf_base_func_proto(func_id, prog);
 }
 
 BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5762,7 +5762,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
-		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
+		return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
+		       ? NULL : &bpf_sys_bpf_proto;
 	case BPF_FUNC_btf_find_by_name_kind:
 		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1648bde28f01..774cf476a892 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1626,7 +1626,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0adaa4afa35f..0bf2a03d8203 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,7 +87,7 @@
 #include "dev.h"
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id);
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
 {
@@ -7841,7 +7841,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -7934,7 +7934,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 			return NULL;
 		}
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -7953,7 +7953,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8140,7 +8140,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8199,7 +8199,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 
 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
@@ -8260,7 +8260,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_sock_proto;
 #endif /* CONFIG_INET */
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8302,7 +8302,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_cgroup_classid_curr_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8346,7 +8346,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_lookup_tcp_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8357,7 +8357,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_flow_dissector_load_bytes_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -8384,7 +8384,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11215,7 +11215,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11397,7 +11397,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id);
+		return bpf_sk_base_func_proto(func_id, prog);
 	}
 }
 
@@ -11731,7 +11731,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = {
 };
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id)
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	const struct bpf_func_proto *func;
 
@@ -11760,10 +11760,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 
-	if (!perfmon_capable())
+	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
 		return NULL;
 
 	return func;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 39dcccf0f174..c7bbd8f3c708 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -191,7 +191,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id);
+		return bpf_base_func_proto(func_id, prog);
 	}
 }
 
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index e502ec00b2fe..1969facac91c 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
 static const struct bpf_func_proto *
 bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	return bpf_base_func_proto(func_id, prog);
 }
 
 const struct bpf_verifier_ops netfilter_verifier_ops = {
-- 
cgit v1.2.3


From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:20 -0800
Subject: bpf: consistently use BPF token throughout BPF verifier logic

Remove remaining direct queries to perfmon_capable() and bpf_capable()
in BPF verifier logic and instead use BPF token (if available) to make
decisions about privileges.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    | 16 ++++++++--------
 include/linux/filter.h |  2 +-
 kernel/bpf/arraymap.c  |  2 +-
 kernel/bpf/core.c      |  2 +-
 kernel/bpf/verifier.c  | 13 ++++++-------
 net/core/filter.c      |  4 ++--
 6 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2a3ab4f3dd8c..435abad3cc61 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled;
 
 bool bpf_token_capable(const struct bpf_token *token, int cap);
 
-static inline bool bpf_allow_ptr_leaks(void)
+static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_allow_uninit_stack(void)
+static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
 {
-	return perfmon_capable();
+	return bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v1(void)
+static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
-static inline bool bpf_bypass_spec_v4(void)
+static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || perfmon_capable();
+	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a4953fafc8cb..14354605ad26 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_capable())
+	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
 		return false;
 
 	return true;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 4a4a67956e21..8d365bda9a8b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool bypass_spec_v1 = bpf_bypass_spec_v1();
+	bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
 	u64 array_size, mask64;
 	struct bpf_array *array;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 47085839af8d..ced511f44174 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -675,7 +675,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !bpf_capable())
+	    !bpf_token_capable(fp->aux->token, CAP_BPF))
 		return;
 
 	bpf_prog_ksym_set_addr(fp);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e5ce530641ba..45e85fb76d82 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20597,7 +20597,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
 	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
-	is_priv = bpf_capable();
+
+	env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
+	env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
+	env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
+	env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
+	env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
 
 	bpf_get_btf_vmlinux();
 
@@ -20629,12 +20634,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
 		env->strict_alignment = false;
 
-	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
-	env->allow_uninit_stack = bpf_allow_uninit_stack();
-	env->bypass_spec_v1 = bpf_bypass_spec_v1();
-	env->bypass_spec_v4 = bpf_bypass_spec_v4();
-	env->bpf_capable = bpf_capable();
-
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
diff --git a/net/core/filter.c b/net/core/filter.c
index 0bf2a03d8203..adcfc2c25754 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8559,7 +8559,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!bpf_capable())
+		if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 			return false;
 		break;
 	}
@@ -8571,7 +8571,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!bpf_capable())
+			if (!bpf_token_capable(prog->aux->token, CAP_BPF))
 				return false;
 			break;
 		default:
-- 
cgit v1.2.3


From c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:21 -0800
Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks

Based on upstream discussion ([0]), rework existing
bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead
of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF
program struct. Also, we pass bpf_attr union with all the user-provided
arguments for BPF_PROG_LOAD command.  This will give LSMs as much
information as we can basically provide.

The hook is also BPF token-aware now, and optional bpf_token struct is
passed as a third argument. bpf_prog_load LSM hook is called after
a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were
allocated and filled out, but right before performing full-fledged BPF
verification step.

bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for
consistency. SELinux code is adjusted to all new names, types, and
signatures.

Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be
used by some LSMs to allocate extra security blob, but also by other
LSMs to reject BPF program loading, we need to make sure that
bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one
*even* if the hook itself returned error. If we don't do that, we run
the risk of leaking memory. This seems to be possible today when
combining SELinux and BPF LSM, as one example, depending on their
relative ordering.

Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to
sleepable LSM hooks list, as they are both executed in sleepable
context. Also drop bpf_prog_load hook from untrusted, as there is no
issue with refcount or anything else anymore, that originally forced us
to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM
hook arguments as trusted"). We now trigger this hook much later and it
should not be an issue anymore.

  [0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      | 12 +++++++-----
 kernel/bpf/bpf_lsm.c          |  5 +++--
 kernel/bpf/syscall.c          | 25 +++++++++++++------------
 security/security.c           | 25 +++++++++++++++----------
 security/selinux/hooks.c      | 15 ++++++++-------
 6 files changed, 49 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index ff217a5ce552..41ec4a7c070e 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
 LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
 LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
+LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 1d1df326c881..65467eef6678 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_prog_aux;
+struct bpf_token;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
 extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
-extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				  struct bpf_token *token);
+extern void security_bpf_prog_free(struct bpf_prog *prog);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map)
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+					 struct bpf_token *token)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index e14c822f8911..3e956f6302f3 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -263,6 +263,8 @@ BTF_ID(func, bpf_lsm_bpf_map)
 BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
 BTF_ID(func, bpf_lsm_bpf_map_free_security)
 BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bpf_prog_load)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -346,8 +348,7 @@ BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf_map_free_security)
-BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
 #ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1cc03f08c9cd..7717c7c7b95d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2162,7 +2162,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
 	free_uid(aux->user);
-	security_bpf_prog_free(aux);
+	security_bpf_prog_free(aux->prog);
 	bpf_prog_free(aux->prog);
 }
 
@@ -2753,10 +2753,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->token = token;
 	token = NULL;
 
-	err = security_bpf_prog_alloc(prog->aux);
-	if (err)
-		goto free_prog;
-
 	prog->aux->user = get_current_user();
 	prog->len = attr->insn_cnt;
 
@@ -2764,12 +2760,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (copy_from_bpfptr(prog->insns,
 			     make_bpfptr(attr->insns, uattr.is_kernel),
 			     bpf_prog_insn_size(prog)) != 0)
-		goto free_prog_sec;
+		goto free_prog;
 	/* copy eBPF program license from user space */
 	if (strncpy_from_bpfptr(license,
 				make_bpfptr(attr->license, uattr.is_kernel),
 				sizeof(license) - 1) < 0)
-		goto free_prog_sec;
+		goto free_prog;
 	license[sizeof(license) - 1] = 0;
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2783,25 +2779,29 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (bpf_prog_is_dev_bound(prog->aux)) {
 		err = bpf_prog_dev_bound_init(prog, attr);
 		if (err)
-			goto free_prog_sec;
+			goto free_prog;
 	}
 
 	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
 	    bpf_prog_is_dev_bound(dst_prog->aux)) {
 		err = bpf_prog_dev_bound_inherit(prog, dst_prog);
 		if (err)
-			goto free_prog_sec;
+			goto free_prog;
 	}
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
-		goto free_prog_sec;
+		goto free_prog;
 
 	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
 			       sizeof(attr->prog_name));
 	if (err < 0)
+		goto free_prog;
+
+	err = security_bpf_prog_load(prog, attr, token);
+	if (err)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
@@ -2847,10 +2847,11 @@ free_used_maps:
 	 */
 	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
+
 free_prog_sec:
-	free_uid(prog->aux->user);
-	security_bpf_prog_free(prog->aux);
+	security_bpf_prog_free(prog);
 free_prog:
+	free_uid(prog->aux->user);
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
diff --git a/security/security.c b/security/security.c
index dcb3e7014f9b..c8a1c66cfaad 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5180,16 +5180,21 @@ int security_bpf_map_alloc(struct bpf_map *map)
 }
 
 /**
- * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
- * @aux: bpf program aux info struct
+ * security_bpf_prog_load() - Check if loading of BPF program is allowed
+ * @prog: BPF program object
+ * @attr: BPF syscall attributes used to create BPF program
+ * @token: BPF token used to grant user access to BPF subsystem
  *
- * Initialize the security field inside bpf program.
+ * Perform an access control check when the kernel loads a BPF program and
+ * allocates associated BPF program object. This hook is also responsible for
+ * allocating any required LSM state for the BPF program.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+			   struct bpf_token *token)
 {
-	return call_int_hook(bpf_prog_alloc_security, 0, aux);
+	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
 }
 
 /**
@@ -5204,14 +5209,14 @@ void security_bpf_map_free(struct bpf_map *map)
 }
 
 /**
- * security_bpf_prog_free() - Free a bpf program's LSM blob
- * @aux: bpf program aux info struct
+ * security_bpf_prog_free() - Free a BPF program's LSM blob
+ * @prog: BPF program struct
  *
- * Clean up the security information stored inside bpf prog.
+ * Clean up the security information stored inside BPF program.
  */
-void security_bpf_prog_free(struct bpf_prog_aux *aux)
+void security_bpf_prog_free(struct bpf_prog *prog)
 {
-	call_void_hook(bpf_prog_free_security, aux);
+	call_void_hook(bpf_prog_free, prog);
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..eabee39e983c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6805,7 +6805,8 @@ static void selinux_bpf_map_free(struct bpf_map *map)
 	kfree(bpfsec);
 }
 
-static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
+static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
+				 struct bpf_token *token)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6814,16 +6815,16 @@ static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
 		return -ENOMEM;
 
 	bpfsec->sid = current_sid();
-	aux->security = bpfsec;
+	prog->aux->security = bpfsec;
 
 	return 0;
 }
 
-static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
+static void selinux_bpf_prog_free(struct bpf_prog *prog)
 {
-	struct bpf_security_struct *bpfsec = aux->security;
+	struct bpf_security_struct *bpfsec = prog->aux->security;
 
-	aux->security = NULL;
+	prog->aux->security = NULL;
 	kfree(bpfsec);
 }
 #endif
@@ -7180,7 +7181,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
 	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
-	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
+	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7238,7 +7239,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 #endif
 #ifdef CONFIG_BPF_SYSCALL
 	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
-	LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
+	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
-- 
cgit v1.2.3


From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:22 -0800
Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks

Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc
hook into bpf_map_create, taking not just struct bpf_map, but also
bpf_attr and bpf_token, to give a fuller context to LSMs.

Unlike bpf_prog_alloc, there is no need to move the hook around, as it
currently is firing right before allocating BPF map ID and FD, which
seems to be a sweet spot.

But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free
LSM hook is called even if bpf_map_create hook returned error, as if few
LSMs are combined together it could be that one LSM successfully
allocated security blob for its needs, while subsequent LSM rejected BPF
map creation. The former LSM would still need to free up LSM blob, so we
need to ensure security_bpf_map_free() is called regardless of the
outcome.

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/lsm_hook_defs.h |  5 +++--
 include/linux/security.h      |  6 ++++--
 kernel/bpf/bpf_lsm.c          |  6 +++---
 kernel/bpf/syscall.c          |  4 ++--
 security/security.c           | 16 ++++++++++------
 security/selinux/hooks.c      |  7 ++++---
 6 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 41ec4a7c070e..adb25cc63ce3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
+	 struct bpf_token *token)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
diff --git a/include/linux/security.h b/include/linux/security.h
index 65467eef6678..08fd777cbe94 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2025,7 +2025,8 @@ struct bpf_token;
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_alloc(struct bpf_map *map);
+extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				   struct bpf_token *token);
 extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
@@ -2047,7 +2048,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_alloc(struct bpf_map *map)
+static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+					  struct bpf_token *token)
 {
 	return 0;
 }
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 3e956f6302f3..9e4e615f11eb 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,8 +260,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 BTF_SET_START(sleepable_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf)
 BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_create)
+BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog)
 BTF_ID(func, bpf_lsm_bpf_prog_load)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
@@ -347,7 +347,7 @@ BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7717c7c7b95d..aff045eed375 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1309,9 +1309,9 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-	err = security_bpf_map_alloc(map);
+	err = security_bpf_map_create(map, attr, token);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
diff --git a/security/security.c b/security/security.c
index c8a1c66cfaad..ad24cf36da94 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5167,16 +5167,20 @@ int security_bpf_prog(struct bpf_prog *prog)
 }
 
 /**
- * security_bpf_map_alloc() - Allocate a bpf map LSM blob
- * @map: bpf map
+ * security_bpf_map_create() - Check if BPF map creation is allowed
+ * @map: BPF map object
+ * @attr: BPF syscall attributes used to create BPF map
+ * @token: BPF token used to grant user access
  *
- * Initialize the security field inside bpf map.
+ * Do a check when the kernel creates a new BPF map. This is also the
+ * point where LSM blob is allocated for LSMs that need them.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_map_alloc(struct bpf_map *map)
+int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+			    struct bpf_token *token)
 {
-	return call_int_hook(bpf_map_alloc_security, 0, map);
+	return call_int_hook(bpf_map_create, 0, map, attr, token);
 }
 
 /**
@@ -5205,7 +5209,7 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
  */
 void security_bpf_map_free(struct bpf_map *map)
 {
-	call_void_hook(bpf_map_free_security, map);
+	call_void_hook(bpf_map_free, map);
 }
 
 /**
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index eabee39e983c..002351ab67b7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6783,7 +6783,8 @@ static int selinux_bpf_prog(struct bpf_prog *prog)
 			    BPF__PROG_RUN, NULL);
 }
 
-static int selinux_bpf_map_alloc(struct bpf_map *map)
+static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
+				  struct bpf_token *token)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -7180,7 +7181,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf, selinux_bpf),
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
-	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+	LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
 	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
 #endif
 
@@ -7238,7 +7239,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
 #endif
 #ifdef CONFIG_BPF_SYSCALL
-	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+	LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
 	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
 #endif
 #ifdef CONFIG_PERF_EVENTS
-- 
cgit v1.2.3


From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 30 Nov 2023 10:52:23 -0800
Subject: bpf,lsm: add BPF token LSM hooks

Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to
allocate LSM security blob (we add `void *security` field to struct
bpf_token for that), but also control who can instantiate BPF token.
This follows existing pattern for BPF map and BPF prog.

Also add security_bpf_token_allow_cmd() and security_bpf_token_capable()
LSM hooks that allow LSM implementation to control and negate (if
necessary) BPF token's delegation of a specific bpf_cmd and capability,
respectively.

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h           |  3 +++
 include/linux/lsm_hook_defs.h |  5 ++++
 include/linux/security.h      | 25 ++++++++++++++++++
 kernel/bpf/bpf_lsm.c          |  4 +++
 kernel/bpf/token.c            | 18 ++++++++-----
 security/security.c           | 60 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 435abad3cc61..7a483f6b6d5f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1604,6 +1604,9 @@ struct bpf_token {
 	u64 allowed_maps;
 	u64 allowed_progs;
 	u64 allowed_attachs;
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 struct bpf_struct_ops_value;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index adb25cc63ce3..3fdd00b452ac 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
 LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
 	 struct bpf_token *token)
 LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
+LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
+	 struct path *path)
+LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
+LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
+LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 08fd777cbe94..00809d2d5c38 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
+#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map);
 extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 				  struct bpf_token *token);
 extern void security_bpf_prog_free(struct bpf_prog *prog);
+extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path);
+extern void security_bpf_token_free(struct bpf_token *token);
+extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
+extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *
 
 static inline void security_bpf_prog_free(struct bpf_prog *prog)
 { }
+
+static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+				     struct path *path)
+{
+	return 0;
+}
+
+static inline void security_bpf_token_free(struct bpf_token *token)
+{ }
+
+static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return 0;
+}
+
+static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return 0;
+}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 9e4e615f11eb..7d2f96413a57 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -265,6 +265,10 @@ BTF_ID(func, bpf_lsm_bpf_map_free)
 BTF_ID(func, bpf_lsm_bpf_prog)
 BTF_ID(func, bpf_lsm_bpf_prog_load)
 BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_token_create)
+BTF_ID(func, bpf_lsm_bpf_token_free)
+BTF_ID(func, bpf_lsm_bpf_token_cmd)
+BTF_ID(func, bpf_lsm_bpf_token_capable)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 5a51e6b8f6bf..17212efcde60 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -7,6 +7,7 @@
 #include <linux/idr.h>
 #include <linux/namei.h>
 #include <linux/user_namespace.h>
+#include <linux/security.h>
 
 bool bpf_token_capable(const struct bpf_token *token, int cap)
 {
@@ -14,10 +15,9 @@ bool bpf_token_capable(const struct bpf_token *token, int cap)
 	 * token's userns is *exactly* the same as current user's userns
 	 */
 	if (token && current_user_ns() == token->userns) {
-		if (ns_capable(token->userns, cap))
-			return true;
-		if (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN))
-			return true;
+		if (ns_capable(token->userns, cap) ||
+		    (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)))
+			return security_bpf_token_capable(token, cap) == 0;
 	}
 	/* otherwise fallback to capable() checks */
 	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
@@ -30,6 +30,7 @@ void bpf_token_inc(struct bpf_token *token)
 
 static void bpf_token_free(struct bpf_token *token)
 {
+	security_bpf_token_free(token);
 	put_user_ns(token->userns);
 	kvfree(token);
 }
@@ -186,6 +187,10 @@ int bpf_token_create(union bpf_attr *attr)
 	token->allowed_progs = mnt_opts->delegate_progs;
 	token->allowed_attachs = mnt_opts->delegate_attachs;
 
+	err = security_bpf_token_create(token, attr, &path);
+	if (err)
+		goto out_token;
+
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
 		err = fd;
@@ -233,8 +238,9 @@ bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
 	 */
 	if (!token || current_user_ns() != token->userns)
 		return false;
-
-	return token->allowed_cmds & (1ULL << cmd);
+	if (!(token->allowed_cmds & (1ULL << cmd)))
+		return false;
+	return security_bpf_token_cmd(token, cmd) == 0;
 }
 
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
diff --git a/security/security.c b/security/security.c
index ad24cf36da94..088a79c35c26 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5201,6 +5201,55 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
 }
 
+/**
+ * security_bpf_token_create() - Check if creating of BPF token is allowed
+ * @token: BPF token object
+ * @attr: BPF syscall attributes used to create BPF token
+ * @path: path pointing to BPF FS mount point from which BPF token is created
+ *
+ * Do a check when the kernel instantiates a new BPF token object from BPF FS
+ * instance. This is also the point where LSM blob can be allocated for LSMs.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
+			      struct path *path)
+{
+	return call_int_hook(bpf_token_create, 0, token, attr, path);
+}
+
+/**
+ * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
+ * requested BPF syscall command
+ * @token: BPF token object
+ * @cmd: BPF syscall command requested to be delegated by BPF token
+ *
+ * Do a check when the kernel decides whether provided BPF token should allow
+ * delegation of requested BPF syscall command.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+{
+	return call_int_hook(bpf_token_cmd, 0, token, cmd);
+}
+
+/**
+ * security_bpf_token_capable() - Check if BPF token is allowed to delegate
+ * requested BPF-related capability
+ * @token: BPF token object
+ * @cap: capabilities requested to be delegated by BPF token
+ *
+ * Do a check when the kernel decides whether provided BPF token should allow
+ * delegation of requested BPF-related capabilities.
+ *
+ * Return: Returns 0 on success, error on failure.
+ */
+int security_bpf_token_capable(const struct bpf_token *token, int cap)
+{
+	return call_int_hook(bpf_token_capable, 0, token, cap);
+}
+
 /**
  * security_bpf_map_free() - Free a bpf map's LSM blob
  * @map: bpf map
@@ -5222,6 +5271,17 @@ void security_bpf_prog_free(struct bpf_prog *prog)
 {
 	call_void_hook(bpf_prog_free, prog);
 }
+
+/**
+ * security_bpf_token_free() - Free a BPF token's LSM blob
+ * @token: BPF token struct
+ *
+ * Clean up the security information stored inside BPF token.
+ */
+void security_bpf_token_free(struct bpf_token *token)
+{
+	call_void_hook(bpf_token_free, token);
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 /**
-- 
cgit v1.2.3


From 3232e7aad11e541da86bbb1fa5ea5737b30bd006 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 5 Dec 2023 17:21:14 -0500
Subject: cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated()
 check

Currently, the cpu_is_isolated() function checks only the statically
isolated CPUs specified via the "isolcpus" and "nohz_full" kernel
command line options. This function is used by vmstat and memcg to
reduce interference with isolated CPUs by not doing stat flushing
or scheduling works on those CPUs.

Workloads running on isolated CPUs within isolated cpuset
partitions should receive the same treatment to reduce unnecessary
interference. This patch introduces a new cpuset_cpu_is_isolated()
function to be called by cpu_is_isolated() so that the set of dynamically
created cpuset isolated CPUs will be included in the check.

Assuming that testing a bit in a cpumask is atomic, no synchronization
primitive is currently used to synchronize access to the cpuset's
isolated_cpus mask.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h          |  6 ++++++
 include/linux/sched/isolation.h |  4 +++-
 kernel/cgroup/cpuset.c          | 11 +++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index d629094fac6e..875d12598bd2 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -77,6 +77,7 @@ extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern bool cpuset_cpu_is_isolated(int cpu);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
 	return false;
 }
 
+static inline bool cpuset_cpu_is_isolated(int cpu)
+{
+	return false;
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index fe1a46f30d24..2b461129d1fa 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -2,6 +2,7 @@
 #define _LINUX_SCHED_ISOLATION_H
 
 #include <linux/cpumask.h>
+#include <linux/cpuset.h>
 #include <linux/init.h>
 #include <linux/tick.h>
 
@@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
 static inline bool cpu_is_isolated(int cpu)
 {
 	return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
-		 !housekeeping_test_cpu(cpu, HK_TYPE_TICK);
+	       !housekeeping_test_cpu(cpu, HK_TYPE_TICK) ||
+	       cpuset_cpu_is_isolated(cpu);
 }
 
 #endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2a16df86c55c..dfbb16aca9f4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1518,6 +1518,17 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
 	WARN_ON_ONCE(ret < 0);
 }
 
+/**
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+	return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
 /*
  * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
  * @cs: cpuset
-- 
cgit v1.2.3


From b2dd797543cfa6580eac8408dd67fa02164d9e56 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Dec 2023 10:02:44 -0500
Subject: ring-buffer: Force absolute timestamp on discard of event

There's a race where if an event is discarded from the ring buffer and an
interrupt were to happen at that time and insert an event, the time stamp
is still used from the discarded event as an offset. This can screw up the
timings.

If the event is going to be discarded, set the "before_stamp" to zero.
When a new event comes in, it compares the "before_stamp" with the
"write_stamp" and if they are not equal, it will insert an absolute
timestamp. This will prevent the timings from getting out of sync due to
the discarded event.

Link: https://lore.kernel.org/linux-trace-kernel/20231206100244.5130f9b3@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: 6f6be606e763f ("ring-buffer: Force before_stamp and write_stamp to be different on discard")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 43cc47d7faaf..a6da2d765c78 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3030,22 +3030,19 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
 		unsigned long event_length = rb_event_length(event);
 
+		/*
+		 * For the before_stamp to be different than the write_stamp
+		 * to make sure that the next event adds an absolute
+		 * value and does not rely on the saved write stamp, which
+		 * is now going to be bogus.
+		 */
+		rb_time_set(&cpu_buffer->before_stamp, 0);
+
 		/* Something came in, can't discard */
 		if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
 				       write_stamp, write_stamp - delta))
 			return false;
 
-		/*
-		 * It's possible that the event time delta is zero
-		 * (has the same time stamp as the previous event)
-		 * in which case write_stamp and before_stamp could
-		 * be the same. In such a case, force before_stamp
-		 * to be different than write_stamp. It doesn't
-		 * matter what it is, as long as its different.
-		 */
-		if (!delta)
-			rb_time_set(&cpu_buffer->before_stamp, 0);
-
 		/*
 		 * If an event were to come in now, it would see that the
 		 * write_stamp and the before_stamp are different, and assume
-- 
cgit v1.2.3


From f458a1453424e03462b5bb539673c9a3cddda480 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 6 Dec 2023 10:00:50 -0500
Subject: ring-buffer: Test last update in 32bit version of __rb_time_read()

Since 64 bit cmpxchg() is very expensive on 32bit architectures, the
timestamp used by the ring buffer does some interesting tricks to be able
to still have an atomic 64 bit number. It originally just used 60 bits and
broke it up into two 32 bit words where the extra 2 bits were used for
synchronization. But this was not enough for all use cases, and all 64
bits were required.

The 32bit version of the ring buffer timestamp was then broken up into 3
32bit words using the same counter trick. But one update was not done. The
check to see if the read operation was done without interruption only
checked the first two words and not last one (like it had before this
update). Fix it by making sure all three updates happen without
interruption by comparing the initial counter with the last updated
counter.

Link: https://lore.kernel.org/linux-trace-kernel/20231206100050.3100b7bb@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: f03f2abce4f39 ("ring-buffer: Have 32 bit time stamps use all 64 bits")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a6da2d765c78..8d2a4f00eca9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 
 	*cnt = rb_time_cnt(top);
 
-	/* If top and bottom counts don't match, this interrupted a write */
-	if (*cnt != rb_time_cnt(bottom))
+	/* If top and msb counts don't match, this interrupted a write */
+	if (*cnt != rb_time_cnt(msb))
 		return false;
 
 	/* The shift to msb will lose its cnt bits */
-- 
cgit v1.2.3


From 4b7de801606e504e69689df71475d27e35336fb3 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 6 Dec 2023 09:30:40 +0100
Subject: bpf: Fix prog_array_map_poke_run map poke update

Lee pointed out issue found by syscaller [0] hitting BUG in prog array
map poke update in prog_array_map_poke_run function due to error value
returned from bpf_arch_text_poke function.

There's race window where bpf_arch_text_poke can fail due to missing
bpf program kallsym symbols, which is accounted for with check for
-EINVAL in that BUG_ON call.

The problem is that in such case we won't update the tail call jump
and cause imbalance for the next tail call update check which will
fail with -EBUSY in bpf_arch_text_poke.

I'm hitting following race during the program load:

  CPU 0                             CPU 1

  bpf_prog_load
    bpf_check
      do_misc_fixups
        prog_array_map_poke_track

                                    map_update_elem
                                      bpf_fd_array_map_update_elem
                                        prog_array_map_poke_run

                                          bpf_arch_text_poke returns -EINVAL

    bpf_prog_kallsyms_add

After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next
poke update fails on expected jump instruction check in bpf_arch_text_poke
with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run.

Similar race exists on the program unload.

Fixing this by moving the update to bpf_arch_poke_desc_update function which
makes sure we call __bpf_arch_text_poke that skips the bpf address check.

Each architecture has slightly different approach wrt looking up bpf address
in bpf_arch_text_poke, so instead of splitting the function or adding new
'checkip' argument in previous version, it seems best to move the whole
map_poke_run update as arch specific code.

  [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810

Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Cc: Lee Jones <lee@kernel.org>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 46 +++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |  3 +++
 kernel/bpf/arraymap.c       | 58 ++++++++-------------------------------------
 3 files changed, 59 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c10d9abc239..e89e415aa743 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
 #endif
 	WARN(1, "verification of programs using bpf_throw should have failed\n");
 }
+
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+			       struct bpf_prog *new, struct bpf_prog *old)
+{
+	u8 *old_addr, *new_addr, *old_bypass_addr;
+	int ret;
+
+	old_bypass_addr = old ? NULL : poke->bypass_addr;
+	old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+	new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+	/*
+	 * On program loading or teardown, the program's kallsym entry
+	 * might not be in place, so we use __bpf_arch_text_poke to skip
+	 * the kallsyms check.
+	 */
+	if (new) {
+		ret = __bpf_arch_text_poke(poke->tailcall_target,
+					   BPF_MOD_JUMP,
+					   old_addr, new_addr);
+		BUG_ON(ret < 0);
+		if (!old) {
+			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+						   BPF_MOD_JUMP,
+						   poke->bypass_addr,
+						   NULL);
+			BUG_ON(ret < 0);
+		}
+	} else {
+		ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+					   BPF_MOD_JUMP,
+					   old_bypass_addr,
+					   poke->bypass_addr);
+		BUG_ON(ret < 0);
+		/* let other CPUs finish the execution of program
+		 * so that it will not possible to expose them
+		 * to invalid nop, stack unwind, nop state
+		 */
+		if (!ret)
+			synchronize_rcu();
+		ret = __bpf_arch_text_poke(poke->tailcall_target,
+					   BPF_MOD_JUMP,
+					   old_addr, NULL);
+		BUG_ON(ret < 0);
+	}
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6762dac3ef76..cff5bb08820e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3175,6 +3175,9 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+			       struct bpf_prog *new, struct bpf_prog *old);
+
 void *bpf_arch_text_copy(void *dst, void *src, size_t len);
 int bpf_arch_text_invalidate(void *dst, size_t len);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd..c85ff9162a5c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
 	mutex_unlock(&aux->poke_mutex);
 }
 
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+				      struct bpf_prog *new, struct bpf_prog *old)
+{
+	WARN_ON_ONCE(1);
+}
+
 static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 				    struct bpf_prog *old,
 				    struct bpf_prog *new)
 {
-	u8 *old_addr, *new_addr, *old_bypass_addr;
 	struct prog_poke_elem *elem;
 	struct bpf_array_aux *aux;
 
@@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 
 	list_for_each_entry(elem, &aux->poke_progs, list) {
 		struct bpf_jit_poke_descriptor *poke;
-		int i, ret;
+		int i;
 
 		for (i = 0; i < elem->aux->size_poke_tab; i++) {
 			poke = &elem->aux->poke_tab[i];
@@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 			 *    activated, so tail call updates can arrive from here
 			 *    while JIT is still finishing its final fixup for
 			 *    non-activated poke entries.
-			 * 3) On program teardown, the program's kallsym entry gets
-			 *    removed out of RCU callback, but we can only untrack
-			 *    from sleepable context, therefore bpf_arch_text_poke()
-			 *    might not see that this is in BPF text section and
-			 *    bails out with -EINVAL. As these are unreachable since
-			 *    RCU grace period already passed, we simply skip them.
-			 * 4) Also programs reaching refcount of zero while patching
+			 * 3) Also programs reaching refcount of zero while patching
 			 *    is in progress is okay since we're protected under
 			 *    poke_mutex and untrack the programs before the JIT
-			 *    buffer is freed. When we're still in the middle of
-			 *    patching and suddenly kallsyms entry of the program
-			 *    gets evicted, we just skip the rest which is fine due
-			 *    to point 3).
-			 * 5) Any other error happening below from bpf_arch_text_poke()
-			 *    is a unexpected bug.
+			 *    buffer is freed.
 			 */
 			if (!READ_ONCE(poke->tailcall_target_stable))
 				continue;
@@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 			    poke->tail_call.key != key)
 				continue;
 
-			old_bypass_addr = old ? NULL : poke->bypass_addr;
-			old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
-			new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
-
-			if (new) {
-				ret = bpf_arch_text_poke(poke->tailcall_target,
-							 BPF_MOD_JUMP,
-							 old_addr, new_addr);
-				BUG_ON(ret < 0 && ret != -EINVAL);
-				if (!old) {
-					ret = bpf_arch_text_poke(poke->tailcall_bypass,
-								 BPF_MOD_JUMP,
-								 poke->bypass_addr,
-								 NULL);
-					BUG_ON(ret < 0 && ret != -EINVAL);
-				}
-			} else {
-				ret = bpf_arch_text_poke(poke->tailcall_bypass,
-							 BPF_MOD_JUMP,
-							 old_bypass_addr,
-							 poke->bypass_addr);
-				BUG_ON(ret < 0 && ret != -EINVAL);
-				/* let other CPUs finish the execution of program
-				 * so that it will not possible to expose them
-				 * to invalid nop, stack unwind, nop state
-				 */
-				if (!ret)
-					synchronize_rcu();
-				ret = bpf_arch_text_poke(poke->tailcall_target,
-							 BPF_MOD_JUMP,
-							 old_addr, NULL);
-				BUG_ON(ret < 0 && ret != -EINVAL);
-			}
+			bpf_arch_poke_desc_update(poke, new, old);
 		}
 	}
 }
-- 
cgit v1.2.3


From dccf78d39f1069a5ddf4328bf0c97aa5f2f4296e Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 28 Nov 2023 13:44:57 +0800
Subject: kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP

Ignat Korchagin complained that a potential config regression was
introduced by commit 89cde455915f ("kexec: consolidate kexec and crash
options into kernel/Kconfig.kexec").  Before the commit, CONFIG_CRASH_DUMP
has no dependency on CONFIG_KEXEC.  After the commit, CRASH_DUMP selects
KEXEC.  That enforces system to have CONFIG_KEXEC=y as long as
CONFIG_CRASH_DUMP=Y which people may not want.

In Ignat's case, he sets CONFIG_CRASH_DUMP=y, CONFIG_KEXEC_FILE=y and
CONFIG_KEXEC=n because kexec_load interface could have security issue if
kernel/initrd has no chance to be signed and verified.

CRASH_DUMP has select of KEXEC because Eric, author of above commit, met a
LKP report of build failure when posting patch of earlier version.  Please
see below link to get detail of the LKP report:

    https://lore.kernel.org/all/3e8eecd1-a277-2cfb-690e-5de2eb7b988e@oracle.com/T/#u

In fact, that LKP report is triggered because arm's <asm/kexec.h> is
wrapped in CONFIG_KEXEC ifdeffery scope.  That is wrong.  CONFIG_KEXEC
controls the enabling/disabling of kexec_load interface, but not kexec
feature.  Removing the wrongly added CONFIG_KEXEC ifdeffery scope in
<asm/kexec.h> of arm allows us to drop the select KEXEC for CRASH_DUMP.
Meanwhile, change arch/arm/kernel/Makefile to let machine_kexec.o
relocate_kernel.o depend on KEXEC_CORE.

Link: https://lkml.kernel.org/r/20231128054457.659452-1-bhe@redhat.com
Fixes: 89cde455915f ("kexec: consolidate kexec and crash options into kernel/Kconfig.kexec")
Signed-off-by: Baoquan He <bhe@redhat.com>
Reported-by: Ignat Korchagin <ignat@cloudflare.com>
Tested-by: Ignat Korchagin <ignat@cloudflare.com>	[compile-time only]
Tested-by: Alexander Gordeev <agordeev@linux.ibm.com>
Reviewed-by: Eric DeVolder <eric_devolder@yahoo.com>
Tested-by: Eric DeVolder <eric_devolder@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/include/asm/kexec.h | 4 ----
 arch/arm/kernel/Makefile     | 2 +-
 kernel/Kconfig.kexec         | 1 -
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h
index e62832dcba76..a8287e7ab9d4 100644
--- a/arch/arm/include/asm/kexec.h
+++ b/arch/arm/include/asm/kexec.h
@@ -2,8 +2,6 @@
 #ifndef _ARM_KEXEC_H
 #define _ARM_KEXEC_H
 
-#ifdef CONFIG_KEXEC
-
 /* Maximum physical address we can use pages from */
 #define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
 /* Maximum address we can reach in physical address mode */
@@ -82,6 +80,4 @@ static inline struct page *boot_pfn_to_page(unsigned long boot_pfn)
 
 #endif /* __ASSEMBLY__ */
 
-#endif /* CONFIG_KEXEC */
-
 #endif /* _ARM_KEXEC_H */
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index d53f56d6f840..771264d4726a 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_FUNCTION_TRACER)	+= entry-ftrace.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o insn.o patch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o insn.o patch.o
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o insn.o patch.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
+obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec.o relocate_kernel.o
 # Main staffs in KPROBES are in arch/arm/probes/ .
 obj-$(CONFIG_KPROBES)		+= patch.o insn.o
 obj-$(CONFIG_OABI_COMPAT)	+= sys_oabi-compat.o
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 7aff28ded2f4..1cc3b1c595d7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -97,7 +97,6 @@ config CRASH_DUMP
 	depends on ARCH_SUPPORTS_KEXEC
 	select CRASH_CORE
 	select KEXEC_CORE
-	select KEXEC
 	help
 	  Generate crash dump after being started by kexec.
 	  This should be normally only set in special crash dump kernels
-- 
cgit v1.2.3


From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:48 -0800
Subject: bpf: Let bpf_prog_pack_free handle any pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, bpf_prog_pack_free only can only free pointer to struct
bpf_binary_header, which is not flexible. Add a size argument to
bpf_prog_pack_free so that it can handle any pointer.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h  |  2 +-
 kernel/bpf/core.c       | 21 ++++++++++-----------
 kernel/bpf/dispatcher.c |  5 +----
 3 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 14354605ad26..12d907f17d36 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1067,7 +1067,7 @@ struct bpf_binary_header *
 bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
-void bpf_prog_pack_free(struct bpf_binary_header *hdr);
+void bpf_prog_pack_free(void *ptr, u32 size);
 
 static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ced511f44174..c34513d645c4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -928,20 +928,20 @@ out:
 	return ptr;
 }
 
-void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(void *ptr, u32 size)
 {
 	struct bpf_prog_pack *pack = NULL, *tmp;
 	unsigned int nbits;
 	unsigned long pos;
 
 	mutex_lock(&pack_mutex);
-	if (hdr->size > BPF_PROG_PACK_SIZE) {
-		bpf_jit_free_exec(hdr);
+	if (size > BPF_PROG_PACK_SIZE) {
+		bpf_jit_free_exec(ptr);
 		goto out;
 	}
 
 	list_for_each_entry(tmp, &pack_list, list) {
-		if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
+		if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) {
 			pack = tmp;
 			break;
 		}
@@ -950,10 +950,10 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 	if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
 		goto out;
 
-	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
-	pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
+	nbits = BPF_PROG_SIZE_TO_NBITS(size);
+	pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
 
-	WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
+	WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
 		  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
 
 	bitmap_clear(pack->bitmap, pos, nbits);
@@ -1100,8 +1100,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 
 	*rw_header = kvmalloc(size, GFP_KERNEL);
 	if (!*rw_header) {
-		bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
-		bpf_prog_pack_free(ro_header);
+		bpf_prog_pack_free(ro_header, size);
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
 	}
@@ -1132,7 +1131,7 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
 	kvfree(rw_header);
 
 	if (IS_ERR(ptr)) {
-		bpf_prog_pack_free(ro_header);
+		bpf_prog_pack_free(ro_header, ro_header->size);
 		return PTR_ERR(ptr);
 	}
 	return 0;
@@ -1153,7 +1152,7 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
 {
 	u32 size = ro_header->size;
 
-	bpf_prog_pack_free(ro_header);
+	bpf_prog_pack_free(ro_header, size);
 	kvfree(rw_header);
 	bpf_jit_uncharge_modmem(size);
 }
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index fa3e9225aedc..56760fc10e78 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -150,10 +150,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 			goto out;
 		d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
 		if (!d->rw_image) {
-			u32 size = PAGE_SIZE;
-
-			bpf_arch_text_copy(d->image, &size, sizeof(size));
-			bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+			bpf_prog_pack_free(d->image, PAGE_SIZE);
 			d->image = NULL;
 			goto out;
 		}
-- 
cgit v1.2.3


From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:49 -0800
Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline()

We are using "im" for "struct bpf_tramp_image" and "tr" for "struct
bpf_trampoline" in most of the code base. The only exception is the
prototype and fallback version of arch_prepare_bpf_trampoline(). Update
them to match the rest of the code base.

We mix "orig_call" and "func_addr" for the argument in different versions
of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 10 +++++-----
 include/linux/bpf.h           |  4 ++--
 kernel/bpf/trampoline.c       |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 7d4af64e3982..d81b886ea4df 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1828,7 +1828,7 @@ static void restore_args(struct jit_ctx *ctx, int args_off, int nregs)
  *
  */
 static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
-			      struct bpf_tramp_links *tlinks, void *orig_call,
+			      struct bpf_tramp_links *tlinks, void *func_addr,
 			      int nregs, u32 flags)
 {
 	int i;
@@ -1926,7 +1926,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 
 	if (flags & BPF_TRAMP_F_IP_ARG) {
 		/* save ip address of the traced function */
-		emit_addr_mov_i64(A64_R(10), (const u64)orig_call, ctx);
+		emit_addr_mov_i64(A64_R(10), (const u64)func_addr, ctx);
 		emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx);
 	}
 
@@ -2029,7 +2029,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
-				void *orig_call)
+				void *func_addr)
 {
 	int i, ret;
 	int nregs = m->nr_args;
@@ -2050,7 +2050,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	if (nregs > 8)
 		return -ENOTSUPP;
 
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 	if (ret < 0)
 		return ret;
 
@@ -2061,7 +2061,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	ctx.idx = 0;
 
 	jit_fill_hole(image, (unsigned int)(image_end - image));
-	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nregs, flags);
+	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 
 	if (ret > 0 && validate_code(&ctx) < 0)
 		ret = -EINVAL;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a483f6b6d5f..17eb6d905204 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx;
  *      fexit = a set of program to run after original function
  */
 struct bpf_tramp_image;
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
-				void *orig_call);
+				void *func_addr);
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e97aeda3a86b..e114a1c7961e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1032,10 +1032,10 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
 }
 
 int __weak
-arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 			    const struct btf_func_model *m, u32 flags,
 			    struct bpf_tramp_links *tlinks,
-			    void *orig_call)
+			    void *func_addr)
 {
 	return -ENOTSUPP;
 }
-- 
cgit v1.2.3


From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:50 -0800
Subject: bpf: Add helpers for trampoline image management

As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec()
to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for
different archs during the transition. Add the following helpers for this
transition:

void *arch_alloc_bpf_trampoline(unsigned int size);
void arch_free_bpf_trampoline(void *image, unsigned int size);
void arch_protect_bpf_trampoline(void *image, unsigned int size);
void arch_unprotect_bpf_trampoline(void *image, unsigned int size);

The fallback version of these helpers require size <= PAGE_SIZE, but they
are only called with size == PAGE_SIZE. They will be called with size <
PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  5 +++++
 kernel/bpf/bpf_struct_ops.c    | 12 +++++------
 kernel/bpf/trampoline.c        | 46 +++++++++++++++++++++++++++++++++++-------
 net/bpf/bpf_dummy_struct_ops.c |  7 +++----
 4 files changed, 52 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 17eb6d905204..b7fca151cf1b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
 				void *func_addr);
+void *arch_alloc_bpf_trampoline(unsigned int size);
+void arch_free_bpf_trampoline(void *image, unsigned int size);
+void arch_protect_bpf_trampoline(void *image, unsigned int size);
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index db6176fb64dc..e9e95879bce2 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -515,7 +515,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			if (err)
 				goto reset_unlock;
 		}
-		set_memory_rox((long)st_map->image, 1);
+		arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 		/* Let bpf_link handle registration & unregistration.
 		 *
 		 * Pair with smp_load_acquire() during lookup_elem().
@@ -524,7 +524,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		goto unlock;
 	}
 
-	set_memory_rox((long)st_map->image, 1);
+	arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
 		/* This refcnt increment on the map here after
@@ -547,8 +547,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 * there was a race in registering the struct_ops (under the same name) to
 	 * a sub-system through different struct_ops's maps.
 	 */
-	set_memory_nx((long)st_map->image, 1);
-	set_memory_rw((long)st_map->image, 1);
+	arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -616,7 +615,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 		bpf_struct_ops_map_put_progs(st_map);
 	bpf_map_area_free(st_map->links);
 	if (st_map->image) {
-		bpf_jit_free_exec(st_map->image);
+		arch_free_bpf_trampoline(st_map->image, PAGE_SIZE);
 		bpf_jit_uncharge_modmem(PAGE_SIZE);
 	}
 	bpf_map_area_free(st_map->uvalue);
@@ -691,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(ret);
 	}
 
-	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
+	st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!st_map->image) {
 		/* __bpf_struct_ops_map_free() uses st_map->image as flag
 		 * for "charged or not". In this case, we need to unchange
@@ -711,7 +710,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	}
 
 	mutex_init(&st_map->lock);
-	set_vm_flush_reset_perms(st_map->image);
 	bpf_map_init_from_attr(map, attr);
 
 	return map;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e114a1c7961e..affbcbf7e76e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -254,7 +254,7 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
 static void bpf_tramp_image_free(struct bpf_tramp_image *im)
 {
 	bpf_image_ksym_del(&im->ksym);
-	bpf_jit_free_exec(im->image);
+	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
 	bpf_jit_uncharge_modmem(PAGE_SIZE);
 	percpu_ref_exit(&im->pcref);
 	kfree_rcu(im, rcu);
@@ -365,10 +365,9 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 		goto out_free_im;
 
 	err = -ENOMEM;
-	im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
+	im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!image)
 		goto out_uncharge;
-	set_vm_flush_reset_perms(image);
 
 	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
 	if (err)
@@ -381,7 +380,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 	return im;
 
 out_free_image:
-	bpf_jit_free_exec(im->image);
+	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
 out_uncharge:
 	bpf_jit_uncharge_modmem(PAGE_SIZE);
 out_free_im:
@@ -444,7 +443,7 @@ again:
 	if (err < 0)
 		goto out_free;
 
-	set_memory_rox((long)im->image, 1);
+	arch_protect_bpf_trampoline(im->image, PAGE_SIZE);
 
 	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
@@ -465,8 +464,7 @@ again:
 		tr->fops->trampoline = 0;
 
 		/* reset im->image memory attr for arch_prepare_bpf_trampoline */
-		set_memory_nx((long)im->image, 1);
-		set_memory_rw((long)im->image, 1);
+		arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE);
 		goto again;
 	}
 #endif
@@ -1040,6 +1038,40 @@ arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image
 	return -ENOTSUPP;
 }
 
+void * __weak arch_alloc_bpf_trampoline(unsigned int size)
+{
+	void *image;
+
+	if (WARN_ON_ONCE(size > PAGE_SIZE))
+		return NULL;
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (image)
+		set_vm_flush_reset_perms(image);
+	return image;
+}
+
+void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+	WARN_ON_ONCE(size > PAGE_SIZE);
+	/* bpf_jit_free_exec doesn't need "size", but
+	 * bpf_prog_pack_free() needs it.
+	 */
+	bpf_jit_free_exec(image);
+}
+
+void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+	WARN_ON_ONCE(size > PAGE_SIZE);
+	set_memory_rox((long)image, 1);
+}
+
+void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+	WARN_ON_ONCE(size > PAGE_SIZE);
+	set_memory_nx((long)image, 1);
+	set_memory_rw((long)image, 1);
+}
+
 static int __init init_trampolines(void)
 {
 	int i;
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 5918d1b32e19..2748f9d77b18 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -101,12 +101,11 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 		goto out;
 	}
 
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	image = arch_alloc_bpf_trampoline(PAGE_SIZE);
 	if (!image) {
 		err = -ENOMEM;
 		goto out;
 	}
-	set_vm_flush_reset_perms(image);
 
 	link = kzalloc(sizeof(*link), GFP_USER);
 	if (!link) {
@@ -124,7 +123,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (err < 0)
 		goto out;
 
-	set_memory_rox((long)image, 1);
+	arch_protect_bpf_trampoline(image, PAGE_SIZE);
 	prog_ret = dummy_ops_call_op(image, args);
 
 	err = dummy_ops_copy_args(args);
@@ -134,7 +133,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 		err = -EFAULT;
 out:
 	kfree(args);
-	bpf_jit_free_exec(image);
+	arch_free_bpf_trampoline(image, PAGE_SIZE);
 	if (link)
 		bpf_link_put(&link->link);
 	kfree(tlinks);
-- 
cgit v1.2.3


From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:52 -0800
Subject: bpf: Add arch_bpf_trampoline_size()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helper will be used to calculate the size of the trampoline before
allocating the memory.

arch_prepare_bpf_trampoline() for arm64 and riscv64 can use
arch_bpf_trampoline_size() to check the trampoline fits in the image.

OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process
twice, so it cannot use arch_bpf_trampoline_size().

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Björn Töpel <bjorn@rivosinc.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com> # on riscv
Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c   | 56 ++++++++++++++++++++++++++++++-----------
 arch/riscv/net/bpf_jit_comp64.c | 22 ++++++++++++----
 arch/s390/net/bpf_jit_comp.c    | 56 +++++++++++++++++++++++++----------------
 arch/x86/net/bpf_jit_comp.c     | 40 ++++++++++++++++++++++++++---
 include/linux/bpf.h             |  2 ++
 kernel/bpf/trampoline.c         |  6 +++++
 6 files changed, 136 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index d81b886ea4df..a6671253b7ed 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2026,18 +2026,10 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
 	return ctx->idx;
 }
 
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
-				void *image_end, const struct btf_func_model *m,
-				u32 flags, struct bpf_tramp_links *tlinks,
-				void *func_addr)
+static int btf_func_model_nregs(const struct btf_func_model *m)
 {
-	int i, ret;
 	int nregs = m->nr_args;
-	int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE;
-	struct jit_ctx ctx = {
-		.image = NULL,
-		.idx = 0,
-	};
+	int i;
 
 	/* extra registers needed for struct argument */
 	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
@@ -2046,19 +2038,53 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 			nregs += (m->arg_size[i] + 7) / 8 - 1;
 	}
 
+	return nregs;
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	struct jit_ctx ctx = {
+		.image = NULL,
+		.idx = 0,
+	};
+	struct bpf_tramp_image im;
+	int nregs, ret;
+
+	nregs = btf_func_model_nregs(m);
 	/* the first 8 registers are used for arguments */
 	if (nregs > 8)
 		return -ENOTSUPP;
 
-	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
+	ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags);
 	if (ret < 0)
 		return ret;
 
-	if (ret > max_insns)
-		return -EFBIG;
+	return ret < 0 ? ret : ret * AARCH64_INSN_SIZE;
+}
 
-	ctx.image = image;
-	ctx.idx = 0;
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
+				void *image_end, const struct btf_func_model *m,
+				u32 flags, struct bpf_tramp_links *tlinks,
+				void *func_addr)
+{
+	int ret, nregs;
+	struct jit_ctx ctx = {
+		.image = image,
+		.idx = 0,
+	};
+
+	nregs = btf_func_model_nregs(m);
+	/* the first 8 registers are used for arguments */
+	if (nregs > 8)
+		return -ENOTSUPP;
+
+	ret = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
+	if (ret < 0)
+		return ret;
+
+	if (ret > ((long)image_end - (long)image))
+		return -EFBIG;
 
 	jit_fill_hole(image, (unsigned int)(image_end - image));
 	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 8581693e62d3..35747fafde57 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1029,6 +1029,21 @@ out:
 	return ret;
 }
 
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	struct bpf_tramp_image im;
+	struct rv_jit_context ctx;
+	int ret;
+
+	ctx.ninsns = 0;
+	ctx.insns = NULL;
+	ctx.ro_insns = NULL;
+	ret = __arch_prepare_bpf_trampoline(&im, m, tlinks, func_addr, flags, &ctx);
+
+	return ret < 0 ? ret : ninsns_rvoff(ctx.ninsns);
+}
+
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
@@ -1037,14 +1052,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	int ret;
 	struct rv_jit_context ctx;
 
-	ctx.ninsns = 0;
-	ctx.insns = NULL;
-	ctx.ro_insns = NULL;
-	ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx);
+	ret = arch_bpf_trampoline_size(im, m, flags, tlinks, func_addr);
 	if (ret < 0)
 		return ret;
 
-	if (ninsns_rvoff(ret) > (long)image_end - (long)image)
+	if (ret > (long)image_end - (long)image)
 		return -EFBIG;
 
 	ctx.ninsns = 0;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bf06b7283f0c..cc129617480a 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2637,6 +2637,21 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 	return 0;
 }
 
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *orig_call)
+{
+	struct bpf_tramp_image im;
+	struct bpf_tramp_jit tjit;
+	int ret;
+
+	memset(&tjit, 0, sizeof(tjit));
+
+	ret = __arch_prepare_bpf_trampoline(&im, &tjit, m, flags,
+					    tlinks, orig_call);
+
+	return ret < 0 ? ret : tjit.common.prg;
+}
+
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				void *image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
@@ -2644,30 +2659,27 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 {
 	struct bpf_tramp_jit tjit;
 	int ret;
-	int i;
 
-	for (i = 0; i < 2; i++) {
-		if (i == 0) {
-			/* Compute offsets, check whether the code fits. */
-			memset(&tjit, 0, sizeof(tjit));
-		} else {
-			/* Generate the code. */
-			tjit.common.prg = 0;
-			tjit.common.prg_buf = image;
-		}
-		ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
-						    tlinks, func_addr);
-		if (ret < 0)
-			return ret;
-		if (tjit.common.prg > (char *)image_end - (char *)image)
-			/*
-			 * Use the same error code as for exceeding
-			 * BPF_MAX_TRAMP_LINKS.
-			 */
-			return -E2BIG;
-	}
+	/* Compute offsets, check whether the code fits. */
+	memset(&tjit, 0, sizeof(tjit));
+	ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+					    tlinks, func_addr);
+
+	if (ret < 0)
+		return ret;
+	if (tjit.common.prg > (char *)image_end - (char *)image)
+		/*
+		 * Use the same error code as for exceeding
+		 * BPF_MAX_TRAMP_LINKS.
+		 */
+		return -E2BIG;
+
+	tjit.common.prg = 0;
+	tjit.common.prg_buf = image;
+	ret = __arch_prepare_bpf_trampoline(im, &tjit, m, flags,
+					    tlinks, func_addr);
 
-	return tjit.common.prg;
+	return ret < 0 ? ret : tjit.common.prg;
 }
 
 bool bpf_jit_supports_subprog_tailcalls(void)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5f7528cac344..5d75069fdcc2 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2422,10 +2422,10 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
  * add rsp, 8                      // skip eth_type_trans's frame
  * ret                             // return to its caller
  */
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
-				const struct btf_func_model *m, u32 flags,
-				struct bpf_tramp_links *tlinks,
-				void *func_addr)
+static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+					 const struct btf_func_model *m, u32 flags,
+					 struct bpf_tramp_links *tlinks,
+					 void *func_addr)
 {
 	int i, ret, nr_regs = m->nr_args, stack_size = 0;
 	int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
@@ -2678,6 +2678,38 @@ cleanup:
 	return ret;
 }
 
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+				const struct btf_func_model *m, u32 flags,
+				struct bpf_tramp_links *tlinks,
+				void *func_addr)
+{
+	return __arch_prepare_bpf_trampoline(im, image, image_end, m, flags, tlinks, func_addr);
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	struct bpf_tramp_image im;
+	void *image;
+	int ret;
+
+	/* Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
+	 * This will NOT cause fragmentation in direct map, as we do not
+	 * call set_memory_*() on this buffer.
+	 *
+	 * We cannot use kvmalloc here, because we need image to be in
+	 * module memory range.
+	 */
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!image)
+		return -ENOMEM;
+
+	ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, m, flags,
+					    tlinks, func_addr);
+	bpf_jit_free_exec(image);
+	return ret;
+}
+
 static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
 {
 	u8 *jg_reloc, *prog = *pprog;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b7fca151cf1b..2332ddeb396b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size);
 void arch_free_bpf_trampoline(void *image, unsigned int size);
 void arch_protect_bpf_trampoline(void *image, unsigned int size);
 void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+			     struct bpf_tramp_links *tlinks, void *func_addr);
 
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index affbcbf7e76e..b553cbd89e55 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1072,6 +1072,12 @@ void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
 	set_memory_rw((long)image, 1);
 }
 
+int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+				    struct bpf_tramp_links *tlinks, void *func_addr)
+{
+	return -ENOTSUPP;
+}
+
 static int __init init_trampolines(void)
 {
 	int i;
-- 
cgit v1.2.3


From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 6 Dec 2023 14:40:53 -0800
Subject: bpf: Use arch_bpf_trampoline_size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of blindly allocating PAGE_SIZE for each trampoline, check the size
of the trampoline with arch_bpf_trampoline_size(). This size is saved in
bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback
arch_alloc_bpf_trampoline() still allocates a whole page because we need to
use set_memory_* to protect the memory.

struct_ops trampoline still uses a whole page for multiple trampolines.

With this size check at caller (regular trampoline and struct_ops
trampoline), remove arch_bpf_trampoline_size() from
arch_prepare_bpf_trampoline() in archs.

Also, update bpf_image_ksym_add() to handle symbol of different sizes.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>  # on s390x
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Björn Töpel <bjorn@rivosinc.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com> # on riscv
Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c   |  7 ------
 arch/riscv/net/bpf_jit_comp64.c |  7 ------
 include/linux/bpf.h             |  3 ++-
 kernel/bpf/bpf_struct_ops.c     |  7 ++++++
 kernel/bpf/dispatcher.c         |  2 +-
 kernel/bpf/trampoline.c         | 55 +++++++++++++++++++++++++----------------
 6 files changed, 44 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a6671253b7ed..8955da5c47cf 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2079,13 +2079,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	if (nregs > 8)
 		return -ENOTSUPP;
 
-	ret = arch_bpf_trampoline_size(m, flags, tlinks, func_addr);
-	if (ret < 0)
-		return ret;
-
-	if (ret > ((long)image_end - (long)image))
-		return -EFBIG;
-
 	jit_fill_hole(image, (unsigned int)(image_end - image));
 	ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags);
 
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 35747fafde57..58dc64dd94a8 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1052,13 +1052,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	int ret;
 	struct rv_jit_context ctx;
 
-	ret = arch_bpf_trampoline_size(im, m, flags, tlinks, func_addr);
-	if (ret < 0)
-		return ret;
-
-	if (ret > (long)image_end - (long)image)
-		return -EFBIG;
-
 	ctx.ninsns = 0;
 	/*
 	 * The bpf_int_jit_compile() uses a RW buffer (ctx.insns) to write the
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2332ddeb396b..c1a06263a4f3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type {
 
 struct bpf_tramp_image {
 	void *image;
+	int size;
 	struct bpf_ksym ksym;
 	struct percpu_ref pcref;
 	void *ip_after_call;
@@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index e9e95879bce2..4d53c53fc5aa 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -355,6 +355,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      void *image, void *image_end)
 {
 	u32 flags;
+	int size;
 
 	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
 	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
@@ -362,6 +363,12 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	 * and it must be used alone.
 	 */
 	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+
+	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+	if (size < 0)
+		return size;
+	if (size > (unsigned long)image_end - (unsigned long)image)
+		return -E2BIG;
 	return arch_prepare_bpf_trampoline(NULL, image, image_end,
 					   model, flags, tlinks, NULL);
 }
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 56760fc10e78..70fb82bf1637 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -154,7 +154,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 			d->image = NULL;
 			goto out;
 		}
-		bpf_image_ksym_add(d->image, &d->ksym);
+		bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
 	}
 
 	prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b553cbd89e55..d382f5ebe06c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -115,10 +115,10 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }
 
-void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
+void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
 {
 	ksym->start = (unsigned long) data;
-	ksym->end = ksym->start + PAGE_SIZE;
+	ksym->end = ksym->start + size;
 	bpf_ksym_add(ksym);
 	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
 			   PAGE_SIZE, false, ksym->name);
@@ -254,8 +254,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_a
 static void bpf_tramp_image_free(struct bpf_tramp_image *im)
 {
 	bpf_image_ksym_del(&im->ksym);
-	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
-	bpf_jit_uncharge_modmem(PAGE_SIZE);
+	arch_free_bpf_trampoline(im->image, im->size);
+	bpf_jit_uncharge_modmem(im->size);
 	percpu_ref_exit(&im->pcref);
 	kfree_rcu(im, rcu);
 }
@@ -349,7 +349,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
 }
 
-static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
 {
 	struct bpf_tramp_image *im;
 	struct bpf_ksym *ksym;
@@ -360,12 +360,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 	if (!im)
 		goto out;
 
-	err = bpf_jit_charge_modmem(PAGE_SIZE);
+	err = bpf_jit_charge_modmem(size);
 	if (err)
 		goto out_free_im;
+	im->size = size;
 
 	err = -ENOMEM;
-	im->image = image = arch_alloc_bpf_trampoline(PAGE_SIZE);
+	im->image = image = arch_alloc_bpf_trampoline(size);
 	if (!image)
 		goto out_uncharge;
 
@@ -376,13 +377,13 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key)
 	ksym = &im->ksym;
 	INIT_LIST_HEAD_RCU(&ksym->lnode);
 	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
-	bpf_image_ksym_add(image, ksym);
+	bpf_image_ksym_add(image, size, ksym);
 	return im;
 
 out_free_image:
-	arch_free_bpf_trampoline(im->image, PAGE_SIZE);
+	arch_free_bpf_trampoline(im->image, im->size);
 out_uncharge:
-	bpf_jit_uncharge_modmem(PAGE_SIZE);
+	bpf_jit_uncharge_modmem(size);
 out_free_im:
 	kfree(im);
 out:
@@ -395,7 +396,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 	struct bpf_tramp_links *tlinks;
 	u32 orig_flags = tr->flags;
 	bool ip_arg = false;
-	int err, total;
+	int err, total, size;
 
 	tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
 	if (IS_ERR(tlinks))
@@ -408,12 +409,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
 		goto out;
 	}
 
-	im = bpf_tramp_image_alloc(tr->key);
-	if (IS_ERR(im)) {
-		err = PTR_ERR(im);
-		goto out;
-	}
-
 	/* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
 	tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
 
@@ -437,13 +432,31 @@ again:
 		tr->flags |= BPF_TRAMP_F_ORIG_STACK;
 #endif
 
-	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
+	size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
+					tlinks, tr->func.addr);
+	if (size < 0) {
+		err = size;
+		goto out;
+	}
+
+	if (size > PAGE_SIZE) {
+		err = -E2BIG;
+		goto out;
+	}
+
+	im = bpf_tramp_image_alloc(tr->key, size);
+	if (IS_ERR(im)) {
+		err = PTR_ERR(im);
+		goto out;
+	}
+
+	err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
 					  &tr->func.model, tr->flags, tlinks,
 					  tr->func.addr);
 	if (err < 0)
 		goto out_free;
 
-	arch_protect_bpf_trampoline(im->image, PAGE_SIZE);
+	arch_protect_bpf_trampoline(im->image, im->size);
 
 	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
@@ -463,8 +476,8 @@ again:
 		tr->fops->func = NULL;
 		tr->fops->trampoline = 0;
 
-		/* reset im->image memory attr for arch_prepare_bpf_trampoline */
-		arch_unprotect_bpf_trampoline(im->image, PAGE_SIZE);
+		/* free im memory and reallocate later */
+		bpf_tramp_image_free(im);
 		goto again;
 	}
 #endif
-- 
cgit v1.2.3


From a833a17aeac73b33f79433d7cee68d5cafd71e4f Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Wed, 6 Dec 2023 23:11:48 -0500
Subject: bpf: Fix verification of indirect var-off stack access

This patch fixes a bug around the verification of possibly-zero-sized
stack accesses. When the access was done through a var-offset stack
pointer, check_stack_access_within_bounds was incorrectly computing the
maximum-offset of a zero-sized read to be the same as the register's min
offset. Instead, we have to take in account the register's maximum
possible value. The patch also simplifies how the max offset is checked;
the check is now simpler than for min offset.

The bug was allowing accesses to erroneously pass the
check_stack_access_within_bounds() checks, only to later crash in
check_stack_range_initialized() when all the possibly-affected stack
slots are iterated (this time with a correct max offset).
check_stack_range_initialized() is relying on
check_stack_access_within_bounds() for its accesses to the
stack-tracking vector to be within bounds; in the case of zero-sized
accesses, we were essentially only verifying that the lowest possible
slot was within bounds. We would crash when the max-offset of the stack
pointer was >= 0 (which shouldn't pass verification, and hopefully is
not something anyone's code attempts to do in practice).

Thanks Hao for reporting!

Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access")
Reported-by: Hao Sun <sunhao.th@gmail.com>
Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231207041150.229139-2-andreimatei1@gmail.com

Closes: https://lore.kernel.org/bpf/CACkBjsZGEUaRCHsmaX=h-efVogsRfK1FPxmkgb0Os_frnHiNdw@mail.gmail.com/
---
 kernel/bpf/verifier.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45e85fb76d82..85e4ab61084f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6620,10 +6620,7 @@ static int check_stack_access_within_bounds(
 
 	if (tnum_is_const(reg->var_off)) {
 		min_off = reg->var_off.value + off;
-		if (access_size > 0)
-			max_off = min_off + access_size - 1;
-		else
-			max_off = min_off;
+		max_off = min_off + access_size;
 	} else {
 		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
 		    reg->smin_value <= -BPF_MAX_VAR_OFF) {
@@ -6632,15 +6629,12 @@ static int check_stack_access_within_bounds(
 			return -EACCES;
 		}
 		min_off = reg->smin_value + off;
-		if (access_size > 0)
-			max_off = reg->smax_value + off + access_size - 1;
-		else
-			max_off = min_off;
+		max_off = reg->smax_value + off + access_size;
 	}
 
 	err = check_stack_slot_within_bounds(min_off, state, type);
-	if (!err)
-		err = check_stack_slot_within_bounds(max_off, state, type);
+	if (!err && max_off > 0)
+		err = -EINVAL; /* out of stack access into non-negative offsets */
 
 	if (err) {
 		if (tnum_is_const(reg->var_off)) {
-- 
cgit v1.2.3


From 1d38a9ee81570c4bd61f557832dead4d6f816760 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Wed, 6 Dec 2023 23:11:50 -0500
Subject: bpf: Guard stack limits against 32bit overflow

This patch promotes the arithmetic around checking stack bounds to be
done in the 64-bit domain, instead of the current 32bit. The arithmetic
implies adding together a 64-bit register with a int offset. The
register was checked to be below 1<<29 when it was variable, but not
when it was fixed. The offset either comes from an instruction (in which
case it is 16 bit), from another register (in which case the caller
checked it to be below 1<<29 [1]), or from the size of an argument to a
kfunc (in which case it can be a u32 [2]). Between the register being
inconsistently checked to be below 1<<29, and the offset being up to an
u32, it appears that we were open to overflowing the `int`s which were
currently used for arithmetic.

[1] https://github.com/torvalds/linux/blob/815fb87b753055df2d9e50f6cd80eb10235fe3e9/kernel/bpf/verifier.c#L7494-L7498
[2] https://github.com/torvalds/linux/blob/815fb87b753055df2d9e50f6cd80eb10235fe3e9/kernel/bpf/verifier.c#L11904

Reported-by: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231207041150.229139-4-andreimatei1@gmail.com
---
 kernel/bpf/verifier.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 85e4ab61084f..0e77bb52542d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6577,7 +6577,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
  * The minimum valid offset is -MAX_BPF_STACK for writes, and
  * -state->allocated_stack for reads.
  */
-static int check_stack_slot_within_bounds(int off,
+static int check_stack_slot_within_bounds(s64 off,
 					  struct bpf_func_state *state,
 					  enum bpf_access_type t)
 {
@@ -6606,7 +6606,7 @@ static int check_stack_access_within_bounds(
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = regs + regno;
 	struct bpf_func_state *state = func(env, reg);
-	int min_off, max_off;
+	s64 min_off, max_off;
 	int err;
 	char *err_extra;
 
@@ -6619,7 +6619,7 @@ static int check_stack_access_within_bounds(
 		err_extra = " write to";
 
 	if (tnum_is_const(reg->var_off)) {
-		min_off = reg->var_off.value + off;
+		min_off = (s64)reg->var_off.value + off;
 		max_off = min_off + access_size;
 	} else {
 		if (reg->smax_value >= BPF_MAX_VAR_OFF ||
-- 
cgit v1.2.3


From 6b4a64bafd107e521c01eec3453ce94a3fb38529 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Thu, 7 Dec 2023 22:25:18 -0500
Subject: bpf: Fix accesses to uninit stack slots

Privileged programs are supposed to be able to read uninitialized stack
memory (ever since 6715df8d5) but, before this patch, these accesses
were permitted inconsistently. In particular, accesses were permitted
above state->allocated_stack, but not below it. In other words, if the
stack was already "large enough", the access was permitted, but
otherwise the access was rejected instead of being allowed to "grow the
stack". This undesired rejection was happening in two places:
- in check_stack_slot_within_bounds()
- in check_stack_range_initialized()
This patch arranges for these accesses to be permitted. A bunch of tests
that were relying on the old rejection had to change; all of them were
changed to add also run unprivileged, in which case the old behavior
persists. One tests couldn't be updated - global_func16 - because it
can't run unprivileged for other reasons.

This patch also fixes the tracking of the stack size for variable-offset
reads. This second fix is bundled in the same commit as the first one
because they're inter-related. Before this patch, writes to the stack
using registers containing a variable offset (as opposed to registers
with fixed, known values) were not properly contributing to the
function's needed stack size. As a result, it was possible for a program
to verify, but then to attempt to read out-of-bounds data at runtime
because a too small stack had been allocated for it.

Each function tracks the size of the stack it needs in
bpf_subprog_info.stack_depth, which is maintained by
update_stack_depth(). For regular memory accesses, check_mem_access()
was calling update_state_depth() but it was passing in only the fixed
part of the offset register, ignoring the variable offset. This was
incorrect; the minimum possible value of that register should be used
instead.

This tracking is now fixed by centralizing the tracking of stack size in
grow_stack_state(), and by lifting the calls to grow_stack_state() to
check_stack_access_within_bounds() as suggested by Andrii. The code is
now simpler and more convincingly tracks the correct maximum stack size.
check_stack_range_initialized() can now rely on enough stack having been
allocated for the access; this helps with the fix for the first issue.

A few tests were changed to also check the stack depth computation. The
one that fails without this patch is verifier_var_off:stack_write_priv_vs_unpriv.

Fixes: 01f810ace9ed3 ("bpf: Allow variable-offset stack access")
Reported-by: Hao Sun <sunhao.th@gmail.com>
Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231208032519.260451-3-andreimatei1@gmail.com

Closes: https://lore.kernel.org/bpf/CABWLsev9g8UP_c3a=1qbuZUi20tGoUXoU07FPf-5FLvhOKOY+Q@mail.gmail.com/
---
 kernel/bpf/verifier.c                              | 65 +++++++++-------------
 tools/testing/selftests/bpf/progs/iters.c          |  2 +-
 .../selftests/bpf/progs/test_global_func16.c       |  2 +-
 .../selftests/bpf/progs/verifier_basic_stack.c     |  8 +--
 .../testing/selftests/bpf/progs/verifier_int_ptr.c |  5 +-
 .../selftests/bpf/progs/verifier_raw_stack.c       |  5 +-
 .../testing/selftests/bpf/progs/verifier_var_off.c | 62 +++++++++++++++++----
 .../selftests/bpf/verifier/atomic_cmpxchg.c        | 11 ----
 tools/testing/selftests/bpf/verifier/calls.c       |  4 +-
 9 files changed, 92 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e77bb52542d..de1e29fa467e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1259,7 +1259,10 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n)
 	return 0;
 }
 
-static int grow_stack_state(struct bpf_func_state *state, int size)
+/* Possibly update state->allocated_stack to be at least size bytes. Also
+ * possibly update the function's high-water mark in its bpf_subprog_info.
+ */
+static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
 {
 	size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
 
@@ -1271,6 +1274,11 @@ static int grow_stack_state(struct bpf_func_state *state, int size)
 		return -ENOMEM;
 
 	state->allocated_stack = size;
+
+	/* update known max for given subprogram */
+	if (env->subprog_info[state->subprogno].stack_depth < size)
+		env->subprog_info[state->subprogno].stack_depth = size;
+
 	return 0;
 }
 
@@ -4440,9 +4448,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = NULL;
 	int insn_flags = insn_stack_access_flags(state->frameno, spi);
 
-	err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));
-	if (err)
-		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
 	 * so it's aligned access and [off, off + size) are within stack limits
 	 */
@@ -4595,10 +4600,6 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
 	    (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))
 		writing_zero = true;
 
-	err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));
-	if (err)
-		return err;
-
 	for (i = min_off; i < max_off; i++) {
 		int spi;
 
@@ -5774,20 +5775,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 					   strict);
 }
 
-static int update_stack_depth(struct bpf_verifier_env *env,
-			      const struct bpf_func_state *func,
-			      int off)
-{
-	u16 stack = env->subprog_info[func->subprogno].stack_depth;
-
-	if (stack >= -off)
-		return 0;
-
-	/* update known max for given subprogram */
-	env->subprog_info[func->subprogno].stack_depth = -off;
-	return 0;
-}
-
 /* starting from main bpf function walk all instructions of the function
  * and recursively walk all callees that given function can call.
  * Ignore jump and exit insns.
@@ -6577,13 +6564,14 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
  * The minimum valid offset is -MAX_BPF_STACK for writes, and
  * -state->allocated_stack for reads.
  */
-static int check_stack_slot_within_bounds(s64 off,
-					  struct bpf_func_state *state,
-					  enum bpf_access_type t)
+static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
+                                          s64 off,
+                                          struct bpf_func_state *state,
+                                          enum bpf_access_type t)
 {
 	int min_valid_off;
 
-	if (t == BPF_WRITE)
+	if (t == BPF_WRITE || env->allow_uninit_stack)
 		min_valid_off = -MAX_BPF_STACK;
 	else
 		min_valid_off = -state->allocated_stack;
@@ -6632,7 +6620,7 @@ static int check_stack_access_within_bounds(
 		max_off = reg->smax_value + off + access_size;
 	}
 
-	err = check_stack_slot_within_bounds(min_off, state, type);
+	err = check_stack_slot_within_bounds(env, min_off, state, type);
 	if (!err && max_off > 0)
 		err = -EINVAL; /* out of stack access into non-negative offsets */
 
@@ -6647,8 +6635,10 @@ static int check_stack_access_within_bounds(
 			verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n",
 				err_extra, regno, tn_buf, off, access_size);
 		}
+		return err;
 	}
-	return err;
+
+	return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE));
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -6663,7 +6653,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = regs + regno;
-	struct bpf_func_state *state;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -6806,11 +6795,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (err)
 			return err;
 
-		state = func(env, reg);
-		err = update_stack_depth(env, state, off);
-		if (err)
-			return err;
-
 		if (t == BPF_READ)
 			err = check_stack_read(env, regno, off, size,
 					       value_regno);
@@ -7004,7 +6988,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 
 /* When register 'regno' is used to read the stack (either directly or through
  * a helper function) make sure that it's within stack boundary and, depending
- * on the access type, that all elements of the stack are initialized.
+ * on the access type and privileges, that all elements of the stack are
+ * initialized.
  *
  * 'off' includes 'regno->off', but not its dynamic part (if any).
  *
@@ -7112,8 +7097,11 @@ static int check_stack_range_initialized(
 
 		slot = -i - 1;
 		spi = slot / BPF_REG_SIZE;
-		if (state->allocated_stack <= slot)
-			goto err;
+		if (state->allocated_stack <= slot) {
+			verbose(env, "verifier bug: allocated_stack too small");
+			return -EFAULT;
+		}
+
 		stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
 		if (*stype == STACK_MISC)
 			goto mark;
@@ -7137,7 +7125,6 @@ static int check_stack_range_initialized(
 			goto mark;
 		}
 
-err:
 		if (tnum_is_const(reg->var_off)) {
 			verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
 				err_extra, regno, min_off, i - min_off, access_size);
@@ -7162,7 +7149,7 @@ mark:
 		 * helper may write to the entire memory range.
 		 */
 	}
-	return update_stack_depth(env, state, min_off);
+	return 0;
 }
 
 static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index b2181f850d3e..3aca3dc145b5 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -846,7 +846,7 @@ __naked int delayed_precision_mark(void)
 		"call %[bpf_iter_num_next];"
 		"if r0 == 0 goto 2f;"
 		"if r6 != 42 goto 3f;"
-		"r7 = -32;"
+		"r7 = -33;"
 		"call %[bpf_get_prandom_u32];"
 		"r6 = r0;"
 		"goto 1b;\n"
diff --git a/tools/testing/selftests/bpf/progs/test_global_func16.c b/tools/testing/selftests/bpf/progs/test_global_func16.c
index e7206304632e..e3e64bc472cd 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func16.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func16.c
@@ -13,7 +13,7 @@ __noinline int foo(int (*arr)[10])
 }
 
 SEC("cgroup_skb/ingress")
-__failure __msg("invalid indirect read from stack")
+__success
 int global_func16(struct __sk_buff *skb)
 {
 	int array[10];
diff --git a/tools/testing/selftests/bpf/progs/verifier_basic_stack.c b/tools/testing/selftests/bpf/progs/verifier_basic_stack.c
index 359df865a8f3..8d77cc5323d3 100644
--- a/tools/testing/selftests/bpf/progs/verifier_basic_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_basic_stack.c
@@ -27,8 +27,8 @@ __naked void stack_out_of_bounds(void)
 
 SEC("socket")
 __description("uninitialized stack1")
-__failure __msg("invalid indirect read from stack")
-__failure_unpriv
+__success __log_level(4) __msg("stack depth 8")
+__failure_unpriv __msg_unpriv("invalid indirect read from stack")
 __naked void uninitialized_stack1(void)
 {
 	asm volatile ("					\
@@ -45,8 +45,8 @@ __naked void uninitialized_stack1(void)
 
 SEC("socket")
 __description("uninitialized stack2")
-__failure __msg("invalid read from stack")
-__failure_unpriv
+__success __log_level(4) __msg("stack depth 8")
+__failure_unpriv __msg_unpriv("invalid read from stack")
 __naked void uninitialized_stack2(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c
index 74d9cad469d9..9fc3fae5cd83 100644
--- a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c
+++ b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c
@@ -5,9 +5,10 @@
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
-SEC("cgroup/sysctl")
+SEC("socket")
 __description("ARG_PTR_TO_LONG uninitialized")
-__failure __msg("invalid indirect read from stack R4 off -16+0 size 8")
+__success
+__failure_unpriv __msg_unpriv("invalid indirect read from stack R4 off -16+0 size 8")
 __naked void arg_ptr_to_long_uninitialized(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c
index efbfc3a4ad6a..f67390224a9c 100644
--- a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c
@@ -5,9 +5,10 @@
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
-SEC("tc")
+SEC("socket")
 __description("raw_stack: no skb_load_bytes")
-__failure __msg("invalid read from stack R6 off=-8 size=8")
+__success
+__failure_unpriv __msg_unpriv("invalid read from stack R6 off=-8 size=8")
 __naked void stack_no_skb_load_bytes(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_var_off.c b/tools/testing/selftests/bpf/progs/verifier_var_off.c
index b7bdd7db3a35..c810f4f6f479 100644
--- a/tools/testing/selftests/bpf/progs/verifier_var_off.c
+++ b/tools/testing/selftests/bpf/progs/verifier_var_off.c
@@ -59,9 +59,10 @@ __naked void stack_read_priv_vs_unpriv(void)
 "	::: __clobber_all);
 }
 
-SEC("lwt_in")
+SEC("cgroup/skb")
 __description("variable-offset stack read, uninitialized")
-__failure __msg("invalid variable-offset read from stack R2")
+__success
+__failure_unpriv __msg_unpriv("R2 variable stack access prohibited for !root")
 __naked void variable_offset_stack_read_uninitialized(void)
 {
 	asm volatile ("					\
@@ -83,12 +84,55 @@ __naked void variable_offset_stack_read_uninitialized(void)
 
 SEC("socket")
 __description("variable-offset stack write, priv vs unpriv")
-__success __failure_unpriv
+__success
+/* Check that the maximum stack depth is correctly maintained according to the
+ * maximum possible variable offset.
+ */
+__log_level(4) __msg("stack depth 16")
+__failure_unpriv
 /* Variable stack access is rejected for unprivileged.
  */
 __msg_unpriv("R2 variable stack access prohibited for !root")
 __retval(0)
 __naked void stack_write_priv_vs_unpriv(void)
+{
+	asm volatile ("                               \
+	/* Get an unknown value */                    \
+	r2 = *(u32*)(r1 + 0);                         \
+	/* Make it small and 8-byte aligned */        \
+	r2 &= 8;                                      \
+	r2 -= 16;                                     \
+	/* Add it to fp. We now have either fp-8 or   \
+	 * fp-16, but we don't know which             \
+	 */                                           \
+	r2 += r10;                                    \
+	/* Dereference it for a stack write */        \
+	r0 = 0;                                       \
+	*(u64*)(r2 + 0) = r0;                         \
+	exit;                                         \
+"	::: __clobber_all);
+}
+
+/* Similar to the previous test, but this time also perform a read from the
+ * address written to with a variable offset. The read is allowed, showing that,
+ * after a variable-offset write, a priviledged program can read the slots that
+ * were in the range of that write (even if the verifier doesn't actually know if
+ * the slot being read was really written to or not.
+ *
+ * Despite this test being mostly a superset, the previous test is also kept for
+ * the sake of it checking the stack depth in the case where there is no read.
+ */
+SEC("socket")
+__description("variable-offset stack write followed by read")
+__success
+/* Check that the maximum stack depth is correctly maintained according to the
+ * maximum possible variable offset.
+ */
+__log_level(4) __msg("stack depth 16")
+__failure_unpriv
+__msg_unpriv("R2 variable stack access prohibited for !root")
+__retval(0)
+__naked void stack_write_followed_by_read(void)
 {
 	asm volatile ("					\
 	/* Get an unknown value */			\
@@ -103,12 +147,7 @@ __naked void stack_write_priv_vs_unpriv(void)
 	/* Dereference it for a stack write */		\
 	r0 = 0;						\
 	*(u64*)(r2 + 0) = r0;				\
-	/* Now read from the address we just wrote. This shows\
-	 * that, after a variable-offset write, a priviledged\
-	 * program can read the slots that were in the range of\
-	 * that write (even if the verifier doesn't actually know\
-	 * if the slot being read was really written to or not.\
-	 */						\
+	/* Now read from the address we just wrote. */ \
 	r3 = *(u64*)(r2 + 0);				\
 	r0 = 0;						\
 	exit;						\
@@ -282,9 +321,10 @@ __naked void access_min_out_of_bound(void)
 	: __clobber_all);
 }
 
-SEC("lwt_in")
+SEC("cgroup/skb")
 __description("indirect variable-offset stack access, min_off < min_initialized")
-__failure __msg("invalid indirect read from stack R2 var_off")
+__success
+__failure_unpriv __msg_unpriv("R2 variable stack access prohibited for !root")
 __naked void access_min_off_min_initialized(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
index 319337bdcfc8..9a7b1106fda8 100644
--- a/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
+++ b/tools/testing/selftests/bpf/verifier/atomic_cmpxchg.c
@@ -83,17 +83,6 @@
 	.result = REJECT,
 	.errstr = "!read_ok",
 },
-{
-	"Can't use cmpxchg on uninit memory",
-	.insns = {
-		BPF_MOV64_IMM(BPF_REG_0, 3),
-		BPF_MOV64_IMM(BPF_REG_2, 4),
-		BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_10, BPF_REG_2, -8),
-		BPF_EXIT_INSN(),
-	},
-	.result = REJECT,
-	.errstr = "invalid read from stack",
-},
 {
 	"BPF_W cmpxchg should zero top 32 bits",
 	.insns = {
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 3d5cd51071f0..ab25a81fd3a1 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -1505,7 +1505,9 @@
 	.prog_type = BPF_PROG_TYPE_XDP,
 	.fixup_map_hash_8b = { 23 },
 	.result = REJECT,
-	.errstr = "invalid read from stack R7 off=-16 size=8",
+	.errstr = "R0 invalid mem access 'scalar'",
+	.result_unpriv = REJECT,
+	.errstr_unpriv = "invalid read from stack R7 off=-16 size=8",
 },
 {
 	"calls: two calls that receive map_value via arg=ptr_stack_of_caller. test1",
-- 
cgit v1.2.3


From 2929bfac006d8f8e22b307d04e0d71bcb84db698 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Thu, 7 Dec 2023 22:25:19 -0500
Subject: bpf: Minor cleanup around stack bounds

Push the rounding up of stack offsets into the function responsible for
growing the stack, rather than relying on all the callers to do it.
Uncertainty about whether the callers did it or not tripped up people in
a previous review.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20231208032519.260451-4-andreimatei1@gmail.com
---
 kernel/bpf/verifier.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index de1e29fa467e..fb690539d5f6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1264,7 +1264,11 @@ static int resize_reference_state(struct bpf_func_state *state, size_t n)
  */
 static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int size)
 {
-	size_t old_n = state->allocated_stack / BPF_REG_SIZE, n = size / BPF_REG_SIZE;
+	size_t old_n = state->allocated_stack / BPF_REG_SIZE, n;
+
+	/* The stack size is always a multiple of BPF_REG_SIZE. */
+	size = round_up(size, BPF_REG_SIZE);
+	n = size / BPF_REG_SIZE;
 
 	if (old_n >= n)
 		return 0;
@@ -6638,7 +6642,10 @@ static int check_stack_access_within_bounds(
 		return err;
 	}
 
-	return grow_stack_state(env, state, round_up(-min_off, BPF_REG_SIZE));
+	/* Note that there is no stack access with offset zero, so the needed stack
+	 * size is -min_off, not -min_off+1.
+	 */
+	return grow_stack_state(env, state, -min_off /* size */);
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
-- 
cgit v1.2.3


From 73d9eb340d2b95e0e86a656a7f3157c137f10129 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Wed, 6 Dec 2023 11:53:24 +0000
Subject: bpf: Enable bpf_cgrp_storage for cgroup1 non-attach case

In the current cgroup1 environment, associating operations between cgroups
and applications in a BPF program requires storing a mapping of cgroup_id
to application either in a hash map or maintaining it in userspace.
However, by enabling bpf_cgrp_storage for cgroup1, it becomes possible to
conveniently store application-specific information in cgroup-local storage
and utilize it within BPF programs. Furthermore, enabling this feature for
cgroup1 involves minor modifications for the non-attach case, streamlining
the process.

However, when it comes to enabling this functionality for the cgroup1
attach case, it presents challenges. Therefore, the decision is to focus on
enabling it solely for the cgroup1 non-attach case at present. If
attempting to attach to a cgroup1 fd, the operation will simply fail with
the error code -EBADF.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231206115326.4295-2-laoar.shao@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/bpf_cgrp_storage.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d44fe8dd9732..28efd0a3f220 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -82,7 +82,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
 	int fd;
 
 	fd = *(int *)key;
-	cgroup = cgroup_get_from_fd(fd);
+	cgroup = cgroup_v1v2_get_from_fd(fd);
 	if (IS_ERR(cgroup))
 		return ERR_CAST(cgroup);
 
@@ -101,7 +101,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
 	int fd;
 
 	fd = *(int *)key;
-	cgroup = cgroup_get_from_fd(fd);
+	cgroup = cgroup_v1v2_get_from_fd(fd);
 	if (IS_ERR(cgroup))
 		return PTR_ERR(cgroup);
 
@@ -131,7 +131,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
 	int err, fd;
 
 	fd = *(int *)key;
-	cgroup = cgroup_get_from_fd(fd);
+	cgroup = cgroup_v1v2_get_from_fd(fd);
 	if (IS_ERR(cgroup))
 		return PTR_ERR(cgroup);
 
-- 
cgit v1.2.3


From c26f2a8901393c9f81909da0a4324587092bd3a3 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Dec 2023 18:23:49 +0800
Subject: bpf: Remove unnecessary wait from bpf_map_copy_value()

Both map_lookup_elem() and generic_map_lookup_batch() use
bpf_map_copy_value() to lookup and copy the value, and there is no
update operation in bpf_map_copy_value(), so just remove the invocation
of maybe_wait_bpf_programs() from it.

Fixes: 15c14a3dca42 ("bpf: Add bpf_map_{value_size, update_value, map_copy_value} functions")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231208102355.2628918-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index aff045eed375..9ad3f527ab37 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -264,7 +264,6 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
 	}
 
 	bpf_enable_instrumentation();
-	maybe_wait_bpf_programs(map);
 
 	return err;
 }
-- 
cgit v1.2.3


From 37ba5b59d6adfa08926acd3a833608487a18c2ef Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Dec 2023 18:23:50 +0800
Subject: bpf: Call maybe_wait_bpf_programs() only once for
 generic_map_update_batch()

Just like commit 9087c6ff8dfe ("bpf: Call maybe_wait_bpf_programs() only
once from generic_map_delete_batch()"), there is also no need to call
maybe_wait_bpf_programs() for each update in batched update, so only
call it once in generic_map_update_batch().

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231208102355.2628918-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9ad3f527ab37..07e671431987 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -203,7 +203,6 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 		rcu_read_unlock();
 	}
 	bpf_enable_instrumentation();
-	maybe_wait_bpf_programs(map);
 
 	return err;
 }
@@ -1577,6 +1576,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 	}
 
 	err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+	maybe_wait_bpf_programs(map);
 
 	kvfree(value);
 free_key:
@@ -1816,6 +1816,8 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 
 	kvfree(value);
 	kvfree(key);
+
+	maybe_wait_bpf_programs(map);
 	return err;
 }
 
-- 
cgit v1.2.3


From 012772581d040607ac1f981f47f6afd2336b4580 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Dec 2023 18:23:51 +0800
Subject: bpf: Add missed maybe_wait_bpf_programs() for htab of maps

When doing batched lookup and deletion operations on htab of maps,
maybe_wait_bpf_programs() is needed to ensure all programs don't use the
inner map after the bpf syscall returns.

Instead of adding the wait in __htab_map_lookup_and_delete_batch(),
adding the wait in bpf_map_do_batch() and also removing the calling of
maybe_wait_bpf_programs() from generic_map_{delete,update}_batch().

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231208102355.2628918-4-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 07e671431987..2e6ef361da1c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1758,7 +1758,6 @@ int generic_map_delete_batch(struct bpf_map *map,
 
 	kvfree(key);
 
-	maybe_wait_bpf_programs(map);
 	return err;
 }
 
@@ -1817,7 +1816,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 	kvfree(value);
 	kvfree(key);
 
-	maybe_wait_bpf_programs(map);
 	return err;
 }
 
@@ -5031,8 +5029,10 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	else
 		BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
 err_put:
-	if (has_write)
+	if (has_write) {
+		maybe_wait_bpf_programs(map);
 		bpf_map_write_active_dec(map);
+	}
 	fdput(f);
 	return err;
 }
-- 
cgit v1.2.3


From 67ad2c73ff29b32bd09135ec07c26e59490dbb3b Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Dec 2023 18:23:52 +0800
Subject: bpf: Only call maybe_wait_bpf_programs() when map operation succeeds

There is no need to call maybe_wait_bpf_programs() if update or deletion
operation fails. So only call maybe_wait_bpf_programs() if update or
deletion operation succeeds.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231208102355.2628918-5-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2e6ef361da1c..dd641475b65f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1576,7 +1576,8 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 	}
 
 	err = bpf_map_update_value(map, f.file, key, value, attr->flags);
-	maybe_wait_bpf_programs(map);
+	if (!err)
+		maybe_wait_bpf_programs(map);
 
 	kvfree(value);
 free_key:
@@ -1632,7 +1633,8 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
 	err = map->ops->map_delete_elem(map, key);
 	rcu_read_unlock();
 	bpf_enable_instrumentation();
-	maybe_wait_bpf_programs(map);
+	if (!err)
+		maybe_wait_bpf_programs(map);
 out:
 	kvfree(key);
 err_put:
-- 
cgit v1.2.3


From 06e5c999f10269a532304e89a6adb2fbfeb0593c Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 8 Dec 2023 18:23:53 +0800
Subject: bpf: Set uattr->batch.count as zero before batched update or deletion

generic_map_{delete,update}_batch() doesn't set uattr->batch.count as
zero before it tries to allocate memory for key. If the memory
allocation fails, the value of uattr->batch.count will be incorrect.

Fix it by setting uattr->batch.count as zero beore batched update or
deletion.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231208102355.2628918-6-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dd641475b65f..06320d9abf33 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1731,6 +1731,9 @@ int generic_map_delete_batch(struct bpf_map *map,
 	if (!max_count)
 		return 0;
 
+	if (put_user(0, &uattr->batch.count))
+		return -EFAULT;
+
 	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
 	if (!key)
 		return -ENOMEM;
@@ -1787,6 +1790,9 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 	if (!max_count)
 		return 0;
 
+	if (put_user(0, &uattr->batch.count))
+		return -EFAULT;
+
 	key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN);
 	if (!key)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 482d548d40b0af9af730e4869903d4433e44f014 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 8 Dec 2023 17:09:57 -0800
Subject: bpf: handle fake register spill to stack with BPF_ST_MEM instruction

When verifier validates BPF_ST_MEM instruction that stores known
constant to stack (e.g., *(u64 *)(r10 - 8) = 123), it effectively spills
a fake register with a constant (but initially imprecise) value to
a stack slot. Because read-side logic treats it as a proper register
fill from stack slot, we need to mark such stack slot initialization as
INSN_F_STACK_ACCESS instruction to stop precision backtracking from
missing it.

Fixes: 41f6f64e6999 ("bpf: support non-r10 register spill/fill to/from stack in precision tracking")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231209010958.66758-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fb690539d5f6..727a59e4a647 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4498,7 +4498,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		__mark_reg_known(&fake_reg, insn->imm);
 		fake_reg.type = SCALAR_VALUE;
 		save_register_state(env, state, spi, &fake_reg, size);
-		insn_flags = 0; /* not a register spill */
 	} else if (reg && is_spillable_regtype(reg->type)) {
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
-- 
cgit v1.2.3


From a6de18f310a511278c1ff16b96eb2d500eada725 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Thu, 7 Dec 2023 15:08:42 -0600
Subject: bpf: Add bpf_cpumask_weight() kfunc

It can be useful to query how many bits are set in a cpumask. For
example, if you want to perform special logic for the last remaining
core that's set in a mask. Let's therefore add a new
bpf_cpumask_weight() kfunc which checks how many bits are set in a mask.

Signed-off-by: David Vernet <void@manifault.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231207210843.168466-2-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/cpumasks.rst |  2 +-
 kernel/bpf/cpumask.c           | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/bpf/cpumasks.rst b/Documentation/bpf/cpumasks.rst
index a22b6ad105fb..b5d47a04da5d 100644
--- a/Documentation/bpf/cpumasks.rst
+++ b/Documentation/bpf/cpumasks.rst
@@ -352,7 +352,7 @@ can be used to query the contents of cpumasks.
 
 .. kernel-doc:: kernel/bpf/cpumask.c
    :identifiers: bpf_cpumask_first bpf_cpumask_first_zero bpf_cpumask_first_and
-                 bpf_cpumask_test_cpu
+                 bpf_cpumask_test_cpu bpf_cpumask_weight
 
 .. kernel-doc:: kernel/bpf/cpumask.c
    :identifiers: bpf_cpumask_equal bpf_cpumask_intersects bpf_cpumask_subset
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index e01c741e54e7..7499b7d8c06f 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -405,6 +405,17 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
 	return cpumask_any_and_distribute(src1, src2);
 }
 
+/**
+ * bpf_cpumask_weight() - Return the number of bits in @cpumask.
+ * @cpumask: The cpumask being queried.
+ *
+ * Count the number of set bits in the given cpumask.
+ */
+__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
+{
+	return cpumask_weight(cpumask);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_SET8_START(cpumask_kfunc_btf_ids)
@@ -432,6 +443,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
 BTF_SET8_END(cpumask_kfunc_btf_ids)
 
 static const struct btf_kfunc_id_set cpumask_kfunc_set = {
-- 
cgit v1.2.3


From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001
From: Peng Zhang <zhangpeng.00@bytedance.com>
Date: Fri, 27 Oct 2023 11:38:45 +0800
Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap()

In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then
directly replacing the entries of VMAs in the new maple tree can result in
better performance.  __mt_dup() uses DFS pre-order to duplicate the maple
tree, so it is efficient.

The average time complexity of __mt_dup() is O(n), where n is the number
of VMAs.  The proof of the time complexity is provided in the commit log
that introduces __mt_dup().  After duplicating the maple tree, each
element is traversed and replaced (ignoring the cases of deletion, which
are rare).  Since it is only a replacement operation for each element,
this process is also O(n).

Analyzing the exact time complexity of the previous algorithm is
challenging because each insertion can involve appending to a node,
pushing data to adjacent nodes, or even splitting nodes.  The frequency of
each action is difficult to calculate.  The worst-case scenario for a
single insertion is when the tree undergoes splitting at every level.  If
we consider each insertion as the worst-case scenario, we can determine
that the upper bound of the time complexity is O(n*log(n)), although this
is a loose upper bound.  However, based on the test data, it appears that
the actual time complexity is likely to be O(n).

As the entire maple tree is duplicated using __mt_dup(), if dup_mmap()
fails, there will be a portion of VMAs that have not been duplicated in
the maple tree.  To handle this, we mark the failure point with
XA_ZERO_ENTRY.  In exit_mmap(), if this marker is encountered, stop
releasing VMAs that have not been duplicated after this point.

There is a "spawn" in byte-unixbench[1], which can be used to test the
performance of fork().  I modified it slightly to make it work with
different number of VMAs.

Below are the test results.  The first row shows the number of VMAs.  The
second and third rows show the number of fork() calls per ten seconds,
corresponding to next-20231006 and the this patchset, respectively.  The
test results were obtained with CPU binding to avoid scheduler load
balancing that could cause unstable results.  There are still some
fluctuations in the test results, but at least they are better than the
original performance.

21     121   221    421    821    1621   3221   6421   12821  25621  51221
112100 76261 54227  34035  20195  11112  6017   3161   1606   802    393
114558 83067 65008  45824  28751  16072  8922   4747   2436   1233   599
2.19%  8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42%

[1] https://github.com/kdlucas/byte-unixbench/tree/master

Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
Suggested-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Mike Christie <michael.christie@oracle.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 11 +++++++++++
 kernel/fork.c      | 40 +++++++++++++++++++++++++++++-----------
 mm/internal.h      | 11 -----------
 mm/memory.c        |  7 ++++++-
 mm/mmap.c          |  9 ++++++---
 5 files changed, 52 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 418d26608ece..64cd1ee4aacc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
 	return mas_expected_entries(&vmi->mas, count);
 }
 
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /* Free any unused preallocations */
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..93924392a5c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -650,7 +650,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	int retval;
 	unsigned long charge = 0;
 	LIST_HEAD(uf);
-	VMA_ITERATOR(old_vmi, oldmm, 0);
 	VMA_ITERATOR(vmi, mm, 0);
 
 	uprobe_start_dup_mmap();
@@ -678,16 +677,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		goto out;
 	khugepaged_fork(mm, oldmm);
 
-	retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
-	if (retval)
+	/* Use __mt_dup() to efficiently build an identical maple tree. */
+	retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+	if (unlikely(retval))
 		goto out;
 
 	mt_clear_in_rcu(vmi.mas.tree);
-	for_each_vma(old_vmi, mpnt) {
+	for_each_vma(vmi, mpnt) {
 		struct file *file;
 
 		vma_start_write(mpnt);
 		if (mpnt->vm_flags & VM_DONTCOPY) {
+			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+						    mpnt->vm_end, GFP_KERNEL);
+			if (retval)
+				goto loop_out;
+
 			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
 			continue;
 		}
@@ -749,9 +754,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (is_vm_hugetlb_page(tmp))
 			hugetlb_dup_vma_private(tmp);
 
-		/* Link the vma into the MT */
-		if (vma_iter_bulk_store(&vmi, tmp))
-			goto fail_nomem_vmi_store;
+		/*
+		 * Link the vma into the MT. After using __mt_dup(), memory
+		 * allocation is not necessary here, so it cannot fail.
+		 */
+		vma_iter_bulk_store(&vmi, tmp);
 
 		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -760,15 +767,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
 
-		if (retval)
+		if (retval) {
+			mpnt = vma_next(&vmi);
 			goto loop_out;
+		}
 	}
 	/* a new mm has just been created */
 	retval = arch_dup_mmap(oldmm, mm);
 loop_out:
 	vma_iter_free(&vmi);
-	if (!retval)
+	if (!retval) {
 		mt_set_in_rcu(vmi.mas.tree);
+	} else if (mpnt) {
+		/*
+		 * The entire maple tree has already been duplicated. If the
+		 * mmap duplication fails, mark the failure point with
+		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+		 * stop releasing VMAs that have not been duplicated after this
+		 * point.
+		 */
+		mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+		mas_store(&vmi.mas, XA_ZERO_ENTRY);
+	}
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
@@ -778,8 +798,6 @@ fail_uprobe_end:
 	uprobe_end_dup_mmap();
 	return retval;
 
-fail_nomem_vmi_store:
-	unlink_anon_vmas(tmp);
 fail_nomem_anon_vma_fork:
 	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
diff --git a/mm/internal.h b/mm/internal.h
index b61034bd50f5..89a5a794d68f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1154,17 +1154,6 @@ static inline void vma_iter_clear(struct vma_iterator *vmi)
 	mas_store_prealloc(&vmi->mas, NULL);
 }
 
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
-			unsigned long start, unsigned long end, gfp_t gfp)
-{
-	__mas_set_range(&vmi->mas, start, end - 1);
-	mas_store_gfp(&vmi->mas, NULL, gfp);
-	if (unlikely(mas_is_err(&vmi->mas)))
-		return -ENOMEM;
-
-	return 0;
-}
-
 static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
 {
 	return mas_walk(&vmi->mas);
diff --git a/mm/memory.c b/mm/memory.c
index 5c757fba8858..a7025ed5c65b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -374,6 +374,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * be 0.  This will underflow and is okay.
 		 */
 		next = mas_find(mas, ceiling - 1);
+		if (unlikely(xa_is_zero(next)))
+			next = NULL;
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -395,6 +397,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = mas_find(mas, ceiling - 1);
+				if (unlikely(xa_is_zero(next)))
+					next = NULL;
 				if (mm_wr_locked)
 					vma_start_write(vma);
 				unlink_anon_vmas(vma);
@@ -1744,7 +1748,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		unmap_single_vma(tlb, vma, start, end, &details,
 				 mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
-	} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
+		vma = mas_find(mas, tree_end - 1);
+	} while (vma && likely(!xa_is_zero(vma)));
 	mmu_notifier_invalidate_range_end(&range);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 1971bfffcc03..4f1cb814586d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3294,10 +3294,11 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = mas_find(&mas, ULONG_MAX);
-	if (!vma) {
+	if (!vma || unlikely(xa_is_zero(vma))) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
-		return;
+		mmap_write_lock(mm);
+		goto destroy;
 	}
 
 	lru_add_drain();
@@ -3332,11 +3333,13 @@ void exit_mmap(struct mm_struct *mm)
 		remove_vma(vma, true);
 		count++;
 		cond_resched();
-	} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+		vma = mas_find(&mas, ULONG_MAX);
+	} while (vma && likely(!xa_is_zero(vma)));
 
 	BUG_ON(count != mm->map_count);
 
 	trace_exit_mmap(mm);
+destroy:
 	__mt_destroy(&mm->mm_mt);
 	mmap_write_unlock(mm);
 	vm_unacct_memory(nr_accounted);
-- 
cgit v1.2.3


From a9a1d6ad668f2ea0b5a77d0c4c7a41446d0801a8 Mon Sep 17 00:00:00 2001
From: Dongmin Lee <ldmldm05@gmail.com>
Date: Sat, 4 Nov 2023 20:33:20 +0900
Subject: kernel/reboot: explicitly notify if halt occurred instead of power
 off

When kernel_can_power_off() returns false, and reboot has called with
LINUX_REBOOT_CMD_POWER_OFF, kernel_halt() will be initiated instead of
actual power off function.

However, in this situation, Kernel never explicitly notifies user that
system halted instead of requested power off.

Since halt and power off perform different behavior, and user initiated
reboot call with power off command, not halt, This could be unintended
behavior to user, like this:

~ # poweroff -f
[    3.581482] reboot: System halted

Therefore, this explicitly notifies user that poweroff is not available,
and halting has been occured as an alternative behavior instead:

~ # poweroff -f
[    4.123668] reboot: Power off not available: System halted instead

[akpm@linux-foundation.org: tweak comment text]
Link: https://lkml.kernel.org/r/20231104113320.72440-1-ldmldm05@gmail.com
Signed-off-by: Dongmin Lee <ldmldm05@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/reboot.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/reboot.c b/kernel/reboot.c
index 395a0ea3c7a8..c3a3b82c4f64 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -58,6 +58,14 @@ struct sys_off_handler {
 	struct device *dev;
 };
 
+/*
+ * This variable is used to indicate if a halt was initiated instead of a
+ * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but
+ * the system cannot be powered off. This allowes kernel_halt() to notify users
+ * of that.
+ */
+static bool poweroff_fallback_to_halt;
+
 /*
  * Temporary stub that prevents linkage failure while we're in process
  * of removing all uses of legacy pm_power_off() around the kernel.
@@ -297,7 +305,10 @@ void kernel_halt(void)
 	kernel_shutdown_prepare(SYSTEM_HALT);
 	migrate_to_reboot_cpu();
 	syscore_shutdown();
-	pr_emerg("System halted\n");
+	if (poweroff_fallback_to_halt)
+		pr_emerg("Power off not available: System halted instead\n");
+	else
+		pr_emerg("System halted\n");
 	kmsg_dump(KMSG_DUMP_SHUTDOWN);
 	machine_halt();
 }
@@ -732,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
 	/* Instead of trying to make the power_off code look like
 	 * halt when pm_power_off is not set do it the easy way.
 	 */
-	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
+	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) {
+		poweroff_fallback_to_halt = true;
 		cmd = LINUX_REBOOT_CMD_HALT;
+	}
 
 	mutex_lock(&system_transition_mutex);
 	switch (cmd) {
-- 
cgit v1.2.3


From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 30 Oct 2023 16:57:10 +0100
Subject: introduce for_other_threads(p, t)

Cosmetic, but imho it makes the usage look more clear and simple, the new
helper doesn't require to initialize "t".

After this change while_each_thread() has only 3 users, and it is only
used in the do/while loops.

Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c                    |  3 +--
 include/linux/sched/signal.h |  3 +++
 kernel/signal.c              | 11 ++++-------
 3 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..ee43597cb453 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1578,11 +1578,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * will be able to manipulate the current directory, etc.
 	 * It would be nice to force an unshare instead...
 	 */
-	t = p;
 	n_fs = 1;
 	spin_lock(&p->fs->lock);
 	rcu_read_lock();
-	while_each_thread(p, t) {
+	for_other_threads(p, t) {
 		if (t->fs == p->fs)
 			n_fs++;
 	}
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..41d6759d6a4a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void);
 #define while_each_thread(g, t) \
 	while ((t = next_thread(t)) != g)
 
+#define for_other_threads(p, t)	\
+	for (t = p; (t = next_thread(t)) != p; )
+
 #define __for_each_thread(signal, t)	\
 	list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
 		lockdep_is_held(&tasklist_lock))
diff --git a/kernel/signal.c b/kernel/signal.c
index 47a7602dfe8d..5aa216e841a2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1376,12 +1376,12 @@ int force_sig_info(struct kernel_siginfo *info)
  */
 int zap_other_threads(struct task_struct *p)
 {
-	struct task_struct *t = p;
+	struct task_struct *t;
 	int count = 0;
 
 	p->signal->group_stop_count = 0;
 
-	while_each_thread(p, t) {
+	for_other_threads(p, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
 		/* Don't require de_thread to wait for the vhost_worker */
 		if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)
@@ -2465,12 +2465,10 @@ static bool do_signal_stop(int signr)
 			sig->group_exit_code = signr;
 
 		sig->group_stop_count = 0;
-
 		if (task_set_jobctl_pending(current, signr | gstop))
 			sig->group_stop_count++;
 
-		t = current;
-		while_each_thread(current, t) {
+		for_other_threads(current, t) {
 			/*
 			 * Setting state to TASK_STOPPED for a group
 			 * stop is always done with the siglock held,
@@ -2966,8 +2964,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
 	if (sigisemptyset(&retarget))
 		return;
 
-	t = tsk;
-	while_each_thread(tsk, t) {
+	for_other_threads(tsk, t) {
 		if (t->flags & PF_EXITING)
 			continue;
 
-- 
cgit v1.2.3


From f72709ab69430d986dfc5a08c9a86f625e3fed33 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Thu, 16 Nov 2023 14:36:36 +0100
Subject: arch: remove ARCH_THREAD_STACK_ALLOCATOR

Patch series "Remove unused code after IA-64 removal".

While looking into something different I noticed that there are a couple
of Kconfig options which were only selected by IA-64 and which are now
unused.

So remove them and simplify the code a bit.


This patch (of 3):

IA-64 was the only architecture which selected ARCH_THREAD_STACK_ALLOCATOR.
IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64)
architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR as
well.

Link: https://lkml.kernel.org/r/20231116133638.1636277-1-hca@linux.ibm.com
Link: https://lkml.kernel.org/r/20231116133638.1636277-2-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig  |  4 ----
 kernel/fork.c | 20 --------------------
 2 files changed, 24 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..310162b41a1c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -320,10 +320,6 @@ config HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	  should be implemented. Without this, the entire thread_struct
 	  field in task_struct will be left whitelisted.
 
-# Select if arch has its private alloc_thread_stack() function
-config ARCH_THREAD_STACK_ALLOCATOR
-	bool
-
 # Select if arch wants to size task_struct dynamically via arch_task_struct_size:
 config ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..d071809866e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -179,8 +179,6 @@ static inline void free_task_struct(struct task_struct *tsk)
 }
 #endif
 
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
-
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
@@ -412,24 +410,6 @@ void thread_stack_cache_init(void)
 }
 
 # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
-	unsigned long *stack;
-
-	stack = arch_alloc_thread_stack_node(tsk, node);
-	tsk->stack = stack;
-	return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
-	arch_free_thread_stack(tsk);
-	tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
-- 
cgit v1.2.3


From 3888750e21ccb909051c810cc79fcc0650a740f8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Thu, 16 Nov 2023 14:36:37 +0100
Subject: arch: remove ARCH_TASK_STRUCT_ALLOCATOR

IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ALLOCATOR.
IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64)
architecture"). Therefore remove support for ARCH_THREAD_STACK_ALLOCATOR
as well.

Link: https://lkml.kernel.org/r/20231116133638.1636277-3-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig  | 5 -----
 kernel/fork.c | 6 ------
 2 files changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 310162b41a1c..c2f87ef9f0ae 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -305,13 +305,8 @@ config ARCH_HAS_CPU_FINALIZE_INIT
 config ARCH_TASK_STRUCT_ON_STACK
 	bool
 
-# Select if arch has its private alloc_task_struct() function
-config ARCH_TASK_STRUCT_ALLOCATOR
-	bool
-
 config HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	bool
-	depends on !ARCH_TASK_STRUCT_ALLOCATOR
 	help
 	  An architecture should select this to provide hardened usercopy
 	  knowledge about what region of the thread_struct should be
diff --git a/kernel/fork.c b/kernel/fork.c
index d071809866e0..ce8a4b8c04e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,7 +165,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
 {
 }
 
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 static struct kmem_cache *task_struct_cachep;
 
 static inline struct task_struct *alloc_task_struct_node(int node)
@@ -177,7 +176,6 @@ static inline void free_task_struct(struct task_struct *tsk)
 {
 	kmem_cache_free(task_struct_cachep, tsk);
 }
-#endif
 
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -1001,7 +999,6 @@ static void set_max_threads(unsigned int max_threads_suggested)
 int arch_task_struct_size __read_mostly;
 #endif
 
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
 {
 	/* Fetch thread_struct whitelist for the architecture. */
@@ -1016,12 +1013,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
 	else
 		*offset += offsetof(struct task_struct, thread);
 }
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
 
 void __init fork_init(void)
 {
 	int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	0
 #endif
@@ -1034,7 +1029,6 @@ void __init fork_init(void)
 			arch_task_struct_size, align,
 			SLAB_PANIC|SLAB_ACCOUNT,
 			useroffset, usersize, NULL);
-#endif
 
 	/* do the arch specific task caches init */
 	arch_task_cache_init();
-- 
cgit v1.2.3


From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 20 Nov 2023 16:16:49 +0100
Subject: kernel/signal.c: simplify force_sig_info_to_task(), kill
 recalc_sigpending_and_wake()

The purpose of recalc_sigpending_and_wake() is not clear, it looks
"obviously unneeded" because we are going to send the signal which can't
be blocked or ignored.

Add the comment to explain why we can't rely on send_signal_locked() and
make this logic more simple/explicit.  recalc_sigpending_and_wake() has no
other users, it can die.

In fact I think we don't even need signal_wake_up(), the target task must
be either current or a TASK_TRACED child, otherwise the usage of siglock
is not safe.  But this needs another change.

Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/signal.h |  1 -
 kernel/signal.c              | 17 ++++-------------
 2 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 41d6759d6a4a..015c0e3a3e1d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags,
  * This is required every time the blocked sigset_t changes.
  * callers must hold sighand->siglock.
  */
-extern void recalc_sigpending_and_wake(struct task_struct *t);
 extern void recalc_sigpending(void);
 extern void calculate_sigpending(void);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 5aa216e841a2..c9c57d053ce4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t)
 	return false;
 }
 
-/*
- * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
- * This is superfluous when called on current, the wakeup is a harmless no-op.
- */
-void recalc_sigpending_and_wake(struct task_struct *t)
-{
-	if (recalc_sigpending_tsk(t))
-		signal_wake_up(t, 0);
-}
-
 void recalc_sigpending(void)
 {
 	if (!recalc_sigpending_tsk(current) && !freezing(current))
@@ -1348,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 		action->sa.sa_handler = SIG_DFL;
 		if (handler == HANDLER_EXIT)
 			action->sa.sa_flags |= SA_IMMUTABLE;
-		if (blocked) {
+		if (blocked)
 			sigdelset(&t->blocked, sig);
-			recalc_sigpending_and_wake(t);
-		}
 	}
 	/*
 	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
@@ -1361,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
 	    (!t->ptrace || (handler == HANDLER_EXIT)))
 		t->signal->flags &= ~SIGNAL_UNKILLABLE;
 	ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
+	/* This can happen if the signal was already pending and blocked */
+	if (!task_sigpending(t))
+		signal_wake_up(t, 0);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 
 	return ret;
-- 
cgit v1.2.3


From 27bbb2a0fddf70e185e800cd78f0142d45330c6c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 21 Nov 2023 17:26:50 +0100
Subject: __ptrace_unlink: kill the obsolete "FIXME" code

The corner case described by the comment is no longer possible after the
commit 7b3c36fc4c23 ("ptrace: fix task_join_group_stop() for the case when
current is traced"), task_join_group_stop() ensures that the new thread
has the correct signr in JOBCTL_STOP_SIGMASK regardless of ptrace.

Link: https://lkml.kernel.org/r/20231121162650.GA6635@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/ptrace.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..3617213c3d8a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child)
 	 */
 	if (!(child->flags & PF_EXITING) &&
 	    (child->signal->flags & SIGNAL_STOP_STOPPED ||
-	     child->signal->group_stop_count)) {
+	     child->signal->group_stop_count))
 		child->jobctl |= JOBCTL_STOP_PENDING;
 
-		/*
-		 * This is only possible if this thread was cloned by the
-		 * traced task running in the stopped group, set the signal
-		 * for the future reports.
-		 * FIXME: we should change ptrace_init_task() to handle this
-		 * case.
-		 */
-		if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
-			child->jobctl |= SIGSTOP;
-	}
-
 	/*
 	 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
 	 * @child in the butt.  Note that @resume should be used iff @child
-- 
cgit v1.2.3


From 0311d8272406b2ec47f485bef887723cc352a489 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Tue, 14 Nov 2023 17:12:01 +0100
Subject: kexec: use atomic_try_cmpxchg in crash_kexec

Use atomic_try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in
crash_kexec().  x86 CMPXCHG instruction returns success in ZF flag,
so this change saves a compare after cmpxchg.

No functional change intended.

Link: https://lkml.kernel.org/r/20231114161228.108516-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index be5642a4ec49..bc4c096ab1f3 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1063,9 +1063,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
 	 * panic().  Otherwise parallel calls of panic() and crash_kexec()
 	 * may stop each other.  To exclude them, we use panic_cpu here too.
 	 */
+	old_cpu = PANIC_CPU_INVALID;
 	this_cpu = raw_smp_processor_id();
-	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
-	if (old_cpu == PANIC_CPU_INVALID) {
+
+	if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
 		/* This is the 1st CPU which comes here, so go ahead. */
 		__crash_kexec(regs);
 
-- 
cgit v1.2.3


From b1c3efe07987592c16d5f59ce235e6ddbea65a73 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Nov 2023 12:05:03 +0100
Subject: sched: fair: move unused stub functions to header

These four functions have a normal definition for CONFIG_FAIR_GROUP_SCHED,
and empty one that is only referenced when FAIR_GROUP_SCHED is disabled
but CGROUP_SCHED is still enabled.  If both are turned off, the functions
are still defined but the misisng prototype causes a W=1 warning:

kernel/sched/fair.c:12544:6: error: no previous prototype for 'free_fair_sched_group'
kernel/sched/fair.c:12546:5: error: no previous prototype for 'alloc_fair_sched_group'
kernel/sched/fair.c:12553:6: error: no previous prototype for 'online_fair_sched_group'
kernel/sched/fair.c:12555:6: error: no previous prototype for 'unregister_fair_sched_group'

Move the alternatives into the header as static inline functions with the
correct combination of #ifdef checks to avoid the warning without adding
even more complexity.

[A different patch with the same description got applied by accident
 and was later reverted, but the original patch is still missing]

Link: https://lkml.kernel.org/r/20231123110506.707903-4-arnd@kernel.org
Fixes: 7aa55f2a5902 ("sched/fair: Move unused stub functions to header")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@rivosinc.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Tudor Ambarus <tudor.ambarus@linaro.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sched/fair.c  | 13 -------------
 kernel/sched/sched.h | 11 +++++++++++
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..835a40eac462 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13036,19 +13036,6 @@ next_cpu:
 	return 0;
 }
 
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-void free_fair_sched_group(struct task_group *tg) { }
-
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
-	return 1;
-}
-
-void online_fair_sched_group(struct task_group *tg) { }
-
-void unregister_fair_sched_group(struct task_group *tg) { }
-
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e5a95486a42..8f5df5250b8d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -436,10 +436,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 
 extern int tg_nop(struct task_group *tg, void *data);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 extern void free_fair_sched_group(struct task_group *tg);
 extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
 extern void online_fair_sched_group(struct task_group *tg);
 extern void unregister_fair_sched_group(struct task_group *tg);
+#else
+static inline void free_fair_sched_group(struct task_group *tg) { }
+static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+       return 1;
+}
+static inline void online_fair_sched_group(struct task_group *tg) { }
+static inline void unregister_fair_sched_group(struct task_group *tg) { }
+#endif
+
 extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 			struct sched_entity *se, int cpu,
 			struct sched_entity *parent);
-- 
cgit v1.2.3


From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 15 Nov 2023 21:00:27 +0800
Subject: resource: add walk_system_ram_res_rev()

This function, being a variant of walk_system_ram_res() introduced in
commit 8c86e70acead ("resource: provide new functions to walk through
resources"), walks through a list of all the resources of System RAM in
reversed order, i.e., from higher to lower.

It will be used in kexec_file code to load kernel, initrd etc when
preparing kexec reboot.

Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv
Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c      | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 14f5cfabbbc8..db7fe25f3370 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -331,6 +331,9 @@ extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
 		    int (*func)(struct resource *, void *));
 extern int
+walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+			int (*func)(struct resource *, void *));
+extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
 		    void *arg, int (*func)(struct resource *, void *));
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 866ef3663a0b..e8a244300e5b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -27,6 +27,8 @@
 #include <linux/mount.h>
 #include <linux/resource_ext.h>
 #include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 
 
@@ -429,6 +431,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 				     func);
 }
 
+/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+				int (*func)(struct resource *, void *))
+{
+	struct resource res, *rams;
+	int rams_size = 16, i;
+	unsigned long flags;
+	int ret = -1;
+
+	/* create a list */
+	rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+	if (!rams)
+		return ret;
+
+	flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+	i = 0;
+	while ((start < end) &&
+		(!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+		if (i >= rams_size) {
+			/* re-alloc */
+			struct resource *rams_new;
+
+			rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+					     (rams_size + 16) * sizeof(struct resource),
+					     GFP_KERNEL);
+			if (!rams_new)
+				goto out;
+
+			rams = rams_new;
+			rams_size += 16;
+		}
+
+		rams[i].start = res.start;
+		rams[i++].end = res.end;
+
+		start = res.end + 1;
+	}
+
+	/* go reverse */
+	for (i--; i >= 0; i--) {
+		ret = (*func)(&rams[i], arg);
+		if (ret)
+			break;
+	}
+
+out:
+	kvfree(rams);
+	return ret;
+}
+
 /*
  * This function calls the @func callback against all memory ranges, which
  * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
-- 
cgit v1.2.3


From b3ba234171cd0d58df0a13c262210ff8b5fd2830 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 14 Nov 2023 17:16:58 +0800
Subject: kexec_file: load kernel at top of system RAM if required

Patch series "kexec_file: Load kernel at top of system RAM if required".

Justification:
==============

Kexec_load interface has been doing top down searching and loading
kernel/initrd/purgtory etc to prepare for kexec reboot.  In that way, the
benefits are that it avoids to consume and fragment limited low memory
which satisfy DMA buffer allocation and big chunk of continuous memory
during system init; and avoids to stir with BIOS/FW reserved or occupied
areas, or corner case handling/work around/quirk occupied areas when doing
system init.  By the way, the top-down searching and loading of kexec-ed
kernel is done in user space utility code.

For kexec_file loading, even if kexec_buf.top_down is 'true', it's simply
ignored.  It calls walk_system_ram_res() directly to go through all
resources of System RAM bottom up, to find an available memory region,
then call locate_mem_hole_callback() to allocate memory in that found
memory region from top to down.  This is not expected and inconsistent
with kexec_load.

Implementation
===============

In patch 1, introduce a new function walk_system_ram_res_rev() which is a
variant of walk_system_ram_res(), it walks through a list of all the
resources of System RAM in reversed order, i.e., from higher to lower.

In patch 2, check if kexec_buf.top_down is 'true' in
kexec_walk_resources(), if yes, call walk_system_ram_res_rev() to find
memory region of system RAM from top to down to load kernel/initrd etc.

Background information: ======================= And I ever tried this in
the past in a different way, please see below link.  In the post, I tried
to adjust struct sibling linking code, replace the the singly linked list
with list_head so that walk_system_ram_res_rev() can be implemented in a
much easier way.  Finally I failed.
https://lore.kernel.org/all/20180718024944.577-4-bhe@redhat.com/

This time, I picked up the patch from AKASHI Takahiro's old post and made
some change to take as the current patch 1:
https://lists.infradead.org/pipermail/linux-arm-kernel/2017-September/531456.html


This patch (of 2):

Kexec_load interface has been doing top down searching and loading
kernel/initrd/purgtory etc to prepare for kexec reboot.  In that way, the
benefits are that it avoids to consume and fragment limited low memory
which satisfy DMA buffer allocation and big chunk of continuous memory
during system init; and avoids to stir with BIOS/FW reserved or occupied
areas, or corner case handling/work around/quirk occupied areas when doing
system init.  By the way, the top-down searching and loading of kexec-ed
kernel is done in user space utility code.

For kexec_file loading, even if kexec_buf.top_down is 'true', it's simply
ignored.  It calls walk_system_ram_res() directly to go through all
resources of System RAM bottom up, to find an available memory region,
then call locate_mem_hole_callback() to allocate memory in that found
memory region from top to down.  This is not expected and inconsistent
with kexec_load.

Here check if kexec_buf.top_down is 'true' in kexec_walk_resources(), if
yes, call the newly added walk_system_ram_res_rev() to find memory region
of system RAM from top to down to load kernel/initrd etc.

Link: https://lkml.kernel.org/r/20231114091658.228030-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20231114091658.228030-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_file.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f9a419cd22d4..ba3ef30921b8 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -592,6 +592,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
 					   IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
 					   crashk_res.start, crashk_res.end,
 					   kbuf, func);
+	else if (kbuf->top_down)
+		return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
 	else
 		return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
 }
-- 
cgit v1.2.3


From 9d02330abd3ecdacc43fc6b16a5668e7e94b7562 Mon Sep 17 00:00:00 2001
From: Li Zhe <lizhe.67@bytedance.com>
Date: Thu, 23 Nov 2023 16:40:22 +0800
Subject: softlockup: serialized softlockup's log

If multiple CPUs trigger softlockup at the same time with
'softlockup_all_cpu_backtrace=0', the softlockup's logs will appear
staggeredly in dmesg, which will affect the viewing of the logs for
developer.  Since the code path for outputting softlockup logs is not a
kernel hotspot and the performance requirements for the code are not
strict, locks are used to serialize the softlockup log output to improve
the readability of the logs.

Link: https://lkml.kernel.org/r/20231123084022.10302-1-lizhe.67@bytedance.com
Signed-off-by: Li Zhe <lizhe.67@bytedance.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Cc: Lecopzer Chen <lecopzer.chen@mediatek.com>
Cc: Pingfan Liu <kernelfans@gmail.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/watchdog.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5cd6d4e26915..bf30a6fac665 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -448,6 +448,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	struct pt_regs *regs = get_irq_regs();
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+	static DEFINE_SPINLOCK(watchdog_output_lock);
 
 	if (!watchdog_enabled)
 		return HRTIMER_NORESTART;
@@ -514,6 +515,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		/* Start period for the next softlockup warning. */
 		update_report_ts();
 
+		spin_lock(&watchdog_output_lock);
 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -523,6 +525,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 			show_regs(regs);
 		else
 			dump_stack();
+		spin_unlock(&watchdog_output_lock);
 
 		if (softlockup_all_cpu_backtrace) {
 			trigger_allbutcpu_cpu_backtrace(smp_processor_id());
-- 
cgit v1.2.3


From 18966f7b9458d3b19412fe9dfb421ab59401bfe1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 11 Oct 2023 09:45:54 -0700
Subject: rcu-tasks: Mark RCU Tasks accesses to current->rcu_tasks_idle_cpu

The task_struct structure's ->rcu_tasks_idle_cpu can be concurrently
read and written from the RCU Tasks grace-period kthread and from the
CPU on which the task_struct structure's task is running.  This commit
therefore marks the accesses appropriately.

Reported-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/tasks.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index f54d5782eca0..732ad5b39946 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -975,7 +975,7 @@ static void check_holdout_task(struct task_struct *t,
 	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
 	    !rcu_tasks_is_holdout(t) ||
 	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
-	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+	     !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) {
 		WRITE_ONCE(t->rcu_tasks_holdout, false);
 		list_del_init(&t->rcu_tasks_holdout_list);
 		put_task_struct(t);
@@ -993,7 +993,7 @@ static void check_holdout_task(struct task_struct *t,
 		 t, ".I"[is_idle_task(t)],
 		 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
 		 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
-		 t->rcu_tasks_idle_cpu, cpu);
+		 data_race(t->rcu_tasks_idle_cpu), cpu);
 	sched_show_task(t);
 }
 
-- 
cgit v1.2.3


From 4e58aaeebb3c27993c734c99eae6881b196b1ddb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 1 Nov 2023 18:28:38 -0700
Subject: rcu: Restrict access to RCU CPU stall notifiers

Although the RCU CPU stall notifiers can be useful for dumping state when
tracking down delicate forward-progress bugs where NUMA effects cause
cache lines to be delivered to a given CPU regularly, but always in a
state that prevents that CPU from making forward progress.  These bugs can
be detected by the RCU CPU stall-warning mechanism, but in some cases,
the stall-warnings printk()s disrupt the forward-progress bug before
any useful state can be obtained.

Unfortunately, the notifier mechanism added by commit 5b404fdabacf ("rcu:
Add RCU CPU stall notifier") can make matters worse if used at all
carelessly. For example, if the stall warning was caused by a lock not
being released, then any attempt to acquire that lock in the notifier
will hang. This will prevent not only the notifier from producing any
useful output, but it will also prevent the stall-warning message from
ever appearing.

This commit therefore hides this new RCU CPU stall notifier
mechanism under a new RCU_CPU_STALL_NOTIFIER Kconfig option that
depends on both DEBUG_KERNEL and RCU_EXPERT.  In addition, the
rcupdate.rcu_cpu_stall_notifiers=1 kernel boot parameter must also
be specified.  The RCU_CPU_STALL_NOTIFIER Kconfig option's help text
contains a warning and explains the dangers of careless use, recommending
lockless notifier code.  In addition, a WARN() is triggered each time
that an attempt is made to register a stall-warning notifier in kernels
built with CONFIG_RCU_CPU_STALL_NOTIFIER=y.

This combination of measures will keep use of this mechanism confined to
debug kernels and away from routine deployments.

[ paulmck: Apply Dan Carpenter feedback. ]

Fixes: 5b404fdabacf ("rcu: Add RCU CPU stall notifier")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  6 ++++++
 include/linux/rcu_notifier.h                    |  6 +++---
 kernel/rcu/Kconfig.debug                        | 25 +++++++++++++++++++++++++
 kernel/rcu/rcu.h                                |  8 +++++---
 kernel/rcu/rcutorture.c                         | 12 +++++++-----
 kernel/rcu/tree_stall.h                         | 11 ++++++++++-
 kernel/rcu/update.c                             |  6 ++++++
 7 files changed, 62 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..b72e2049c487 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5302,6 +5302,12 @@
 			Dump ftrace buffer after reporting RCU CPU
 			stall warning.
 
+	rcupdate.rcu_cpu_stall_notifiers= [KNL]
+			Provide RCU CPU stall notifiers, but see the
+			warnings in the RCU_CPU_STALL_NOTIFIER Kconfig
+			option's help text.  TL;DR:  You almost certainly
+			do not want rcupdate.rcu_cpu_stall_notifiers.
+
 	rcupdate.rcu_cpu_stall_suppress= [KNL]
 			Suppress RCU CPU stall warning messages.
 
diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h
index ebf371364581..5640f024773b 100644
--- a/include/linux/rcu_notifier.h
+++ b/include/linux/rcu_notifier.h
@@ -13,7 +13,7 @@
 #define RCU_STALL_NOTIFY_NORM	1
 #define RCU_STALL_NOTIFY_EXP	2
 
-#ifdef CONFIG_RCU_STALL_COMMON
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #include <linux/notifier.h>
 #include <linux/types.h>
@@ -21,12 +21,12 @@
 int rcu_stall_chain_notifier_register(struct notifier_block *n);
 int rcu_stall_chain_notifier_unregister(struct notifier_block *n);
 
-#else // #ifdef CONFIG_RCU_STALL_COMMON
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 // No RCU CPU stall warnings in Tiny RCU.
 static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; }
 static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; }
 
-#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #endif /* __LINUX_RCU_NOTIFIER_H */
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 2984de629f74..9b0b52e1836f 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -105,6 +105,31 @@ config RCU_CPU_STALL_CPUTIME
 	  The boot option rcupdate.rcu_cpu_stall_cputime has the same function
 	  as this one, but will override this if it exists.
 
+config RCU_CPU_STALL_NOTIFIER
+	bool "Provide RCU CPU-stall notifiers"
+	depends on RCU_STALL_COMMON
+	depends on DEBUG_KERNEL
+	depends on RCU_EXPERT
+	default n
+	help
+	  WARNING:  You almost certainly do not want this!!!
+
+	  Enable RCU CPU-stall notifiers, which are invoked just before
+	  printing the RCU CPU stall warning.  As such, bugs in notifier
+	  callbacks can prevent stall warnings from being printed.
+	  And the whole reason that a stall warning is being printed is
+	  that something is hung up somewhere.	Therefore, the notifier
+	  callbacks must be written extremely carefully, preferably
+	  containing only lockless code.  After all, it is quite possible
+	  that the whole reason that the RCU CPU stall is happening in
+	  the first place is that someone forgot to release whatever lock
+	  that you are thinking of acquiring.  In which case, having your
+	  notifier callback acquire that lock will hang, preventing the
+	  RCU CPU stall warning from appearing.
+
+	  Say Y here if you want RCU CPU stall notifiers (you don't want them)
+	  Say N if you are unsure.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU"
 	depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index b531c33e9545..f94f65877f2b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -262,6 +262,8 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
 	return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
 }
 
+extern int rcu_cpu_stall_notifiers;
+
 #ifdef CONFIG_RCU_STALL_COMMON
 
 extern int rcu_cpu_stall_ftrace_dump;
@@ -659,10 +661,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
 bool rcu_cpu_beenfullyonline(int cpu);
 #endif
 
-#ifdef CONFIG_RCU_STALL_COMMON
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 int rcu_stall_notifier_call_chain(unsigned long val, void *v);
-#else // #ifdef CONFIG_RCU_STALL_COMMON
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
-#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
 
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30fc9d34e329..07a6a183c555 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2450,10 +2450,12 @@ static int rcu_torture_stall(void *args)
 	unsigned long stop_at;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
-	ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
-	if (ret)
-		pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
-			__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+	if (rcu_cpu_stall_notifiers) {
+		ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+		if (ret)
+			pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+				__func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+	}
 	if (stall_cpu_holdoff > 0) {
 		VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
 		schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2497,7 +2499,7 @@ static int rcu_torture_stall(void *args)
 		cur_ops->readunlock(idx);
 	}
 	pr_alert("%s end.\n", __func__);
-	if (!ret) {
+	if (rcu_cpu_stall_notifiers && !ret) {
 		ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
 		if (ret)
 			pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index ac8e86babe44..5d666428546b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -1061,6 +1061,7 @@ static int __init rcu_sysrq_init(void)
 }
 early_initcall(rcu_sysrq_init);
 
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
 
 //////////////////////////////////////////////////////////////////////////////
 //
@@ -1081,7 +1082,13 @@ static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
  */
 int rcu_stall_chain_notifier_register(struct notifier_block *n)
 {
-	return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+	int rcsn = rcu_cpu_stall_notifiers;
+
+	WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+	     rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+	if (rcsn)
+		return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
 
@@ -1115,3 +1122,5 @@ int rcu_stall_notifier_call_chain(unsigned long val, void *v)
 {
 	return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
 }
+
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c534d6806d3d..46aaaa9fe339 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -538,9 +538,15 @@ long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
 #endif
 
+int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
+
 #ifdef CONFIG_RCU_STALL_COMMON
 int rcu_cpu_stall_ftrace_dump __read_mostly;
 module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+module_param(rcu_cpu_stall_notifiers, int, 0444);
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
 int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
 EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
 module_param(rcu_cpu_stall_suppress, int, 0644);
-- 
cgit v1.2.3


From a1ca8295ee53a2fc57085fae26df37228c655791 Mon Sep 17 00:00:00 2001
From: Wang chaodong <chaodong@nfschina.com>
Date: Fri, 20 Oct 2023 16:51:06 +0800
Subject: PM: hibernate: Drop unnecessary local variable initialization

It is not necessary to intialize the error variable in
create_basic_memory_bitmaps(), because it is only read after
being assigned a value.

Signed-off-by: Wang chaodong <chaodong@nfschina.com>
[ rjw: Subject and changelog rewrite ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 50a15408c3fc..71b2f12ed3b5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 int create_basic_memory_bitmaps(void)
 {
 	struct memory_bitmap *bm1, *bm2;
-	int error = 0;
+	int error;
 
 	if (forbidden_pages_map && free_pages_map)
 		return 0;
-- 
cgit v1.2.3


From bbeaa4691fa8682e2fe2e87f28d5fce39805fa68 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Fri, 27 Oct 2023 09:55:33 +0800
Subject: PM: hibernate: Do not initialize error in swap_write_page()

'error' first receives the function result before it is used, and it
does not need to be assigned a value during definition.

Signed-off-by: Li zeming <zeming@nfschina.com>
[ rjw: Subject rewrite ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a2cb0babb5ec..68973ca2cf07 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -451,7 +451,7 @@ err_close:
 static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		struct hib_bio_batch *hb)
 {
-	int error = 0;
+	int error;
 	sector_t offset;
 
 	if (!handle->cur)
-- 
cgit v1.2.3


From 4ac934b1aaa99e00ca25875d55094a4fe34e212d Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Tue, 24 Oct 2023 10:04:34 +0800
Subject: PM: hibernate: Do not initialize error in snapshot_write_next()

The error variable in snapshot_write_next() gets a value before it is
used, so don't initialize it to 0 upfront.

Signed-off-by: Li zeming <zeming@nfschina.com>
[ rjw: Subject and changelog rewrite ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 71b2f12ed3b5..e3e8f1c6e75f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2778,7 +2778,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
 int snapshot_write_next(struct snapshot_handle *handle)
 {
 	static struct chain_allocator ca;
-	int error = 0;
+	int error;
 
 next:
 	/* Check if we have already loaded the entire image */
-- 
cgit v1.2.3


From 20eb4142397cf3ec221de43f10ea149af462c572 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 4 Oct 2023 01:29:01 +0200
Subject: srcu: Remove superfluous callbacks advancing from srcu_gp_start()

Callbacks advancing on SRCU must be performed on two specific places:

1) On enqueue time in order to make room for the acceleration of the
   new callback.

2) On invocation time in order to move the callbacks ready to invoke.

Any other callback advancing callsite is needless. Remove the remaining
one in srcu_gp_start().

Co-developed-by: Yong He <zhuangel570@gmail.com>
Signed-off-by: Yong He <zhuangel570@gmail.com>
Co-developed-by: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Co-developed-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/srcutree.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 560e99ec5333..e9356a103626 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -772,20 +772,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
  */
 static void srcu_gp_start(struct srcu_struct *ssp)
 {
-	struct srcu_data *sdp;
 	int state;
 
-	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
-		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
-	else
-		sdp = this_cpu_ptr(ssp->sda);
 	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
 	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
-	spin_lock_rcu_node(sdp);  /* Interrupts already disabled. */
-	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
-	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
-	spin_unlock_rcu_node(sdp);  /* Interrupts remain disabled. */
 	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
 	WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
 	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
-- 
cgit v1.2.3


From 94c55b9e21979daa88e190bf971c47432a818ebe Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 4 Oct 2023 01:29:02 +0200
Subject: srcu: No need to advance/accelerate if no callback enqueued

While in grace period start, there is nothing to accelerate and
therefore no need to advance the callbacks either if no callback is
to be enqueued.

Spare these needless operations in this case.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/srcutree.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e9356a103626..2bfc8ed1eed2 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1261,9 +1261,11 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	 *     period (gp_num = X + 8). So acceleration fails.
 	 */
 	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
-	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
-	WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp);
+	if (rhp) {
+		rcu_segcblist_advance(&sdp->srcu_cblist,
+				      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+		WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+	}
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
 		needgp = true;
-- 
cgit v1.2.3


From c21357e4461f3f9c8ff93302906b5372411ee108 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Wed, 4 Oct 2023 01:29:03 +0200
Subject: srcu: Explain why callbacks invocations can't run concurrently

If an SRCU barrier is queued while callbacks are running and a new
callbacks invocator for the same sdp were to run concurrently, the
RCU barrier might execute too early. As this requirement is non-obvious,
make sure to keep a record.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/srcutree.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 2bfc8ed1eed2..0351a4e83529 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1715,6 +1715,11 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+	/*
+	 * Although this function is theoretically re-entrant, concurrent
+	 * callbacks invocation is disallowed to avoid executing an SRCU barrier
+	 * too early.
+	 */
 	if (sdp->srcu_cblist_invoking ||
 	    !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
 		spin_unlock_irq_rcu_node(sdp);
@@ -1745,6 +1750,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	sdp->srcu_cblist_invoking = false;
 	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
 	spin_unlock_irq_rcu_node(sdp);
+	/* An SRCU barrier or callbacks from previous nesting work pending */
 	if (more)
 		srcu_schedule_cbs_sdp(sdp, 0);
 }
-- 
cgit v1.2.3


From 1e68485d8299860e68c4e1d29589ff0d20db0287 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:19 -0800
Subject: bpf: log PTR_TO_MEM memory size in verifier log

Emit valid memory size addressable through PTR_TO_MEM register.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 55d019f30e91..61d7d23a0118 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -682,6 +682,10 @@ static void print_reg_state(struct bpf_verifier_env *env,
 		verbose_a("r=");
 		verbose_unum(env, reg->range);
 	}
+	if (base_type(t) == PTR_TO_MEM) {
+		verbose_a("sz=");
+		verbose_unum(env, reg->mem_size);
+	}
 	if (tnum_is_const(reg->var_off)) {
 		/* a pointer register with fixed offset */
 		if (reg->var_off.value) {
-- 
cgit v1.2.3


From 22b769bb4f87060774bfdd6facbab438ed3b8453 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:20 -0800
Subject: bpf: emit more dynptr information in verifier log

Emit dynptr type for CONST_PTR_TO_DYNPTR register. Also emit id,
ref_obj_id, and dynptr_id fields for STACK_DYNPTR stack slots.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/log.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 61d7d23a0118..594a234f122b 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -628,6 +628,12 @@ static bool type_is_map_ptr(enum bpf_reg_type t) {
 	}
 }
 
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
+
 static void print_reg_state(struct bpf_verifier_env *env,
 			    const struct bpf_func_state *state,
 			    const struct bpf_reg_state *reg)
@@ -643,11 +649,6 @@ static void print_reg_state(struct bpf_verifier_env *env,
 		verbose_snum(env, reg->var_off.value + reg->off);
 		return;
 	}
-/*
- * _a stands for append, was shortened to avoid multiline statements below.
- * This macro is used to output a comma separated list of attributes.
- */
-#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; })
 
 	verbose(env, "%s", reg_type_str(env, t));
 	if (t == PTR_TO_STACK) {
@@ -686,6 +687,8 @@ static void print_reg_state(struct bpf_verifier_env *env,
 		verbose_a("sz=");
 		verbose_unum(env, reg->mem_size);
 	}
+	if (t == CONST_PTR_TO_DYNPTR)
+		verbose_a("type=%s",  dynptr_type_str(reg->dynptr.type));
 	if (tnum_is_const(reg->var_off)) {
 		/* a pointer register with fixed offset */
 		if (reg->var_off.value) {
@@ -702,8 +705,6 @@ static void print_reg_state(struct bpf_verifier_env *env,
 		}
 	}
 	verbose(env, ")");
-
-#undef verbose_a
 }
 
 void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
@@ -727,6 +728,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 	}
 	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
 		char types_buf[BPF_REG_SIZE + 1];
+		const char *sep = "";
 		bool valid = false;
 		u8 slot_type;
 		int j;
@@ -765,9 +767,14 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
 
 			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 			print_liveness(env, reg->live);
-			verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+			verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type));
+			if (reg->id)
+				verbose_a("id=%d", reg->id);
 			if (reg->ref_obj_id)
-				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+				verbose_a("ref_id=%d", reg->ref_obj_id);
+			if (reg->dynptr_id)
+				verbose_a("dynptr_id=%d", reg->dynptr_id);
+			verbose(env, ")");
 			break;
 		case STACK_ITER:
 			/* only main slot has ref_obj_id set; skip others */
-- 
cgit v1.2.3


From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 4 Dec 2023 15:39:21 -0800
Subject: bpf: tidy up exception callback management a bit

Use the fact that we are passing subprog index around and have
a corresponding struct bpf_subprog_info in bpf_verifier_env for each
subprogram. We don't need to separately pass around a flag whether
subprog is exception callback or not, each relevant verifier function
can determine this using provided subprog index if we maintain
bpf_subprog_info properly.

Also move out exception callback-specific logic from
btf_prepare_func_args(), keeping it generic. We can enforce all these
restriction right before exception callback verification pass. We add
out parameter, arg_cnt, for now, but this will be unnecessary with
subsequent refactoring and will be removed.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  2 +-
 kernel/bpf/btf.c      | 11 ++---------
 kernel/bpf/verifier.c | 52 ++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 42 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c1a06263a4f3..0bd4889e917a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, bool is_ex_cb);
+			  struct bpf_reg_state *reg, u32 *nargs);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63cf4128fc05..d56433bf8aba 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6956,7 +6956,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  * (either PTR_TO_CTX or SCALAR_VALUE).
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs, bool is_ex_cb)
+			  struct bpf_reg_state *regs, u32 *arg_cnt)
 {
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -7013,6 +7013,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
+	*arg_cnt = nargs;
 	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
@@ -7062,14 +7063,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
-	/* We have already ensured that the callback returns an integer, just
-	 * like all global subprogs. We need to determine it only has a single
-	 * scalar argument.
-	 */
-	if (is_ex_cb && (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE)) {
-		bpf_log(log, "exception cb only supports single integer argument\n");
-		return -EINVAL;
-	}
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 727a59e4a647..d1755db1b503 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -442,6 +442,25 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env,
 	return &env->prog->aux->func_info_aux[subprog];
 }
 
+static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->subprog_info[subprog];
+}
+
+static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+	struct bpf_subprog_info *info = subprog_info(env, subprog);
+
+	info->is_cb = true;
+	info->is_async_cb = true;
+	info->is_exception_cb = true;
+}
+
+static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
+{
+	return subprog_info(env, subprog)->is_exception_cb;
+}
+
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
@@ -2892,6 +2911,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
 			if (env->subprog_info[i].start != ex_cb_insn)
 				continue;
 			env->exception_callback_subprog = i;
+			mark_subprog_exc_cb(env, i);
 			break;
 		}
 	}
@@ -19166,9 +19186,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 
 		env->exception_callback_subprog = env->subprog_cnt - 1;
 		/* Don't update insn_cnt, as add_hidden_subprog always appends insns */
-		env->subprog_info[env->exception_callback_subprog].is_cb = true;
-		env->subprog_info[env->exception_callback_subprog].is_async_cb = true;
-		env->subprog_info[env->exception_callback_subprog].is_exception_cb = true;
+		mark_subprog_exc_cb(env, env->exception_callback_subprog);
 	}
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
@@ -19868,7 +19886,7 @@ static void free_states(struct bpf_verifier_env *env)
 	}
 }
 
-static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex_cb)
+static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_verifier_state *state;
@@ -19899,9 +19917,23 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		ret = btf_prepare_func_args(env, subprog, regs, is_ex_cb);
+		u32 nargs;
+
+		ret = btf_prepare_func_args(env, subprog, regs, &nargs);
 		if (ret)
 			goto out;
+		if (subprog_is_exc_cb(env, subprog)) {
+			state->frame[0]->in_exception_callback_fn = true;
+			/* We have already ensured that the callback returns an integer, just
+			 * like all global subprogs. We need to determine it only has a single
+			 * scalar argument.
+			 */
+			if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) {
+				verbose(env, "exception cb only supports single integer argument\n");
+				ret = -EINVAL;
+				goto out;
+			}
+		}
 		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
 			if (regs[i].type == PTR_TO_CTX)
 				mark_reg_known_zero(env, regs, i);
@@ -19915,12 +19947,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog, bool is_ex
 				regs[i].id = ++env->id_gen;
 			}
 		}
-		if (is_ex_cb) {
-			state->frame[0]->in_exception_callback_fn = true;
-			env->subprog_info[subprog].is_cb = true;
-			env->subprog_info[subprog].is_async_cb = true;
-			env->subprog_info[subprog].is_exception_cb = true;
-		}
 	} else {
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
@@ -20000,7 +20026,7 @@ again:
 
 		env->insn_idx = env->subprog_info[i].start;
 		WARN_ON_ONCE(env->insn_idx == 0);
-		ret = do_check_common(env, i, env->exception_callback_subprog == i);
+		ret = do_check_common(env, i);
 		if (ret) {
 			return ret;
 		} else if (env->log.level & BPF_LOG_LEVEL) {
@@ -20030,7 +20056,7 @@ static int do_check_main(struct bpf_verifier_env *env)
 	int ret;
 
 	env->insn_idx = 0;
-	ret = do_check_common(env, 0, false);
+	ret = do_check_common(env, 0);
 	if (!ret)
 		env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return ret;
-- 
cgit v1.2.3


From 8f23f5dba6b4693448144bde4dd6f537543442c2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Fri, 27 Oct 2023 08:05:20 +0800
Subject: iommu: Change kconfig around IOMMU_SVA

Linus suggested that the kconfig here is confusing:

https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/

Let's break it into three kconfigs controlling distinct things:

 - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional
   fields for the IOMMU. Currently only PASID, but later patches store
   a struct iommu_mm_data *

 - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit
   for keeping track of the ENQCMD instruction. x86 will select this if
   IOMMU_SVA is enabled

 - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code
   for iommu driver use and the IOMMU exported API

This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/Kconfig             | 5 +++++
 arch/x86/Kconfig         | 1 +
 arch/x86/kernel/traps.c  | 2 +-
 drivers/iommu/Kconfig    | 1 +
 include/linux/iommu.h    | 2 +-
 include/linux/mm_types.h | 2 +-
 include/linux/sched.h    | 2 +-
 kernel/fork.c            | 2 +-
 mm/Kconfig               | 3 +++
 mm/init-mm.c             | 2 +-
 10 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..3e49f862670e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -301,6 +301,11 @@ config ARCH_HAS_DMA_CLEAR_UNCACHED
 config ARCH_HAS_CPU_FINALIZE_INIT
 	bool
 
+# The architecture has a per-task state that includes the mm's PASID
+config ARCH_HAS_CPU_PASID
+	bool
+	select IOMMU_MM_DATA
+
 # Select if arch init_task must go in the __init_task_data section
 config ARCH_TASK_STRUCT_ON_STACK
 	bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3762f41bb092..68a2ec36a46e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -71,6 +71,7 @@ config X86
 	select ARCH_HAS_CACHE_LINE_SIZE
 	select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
 	select ARCH_HAS_CPU_FINALIZE_INIT
+	select ARCH_HAS_CPU_PASID		if IOMMU_SVA
 	select ARCH_HAS_CURRENT_STACK_POINTER
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEBUG_VM_PGTABLE	if !X86_PAE
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c876f1d36a81..2b62dbb3396a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -565,7 +565,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs)
  */
 static bool try_fixup_enqcmd_gp(void)
 {
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	u32 pasid;
 
 	/*
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 7673bb82945b..9a29d742617e 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -160,6 +160,7 @@ config IOMMU_DMA
 
 # Shared Virtual Addressing
 config IOMMU_SVA
+	select IOMMU_MM_DATA
 	bool
 
 config FSL_PAMU
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index c7394b39599c..cd3f398095bf 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1337,7 +1337,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
 	return false;
 }
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 static inline void mm_pasid_init(struct mm_struct *mm)
 {
 	mm->pasid = IOMMU_PASID_INVALID;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 957ce38768b2..41f248608dd9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -938,7 +938,7 @@ struct mm_struct {
 #endif
 		struct work_struct async_put_work;
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 		u32 pasid;
 #endif
 #ifdef CONFIG_KSM
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..70888a36677b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -954,7 +954,7 @@ struct task_struct {
 	/* Recursion prevention for eventfd_signal() */
 	unsigned			in_eventfd:1;
 #endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	unsigned			pasid_activated:1;
 #endif
 #ifdef	CONFIG_CPU_SUP_INTEL
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..43fd9bc1a522 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1179,7 +1179,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->use_memdelay = 0;
 #endif
 
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
 	tsk->pasid_activated = 0;
 #endif
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 89971a894b60..0143f4d905c9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1270,6 +1270,9 @@ config LOCK_MM_AND_FIND_VMA
 	bool
 	depends on !STACK_GROWSUP
 
+config IOMMU_MM_DATA
+	bool
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/init-mm.c b/mm/init-mm.c
index cfd367822cdd..c52dc2740a3d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -44,7 +44,7 @@ struct mm_struct init_mm = {
 #endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_IOMMU_MM_DATA
 	.pasid		= IOMMU_PASID_INVALID,
 #endif
 	INIT_MM_CONTEXT(init_mm)
-- 
cgit v1.2.3


From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 30 Nov 2023 13:49:11 +0100
Subject: file: remove __receive_fd()

Honestly, there's little value in having a helper with and without that
int __user *ufd argument. It's just messy and doesn't really give us
anything. Just expose receive_fd() with that argument and get rid of
that helper.

Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/vdpa/vdpa_user/vduse_dev.c |  2 +-
 fs/file.c                          | 11 +++--------
 include/linux/file.h               |  5 +----
 include/net/scm.h                  |  2 +-
 kernel/pid.c                       |  2 +-
 kernel/seccomp.c                   |  2 +-
 6 files changed, 8 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 0ddd4b8abecb..fafd4610b185 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1157,7 +1157,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
 			fput(f);
 			break;
 		}
-		ret = receive_fd(f, perm_to_file_flags(entry.perm));
+		ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm));
 		fput(f);
 		break;
 	}
diff --git a/fs/file.c b/fs/file.c
index c8eaa0b29a08..3b683b9101d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1296,7 +1296,7 @@ out_unlock:
 }
 
 /**
- * __receive_fd() - Install received file into file descriptor table
+ * receive_fd() - Install received file into file descriptor table
  * @file: struct file that was received from another process
  * @ufd: __user pointer to write new fd number to
  * @o_flags: the O_* flags to apply to the new fd entry
@@ -1310,7 +1310,7 @@ out_unlock:
  *
  * Returns newly install fd or -ve on error.
  */
-int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 {
 	int new_fd;
 	int error;
@@ -1335,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 	__receive_sock(file);
 	return new_fd;
 }
+EXPORT_SYMBOL_GPL(receive_fd);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 {
@@ -1350,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 	return new_fd;
 }
 
-int receive_fd(struct file *file, unsigned int o_flags)
-{
-	return __receive_fd(file, NULL, o_flags);
-}
-EXPORT_SYMBOL_GPL(receive_fd);
-
 static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
 	int err = -EBADF;
diff --git a/include/linux/file.h b/include/linux/file.h
index c0d5219c2852..6834a29338c4 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
 
 extern void fd_install(unsigned int fd, struct file *file);
 
-extern int __receive_fd(struct file *file, int __user *ufd,
-			unsigned int o_flags);
-
-extern int receive_fd(struct file *file, unsigned int o_flags);
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
diff --git a/include/net/scm.h b/include/net/scm.h
index 8aae2468bae0..cf68acec4d70 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -214,7 +214,7 @@ static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
 {
 	if (!ufd)
 		return -EFAULT;
-	return __receive_fd(f, ufd, flags);
+	return receive_fd(f, ufd, flags);
 }
 
 #endif /* __LINUX_NET_SCM_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 6500ef956f2f..b52b10865454 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ret = receive_fd(file, O_CLOEXEC);
+	ret = receive_fd(file, NULL, O_CLOEXEC);
 	fput(file);
 
 	return ret;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..aca7b437882e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
 	 */
 	list_del_init(&addfd->list);
 	if (!addfd->setfd)
-		fd = receive_fd(addfd->file, addfd->flags);
+		fd = receive_fd(addfd->file, NULL, addfd->flags);
 	else
 		fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
 	addfd->ret = fd;
-- 
cgit v1.2.3


From 56c26d5ad86dfe48a76855a91b523ab4f372c003 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Tue, 12 Dec 2023 08:54:36 +0800
Subject: bpf: Remove unused backtrack_state helper functions

The function are defined in the verifier.c file, but not called
elsewhere, so delete the unused function.

kernel/bpf/verifier.c:3448:20: warning: unused function 'bt_set_slot'
kernel/bpf/verifier.c:3453:20: warning: unused function 'bt_clear_slot'
kernel/bpf/verifier.c:3488:20: warning: unused function 'bt_is_slot_set'

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20231212005436.103829-1-yang.lee@linux.alibaba.com

Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7714
---
 kernel/bpf/verifier.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d1755db1b503..ef27820a24e3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3465,16 +3465,6 @@ static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u3
 	bt->stack_masks[frame] &= ~(1ull << slot);
 }
 
-static inline void bt_set_slot(struct backtrack_state *bt, u32 slot)
-{
-	bt_set_frame_slot(bt, bt->frame, slot);
-}
-
-static inline void bt_clear_slot(struct backtrack_state *bt, u32 slot)
-{
-	bt_clear_frame_slot(bt, bt->frame, slot);
-}
-
 static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame)
 {
 	return bt->reg_masks[frame];
@@ -3505,11 +3495,6 @@ static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u
 	return bt->stack_masks[frame] & (1ull << slot);
 }
 
-static inline bool bt_is_slot_set(struct backtrack_state *bt, u32 slot)
-{
-	return bt_is_frame_slot_set(bt, bt->frame, slot);
-}
-
 /* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */
 static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask)
 {
-- 
cgit v1.2.3


From 745e0311306507ddbe1727ac798c8f956812b810 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Sun, 10 Dec 2023 17:51:50 -0500
Subject: bpf: Comment on check_mem_size_reg

This patch adds a comment to check_mem_size_reg -- a function whose
meaning is not very transparent. The function implicitly deals with two
registers connected by convention, which is not obvious.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231210225149.67639-1-andreimatei1@gmail.com
---
 kernel/bpf/verifier.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ef27820a24e3..1863826a4ac3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7256,6 +7256,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+/* verify arguments to helpers or kfuncs consisting of a pointer and an access
+ * size.
+ *
+ * @regno is the register containing the access size. regno-1 is the register
+ * containing the pointer.
+ */
 static int check_mem_size_reg(struct bpf_verifier_env *env,
 			      struct bpf_reg_state *reg, u32 regno,
 			      bool zero_size_allowed,
-- 
cgit v1.2.3


From b3ae7b67b87fed771fa5bf95389df06b0433603e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 11:16:17 -0500
Subject: ring-buffer: Fix writing to the buffer with max_data_size

The maximum ring buffer data size is the maximum size of data that can be
recorded on the ring buffer. Events must be smaller than the sub buffer
data size minus any meta data. This size is checked before trying to
allocate from the ring buffer because the allocation assumes that the size
will fit on the sub buffer.

The maximum size was calculated as the size of a sub buffer page (which is
currently PAGE_SIZE minus the sub buffer header) minus the size of the
meta data of an individual event. But it missed the possible adding of a
time stamp for events that are added long enough apart that the event meta
data can't hold the time delta.

When an event is added that is greater than the current BUF_MAX_DATA_SIZE
minus the size of a time stamp, but still less than or equal to
BUF_MAX_DATA_SIZE, the ring buffer would go into an infinite loop, looking
for a page that can hold the event. Luckily, there's a check for this loop
and after 1000 iterations and a warning is emitted and the ring buffer is
disabled. But this should never happen.

This can happen when a large event is added first, or after a long period
where an absolute timestamp is prefixed to the event, increasing its size
by 8 bytes. This passes the check and then goes into the algorithm that
causes the infinite loop.

For events that are the first event on the sub-buffer, it does not need to
add a timestamp, because the sub-buffer itself contains an absolute
timestamp, and adding one is redundant.

The fix is to check if the event is to be the first event on the
sub-buffer, and if it is, then do not add a timestamp.

This also fixes 32 bit adding a timestamp when a read of before_stamp or
write_stamp is interrupted. There's still no need to add that timestamp if
the event is going to be the first event on the sub buffer.

Also, if the buffer has "time_stamp_abs" set, then also check if the
length plus the timestamp is greater than the BUF_MAX_DATA_SIZE.

Link: https://lore.kernel.org/all/20231212104549.58863438@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20231212071837.5fdd6c13@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231212111617.39e02849@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: a4543a2fa9ef3 ("ring-buffer: Get timestamp after event is allocated")
Fixes: 58fbc3c63275c ("ring-buffer: Consolidate add_timestamp to remove some branches")
Reported-by: Kent Overstreet <kent.overstreet@linux.dev> # (on IRC)
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8d2a4f00eca9..b8986f82eccf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3579,7 +3579,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * absolute timestamp.
 		 * Don't bother if this is the start of a new page (w == 0).
 		 */
-		if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+		if (!w) {
+			/* Use the sub-buffer timestamp */
+			info->delta = 0;
+		} else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
 			info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
 			info->length += RB_LEN_TIME_EXTEND;
 		} else {
@@ -3737,6 +3740,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
 		add_ts_default = RB_ADD_STAMP_ABSOLUTE;
 		info.length += RB_LEN_TIME_EXTEND;
+		if (info.length > BUF_MAX_DATA_SIZE)
+			goto out_fail;
 	} else {
 		add_ts_default = RB_ADD_STAMP_NONE;
 	}
-- 
cgit v1.2.3


From b55b0a0d7c4aa2dac3579aa7e6802d1f57445096 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sat, 9 Dec 2023 17:10:58 -0500
Subject: tracing: Have large events show up as '[LINE TOO BIG]' instead of
 nothing

If a large event was added to the ring buffer that is larger than what the
trace_seq can handle, it just drops the output:

 ~# cat /sys/kernel/tracing/trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
            <...>-859     [001] .....   141.118951: tracing_mark_write           <...>-859     [001] .....   141.148201: tracing_mark_write: 78901234

Instead, catch this case and add some context:

 ~# cat /sys/kernel/tracing/trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
            <...>-852     [001] .....   121.550551: tracing_mark_write[LINE TOO BIG]
            <...>-852     [001] .....   121.550581: tracing_mark_write: 78901234

This now emulates the same output as trace_pipe.

Link: https://lore.kernel.org/linux-trace-kernel/20231209171058.78c1a026@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fbcd3bafb93e..aa8f99f3e5de 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4722,7 +4722,11 @@ static int s_show(struct seq_file *m, void *v)
 		iter->leftover = ret;
 
 	} else {
-		print_trace_line(iter);
+		ret = print_trace_line(iter);
+		if (ret == TRACE_TYPE_PARTIAL_LINE) {
+			iter->seq.full = 0;
+			trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+		}
 		ret = trace_print_seq(m, &iter->seq);
 		/*
 		 * If we overflow the seq_file buffer, then it will
-- 
cgit v1.2.3


From 17d801758157bec93f26faaf5ff1a8b9a552d67a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sun, 10 Dec 2023 22:12:50 -0500
Subject: ring-buffer: Fix memory leak of free page

Reading the ring buffer does a swap of a sub-buffer within the ring buffer
with a empty sub-buffer. This allows the reader to have full access to the
content of the sub-buffer that was swapped out without having to worry
about contention with the writer.

The readers call ring_buffer_alloc_read_page() to allocate a page that
will be used to swap with the ring buffer. When the code is finished with
the reader page, it calls ring_buffer_free_read_page(). Instead of freeing
the page, it stores it as a spare. Then next call to
ring_buffer_alloc_read_page() will return this spare instead of calling
into the memory management system to allocate a new page.

Unfortunately, on freeing of the ring buffer, this spare page is not
freed, and causes a memory leak.

Link: https://lore.kernel.org/linux-trace-kernel/20231210221250.7b9cc83c@rorschach.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer")
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b8986f82eccf..dcd47895b424 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1787,6 +1787,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 		free_buffer_page(bpage);
 	}
 
+	free_page((unsigned long)cpu_buffer->free_page);
+
 	kfree(cpu_buffer);
 }
 
-- 
cgit v1.2.3


From d06aff1cb13d2a0d52b48e605462518149c98c81 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sun, 10 Dec 2023 22:54:47 -0500
Subject: tracing: Update snapshot buffer on resize if it is allocated

The snapshot buffer is to mimic the main buffer so that when a snapshot is
needed, the snapshot and main buffer are swapped. When the snapshot buffer
is allocated, it is set to the minimal size that the ring buffer may be at
and still functional. When it is allocated it becomes the same size as the
main ring buffer, and when the main ring buffer changes in size, it should
do.

Currently, the resize only updates the snapshot buffer if it's used by the
current tracer (ie. the preemptirqsoff tracer). But it needs to be updated
anytime it is allocated.

When changing the size of the main buffer, instead of looking to see if
the current tracer is utilizing the snapshot buffer, just check if it is
allocated to know if it should be updated or not.

Also fix typo in comment just above the code change.

Link: https://lore.kernel.org/linux-trace-kernel/20231210225447.48476a6a@rorschach.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: ad909e21bbe69 ("tracing: Add internal tracing_snapshot() functions")
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa8f99f3e5de..6c79548f9574 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6348,7 +6348,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 	if (!tr->array_buffer.buffer)
 		return 0;
 
-	/* Do not allow tracing while resizng ring buffer */
+	/* Do not allow tracing while resizing ring buffer */
 	tracing_stop_tr(tr);
 
 	ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
@@ -6356,7 +6356,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
 		goto out_start;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	if (!tr->current_trace->use_max_tr)
+	if (!tr->allocated_snapshot)
 		goto out;
 
 	ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
-- 
cgit v1.2.3


From c41bd2514184d75db087fe4c1221237fb7922875 Mon Sep 17 00:00:00 2001
From: Ignat Korchagin <ignat@cloudflare.com>
Date: Wed, 29 Nov 2023 22:04:09 +0000
Subject: kexec: drop dependency on ARCH_SUPPORTS_KEXEC from CRASH_DUMP

In commit f8ff23429c62 ("kernel/Kconfig.kexec: drop select of KEXEC for
CRASH_DUMP") we tried to fix a config regression, where CONFIG_CRASH_DUMP
required CONFIG_KEXEC.

However, it was not enough at least for arm64 platforms.  While further
testing the patch with our arm64 config I noticed that CONFIG_CRASH_DUMP
is unavailable in menuconfig.  This is because CONFIG_CRASH_DUMP still
depends on the new CONFIG_ARCH_SUPPORTS_KEXEC introduced in commit
91506f7e5d21 ("arm64/kexec: refactor for kernel/Kconfig.kexec") and on
arm64 CONFIG_ARCH_SUPPORTS_KEXEC requires CONFIG_PM_SLEEP_SMP=y, which in
turn requires either CONFIG_SUSPEND=y or CONFIG_HIBERNATION=y neither of
which are set in our config.

Given that we already established that CONFIG_KEXEC (which is a switch for
kexec system call itself) is not required for CONFIG_CRASH_DUMP drop
CONFIG_ARCH_SUPPORTS_KEXEC dependency as well.  The arm64 kernel builds
just fine with CONFIG_CRASH_DUMP=y and with both CONFIG_KEXEC=n and
CONFIG_KEXEC_FILE=n after f8ff23429c62 ("kernel/Kconfig.kexec: drop select
of KEXEC for CRASH_DUMP") and this patch are applied given that the
necessary shared bits are included via CONFIG_KEXEC_CORE dependency.

[bhe@redhat.com: don't export some symbols when CONFIG_MMU=n]
  Link: https://lkml.kernel.org/r/ZW03ODUKGGhP1ZGU@MiWiFi-R3L-srv
[bhe@redhat.com: riscv, kexec: fix dependency of two items]
  Link: https://lkml.kernel.org/r/ZW04G/SKnhbE5mnX@MiWiFi-R3L-srv
Link: https://lkml.kernel.org/r/20231129220409.55006-1-ignat@cloudflare.com
Fixes: 91506f7e5d21 ("arm64/kexec: refactor for kernel/Kconfig.kexec")
Signed-off-by: Ignat Korchagin <ignat@cloudflare.com>
Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: <stable@vger.kernel.org> # 6.6+: f8ff234: kernel/Kconfig.kexec: drop select of KEXEC for CRASH_DUMP
Cc: <stable@vger.kernel.org> # 6.6+
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/Kconfig             | 4 ++--
 arch/riscv/kernel/crash_core.c | 4 +++-
 kernel/Kconfig.kexec           | 1 -
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 95a2a06acc6a..24c1799e2ec4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -685,7 +685,7 @@ config RISCV_BOOT_SPINWAIT
 	  If unsure what to do here, say N.
 
 config ARCH_SUPPORTS_KEXEC
-	def_bool MMU
+	def_bool y
 
 config ARCH_SELECTS_KEXEC
 	def_bool y
@@ -693,7 +693,7 @@ config ARCH_SELECTS_KEXEC
 	select HOTPLUG_CPU if SMP
 
 config ARCH_SUPPORTS_KEXEC_FILE
-	def_bool 64BIT && MMU
+	def_bool 64BIT
 
 config ARCH_SELECTS_KEXEC_FILE
 	def_bool y
diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
index 55f1d7856b54..8706736fd4e2 100644
--- a/arch/riscv/kernel/crash_core.c
+++ b/arch/riscv/kernel/crash_core.c
@@ -5,17 +5,19 @@
 
 void arch_crash_save_vmcoreinfo(void)
 {
-	VMCOREINFO_NUMBER(VA_BITS);
 	VMCOREINFO_NUMBER(phys_ram_base);
 
 	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
+#ifdef CONFIG_MMU
+	VMCOREINFO_NUMBER(VA_BITS);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
 #ifdef CONFIG_64BIT
 	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
 	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
+#endif
 #endif
 	vmcoreinfo_append_str("NUMBER(KERNEL_LINK_ADDR)=0x%lx\n", KERNEL_LINK_ADDR);
 	vmcoreinfo_append_str("NUMBER(va_kernel_pa_offset)=0x%lx\n",
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 1cc3b1c595d7..2fd510256604 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -94,7 +94,6 @@ config KEXEC_JUMP
 config CRASH_DUMP
 	bool "kernel crash dumps"
 	depends on ARCH_SUPPORTS_CRASH_DUMP
-	depends on ARCH_SUPPORTS_KEXEC
 	select CRASH_CORE
 	select KEXEC_CORE
 	help
-- 
cgit v1.2.3


From 1dd11e977360ad3493812da0b05ffd9adcdd15a1 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sat, 9 Dec 2023 22:14:38 +0800
Subject: crash_core: fix the check for whether crashkernel is from high memory

If crash_base is equal to CRASH_ADDR_LOW_MAX, it also indicates that
the crashkernel memory is allocated from high memory. However, the
current check only considers the case where crash_base is greater than
CRASH_ADDR_LOW_MAX. Fix it.

The runtime effects is that crashkernel high memory is successfully
reserved, whereas the crashkernel low memory is bypassed in this case,
then kdump kernel bootup will fail because of no low memory under 4G.

This patch also includes some minor cleanups.

Link: https://lkml.kernel.org/r/20231209141438.77233-1-ytcoode@gmail.com
Fixes: 0ab97169aa05 ("crash_core: add generic function to do reservation")
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index efe87d501c8c..d4313b53837e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -199,7 +199,7 @@ static __initdata char *suffix_tbl[] = {
  * It returns 0 on success and -EINVAL on failure.
  */
 static int __init parse_crashkernel_suffix(char *cmdline,
-					   unsigned long long	*crash_size,
+					   unsigned long long *crash_size,
 					   const char *suffix)
 {
 	char *cur = cmdline;
@@ -268,9 +268,9 @@ static int __init __parse_crashkernel(char *cmdline,
 			     unsigned long long *crash_base,
 			     const char *suffix)
 {
-	char	*first_colon, *first_space;
-	char	*ck_cmdline;
-	char	*name = "crashkernel=";
+	char *first_colon, *first_space;
+	char *ck_cmdline;
+	char *name = "crashkernel=";
 
 	BUG_ON(!crash_size || !crash_base);
 	*crash_size = 0;
@@ -440,7 +440,7 @@ retry:
 		return;
 	}
 
-	if ((crash_base > CRASH_ADDR_LOW_MAX) &&
+	if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
 	     crash_low_size && reserve_crashkernel_low(crash_low_size)) {
 		memblock_phys_free(crash_base, crash_size);
 		return;
-- 
cgit v1.2.3


From 9e45e39dc249c970d99d2681f6bcb55736fd725c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Mon, 11 Dec 2023 11:44:20 -0500
Subject: ring-buffer: Do not update before stamp when switching sub-buffers

The ring buffer timestamps are synchronized by two timestamp placeholders.
One is the "before_stamp" and the other is the "write_stamp" (sometimes
referred to as the "after stamp" but only in the comments. These two
stamps are key to knowing how to handle nested events coming in with a
lockless system.

When moving across sub-buffers, the before stamp is updated but the write
stamp is not. There's an effort to put back the before stamp to something
that seems logical in case there's nested events. But as the current event
is about to cross sub-buffers, and so will any new nested event that happens,
updating the before stamp is useless, and could even introduce new race
conditions.

The first event on a sub-buffer simply uses the sub-buffer's timestamp
and keeps a "delta" of zero. The "before_stamp" and "write_stamp" are not
used in the algorithm in this case. There's no reason to try to fix the
before_stamp when this happens.

As a bonus, it removes a cmpxchg() when crossing sub-buffers!

Link: https://lore.kernel.org/linux-trace-kernel/20231211114420.36dde01b@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp")
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dcd47895b424..c7abcc215fe2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3607,14 +3607,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	/* See if we shot pass the end of this buffer page */
 	if (unlikely(write > BUF_PAGE_SIZE)) {
-		/* before and after may now different, fix it up*/
-		b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-		if (a_ok && b_ok && info->before != info->after)
-			(void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
-					      info->before, info->after);
-		if (a_ok && b_ok)
-			check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+		check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
 		return rb_move_tail(cpu_buffer, tail, info);
 	}
 
-- 
cgit v1.2.3


From b049525855fdd0024881c9b14b8fbec61c3f53d3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 07:25:58 -0500
Subject: ring-buffer: Have saved event hold the entire event

For the ring buffer iterator (non-consuming read), the event needs to be
copied into the iterator buffer to make sure that a writer does not
overwrite it while the user is reading it. If a write happens during the
copy, the buffer is simply discarded.

But the temp buffer itself was not big enough. The allocation of the
buffer was only BUF_MAX_DATA_SIZE, which is the maximum data size that can
be passed into the ring buffer and saved. But the temp buffer needs to
hold the meta data as well. That would be BUF_PAGE_SIZE and not
BUF_MAX_DATA_SIZE.

Link: https://lore.kernel.org/linux-trace-kernel/20231212072558.61f76493@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: 785888c544e04 ("ring-buffer: Have rb_iter_head_event() handle concurrent writer")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c7abcc215fe2..1d9caee7f542 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2409,7 +2409,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 */
 	barrier();
 
-	if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+	if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
 		/* Writer corrupted the read? */
 		goto reset;
 
@@ -5118,7 +5118,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 	if (!iter)
 		return NULL;
 
-	iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+	/* Holds the entire event: data and meta data */
+	iter->event = kmalloc(BUF_PAGE_SIZE, flags);
 	if (!iter->event) {
 		kfree(iter);
 		return NULL;
-- 
cgit v1.2.3


From 60be76eeabb3d83858cc6577fc65c7d0f36ffd42 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 08:44:44 -0500
Subject: tracing: Add size check when printing trace_marker output

If for some reason the trace_marker write does not have a nul byte for the
string, it will overflow the print:

  trace_seq_printf(s, ": %s", field->buf);

The field->buf could be missing the nul byte. To prevent overflow, add the
max size that the buf can be by using the event size and the field
location.

  int max = iter->ent_size - offsetof(struct print_entry, buf);

  trace_seq_printf(s, ": %*.s", max, field->buf);

Link: https://lore.kernel.org/linux-trace-kernel/20231212084444.4619b8ce@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..3e7fa44dc2b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
 {
 	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
+	int max = iter->ent_size - offsetof(struct print_entry, buf);
 
 	trace_assign_type(field, iter->ent);
 
 	seq_print_ip_sym(s, field->ip, flags);
-	trace_seq_printf(s, ": %s", field->buf);
+	trace_seq_printf(s, ": %.*s", max, field->buf);
 
 	return trace_handle_return(s);
 }
@@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
 					 struct trace_event *event)
 {
 	struct print_entry *field;
+	int max = iter->ent_size - offsetof(struct print_entry, buf);
 
 	trace_assign_type(field, iter->ent);
 
-	trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+	trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
 
 	return trace_handle_return(&iter->seq);
 }
-- 
cgit v1.2.3


From dee39c0c1e9624f925da4ca0bece46bdc7427257 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang1211@gmail.com>
Date: Wed, 1 Nov 2023 11:35:07 +0800
Subject: rcu: Force quiescent states only for ongoing grace period

If an rcutorture test scenario creates an fqs_task kthread, it will
periodically invoke rcu_force_quiescent_state() in order to start
force-quiescent-state (FQS) operations.  However, an FQS operation
will be started even if there is no RCU grace period in progress.
Although testing FQS operations startup when there is no grace period in
progress is necessary, it need not happen all that often.  This commit
therefore causes rcu_force_quiescent_state() to take an early exit
if there is no grace period in progress.

Note that there will still be attempts to start an FQS scan in the
absence of a grace period because the grace period might end right
after the rcu_force_quiescent_state() function's check.  In actual
testing, this happens about once every ten minutes, which should
provide adequate testing.

Signed-off-by: Zqiang <qiang.zhang1211@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3ac3c846105f..1ae851777806 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2338,6 +2338,8 @@ void rcu_force_quiescent_state(void)
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_old = NULL;
 
+	if (!rcu_gp_in_progress())
+		return;
 	/* Funnel through hierarchy to reduce memory contention. */
 	rnp = raw_cpu_read(rcu_data.mynode);
 	for (; rnp != NULL; rnp = rnp->parent) {
-- 
cgit v1.2.3


From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001
From: Jie Jiang <jiejiang@chromium.org>
Date: Tue, 12 Dec 2023 09:39:23 +0000
Subject: bpf: Support uid and gid when mounting bpffs

Parse uid and gid in bpf_parse_param() so that they can be passed in as
the `data` parameter when mount() bpffs. This will be useful when we
want to control which user/group has the control to the mounted bpffs,
otherwise a separate chown() call will be needed.

Signed-off-by: Jie Jiang <jiejiang@chromium.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Mike Frysinger <vapier@chromium.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/inode.c  | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0bd4889e917a..c87c608a3689 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1595,6 +1595,8 @@ struct bpf_link_primer {
 };
 
 struct bpf_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
 	umode_t mode;
 
 	/* BPF token-related delegation options */
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5359a0929c35..0a8e1188ea46 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -601,9 +601,16 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
-	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+	struct inode *inode = d_inode(root);
+	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
 	u64 mask;
 
+	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, inode->i_uid));
+	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, inode->i_gid));
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 
@@ -652,6 +659,8 @@ const struct super_operations bpf_super_ops = {
 };
 
 enum {
+	OPT_UID,
+	OPT_GID,
 	OPT_MODE,
 	OPT_DELEGATE_CMDS,
 	OPT_DELEGATE_MAPS,
@@ -660,6 +669,8 @@ enum {
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
+	fsparam_u32	("uid",				OPT_UID),
+	fsparam_u32	("gid",				OPT_GID),
 	fsparam_u32oct	("mode",			OPT_MODE),
 	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
 	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
@@ -672,6 +683,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct bpf_mount_opts *opts = fc->s_fs_info;
 	struct fs_parse_result result;
+	kuid_t uid;
+	kgid_t gid;
 	int opt, err;
 	u64 msk;
 
@@ -694,6 +707,34 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	switch (opt) {
+	case OPT_UID:
+		uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(uid))
+			goto bad_value;
+
+		/*
+		 * The requested uid must be representable in the
+		 * filesystem's idmapping.
+		 */
+		if (!kuid_has_mapping(fc->user_ns, uid))
+			goto bad_value;
+
+		opts->uid = uid;
+		break;
+	case OPT_GID:
+		gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(gid))
+			goto bad_value;
+
+		/*
+		 * The requested gid must be representable in the
+		 * filesystem's idmapping.
+		 */
+		if (!kgid_has_mapping(fc->user_ns, gid))
+			goto bad_value;
+
+		opts->gid = gid;
+		break;
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
@@ -722,6 +763,9 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	return 0;
+
+bad_value:
+	return invalfc(fc, "Bad value for '%s'", param->key);
 }
 
 struct bpf_preload_ops *bpf_preload_ops;
@@ -808,6 +852,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &bpf_super_ops;
 
 	inode = sb->s_root->d_inode;
+	inode->i_uid = opts->uid;
+	inode->i_gid = opts->gid;
 	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
 	populate_bpffs(sb->s_root);
@@ -843,6 +889,8 @@ static int bpf_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	opts->mode = S_IRWXUGO;
+	opts->uid = current_fsuid();
+	opts->gid = current_fsgid();
 
 	/* start out with no BPF token delegation enabled */
 	opts->delegate_cmds = 0;
-- 
cgit v1.2.3


From f5fdb51fb980077a4c6c78f3f775821f611fb38b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 13 Dec 2023 11:08:33 -0800
Subject: bpf: fail BPF_TOKEN_CREATE if no delegation option was set on BPF FS

It's quite confusing in practice when it's possible to successfully
create a BPF token from BPF FS that didn't have any of delegate_xxx
mount options set up. While it's not wrong, it's actually more
meaningful to reject BPF_TOKEN_CREATE with specific error code (-ENOENT)
to let user-space know that no token delegation is setup up.

So, instead of creating empty BPF token that will be always ignored
because it doesn't have any of the allow_xxx bits set, reject it with
-ENOENT. If we ever need empty BPF token to be possible, we can support
that with extra flag passed into BPF_TOKEN_CREATE.

Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231213190842.3844987-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/token.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 17212efcde60..a86fccd57e2d 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -152,6 +152,15 @@ int bpf_token_create(union bpf_attr *attr)
 		goto out_path;
 	}
 
+	mnt_opts = path.dentry->d_sb->s_fs_info;
+	if (mnt_opts->delegate_cmds == 0 &&
+	    mnt_opts->delegate_maps == 0 &&
+	    mnt_opts->delegate_progs == 0 &&
+	    mnt_opts->delegate_attachs == 0) {
+		err = -ENOENT; /* no BPF token delegation is set up */
+		goto out_path;
+	}
+
 	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
 	inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
 	if (IS_ERR(inode)) {
@@ -181,7 +190,6 @@ int bpf_token_create(union bpf_attr *attr)
 	/* remember bpffs owning userns for future ns_capable() checks */
 	token->userns = get_user_ns(userns);
 
-	mnt_opts = path.dentry->d_sb->s_fs_info;
 	token->allowed_cmds = mnt_opts->delegate_cmds;
 	token->allowed_maps = mnt_opts->delegate_maps;
 	token->allowed_progs = mnt_opts->delegate_progs;
-- 
cgit v1.2.3


From b13cddf633562b9b2c34fd63471d377019704ebe Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Fri, 8 Dec 2023 15:32:48 +0000
Subject: bpf: add small subset of SECURITY_PATH hooks to BPF
 sleepable_lsm_hooks list

security_path_* based LSM hooks appear to be generally missing from
the sleepable_lsm_hooks list. Initially add a small subset of them to
the preexisting sleepable_lsm_hooks list so that sleepable BPF helpers
like bpf_d_path() can be used from sleepable BPF LSM based programs.

The security_path_* hooks added in this patch are similar to the
security_inode_* counterparts that already exist in the
sleepable_lsm_hooks list, and are called in roughly similar points and
contexts. Presumably, making them OK to be also annotated as
sleepable.

Building a kernel with DEBUG_ATOMIC_SLEEP options enabled and running
reasonable workloads stimulating activity that would be intercepted by
such security hooks didn't show any splats.

Notably, I haven't added all the security_path_* LSM hooks that are
available as I don't need them at this point in time.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Acked-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/ZXM3IHHXpNY9y82a@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/bpf_lsm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 7d2f96413a57..63b4dc495125 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -304,6 +304,18 @@ BTF_ID(func, bpf_lsm_kernel_module_request)
 BTF_ID(func, bpf_lsm_kernel_read_file)
 BTF_ID(func, bpf_lsm_kernfs_init_security)
 
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_mkdir)
+BTF_ID(func, bpf_lsm_path_rmdir)
+BTF_ID(func, bpf_lsm_path_truncate)
+BTF_ID(func, bpf_lsm_path_symlink)
+BTF_ID(func, bpf_lsm_path_link)
+BTF_ID(func, bpf_lsm_path_rename)
+BTF_ID(func, bpf_lsm_path_chmod)
+BTF_ID(func, bpf_lsm_path_chown)
+#endif /* CONFIG_SECURITY_PATH */
+
 #ifdef CONFIG_KEYS
 BTF_ID(func, bpf_lsm_key_free)
 #endif /* CONFIG_KEYS */
-- 
cgit v1.2.3


From 2a0c6b41eec90c2a138ea8b574836744783c67ff Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 11 Dec 2023 16:34:47 +0800
Subject: bpf: Update the comments in maybe_wait_bpf_programs()

Since commit 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in
sleepable programs"), sleepable BPF program can also use map-in-map, but
maybe_wait_bpf_programs() doesn't handle it accordingly. The main reason
is that using synchronize_rcu_tasks_trace() to wait for the completions
of these sleepable BPF programs may incur a very long delay and
userspace may think it is hung, so the wait for sleepable BPF programs
is skipped. Update the comments in maybe_wait_bpf_programs() to reflect
the reason.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20231211083447.1921178-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 06320d9abf33..d63c1ed42412 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -142,9 +142,13 @@ static u32 bpf_map_value_size(const struct bpf_map *map)
 
 static void maybe_wait_bpf_programs(struct bpf_map *map)
 {
-	/* Wait for any running BPF programs to complete so that
-	 * userspace, when we return to it, knows that all programs
-	 * that could be running use the new map value.
+	/* Wait for any running non-sleepable BPF programs to complete so that
+	 * userspace, when we return to it, knows that all non-sleepable
+	 * programs that could be running use the new map value. For sleepable
+	 * BPF programs, synchronize_rcu_tasks_trace() should be used to wait
+	 * for the completions of these programs, but considering the waiting
+	 * time can be very long and userspace may think it will hang forever,
+	 * so don't handle sleepable BPF programs now.
 	 */
 	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
 	    map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
-- 
cgit v1.2.3


From 1cc111b9cddc71ce161cd388f11f0e9048edffdb Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Thu, 14 Dec 2023 09:21:53 +0800
Subject: tracing: Fix uaf issue when open the hist or hist_debug file

KASAN report following issue. The root cause is when opening 'hist'
file of an instance and accessing 'trace_event_file' in hist_show(),
but 'trace_event_file' has been freed due to the instance being removed.
'hist_debug' file has the same problem. To fix it, call
tracing_{open,release}_file_tr() in file_operations callback to have
the ref count and avoid 'trace_event_file' being freed.

  BUG: KASAN: slab-use-after-free in hist_show+0x11e0/0x1278
  Read of size 8 at addr ffff242541e336b8 by task head/190

  CPU: 4 PID: 190 Comm: head Not tainted 6.7.0-rc5-g26aff849438c #133
  Hardware name: linux,dummy-virt (DT)
  Call trace:
   dump_backtrace+0x98/0xf8
   show_stack+0x1c/0x30
   dump_stack_lvl+0x44/0x58
   print_report+0xf0/0x5a0
   kasan_report+0x80/0xc0
   __asan_report_load8_noabort+0x1c/0x28
   hist_show+0x11e0/0x1278
   seq_read_iter+0x344/0xd78
   seq_read+0x128/0x1c0
   vfs_read+0x198/0x6c8
   ksys_read+0xf4/0x1e0
   __arm64_sys_read+0x70/0xa8
   invoke_syscall+0x70/0x260
   el0_svc_common.constprop.0+0xb0/0x280
   do_el0_svc+0x44/0x60
   el0_svc+0x34/0x68
   el0t_64_sync_handler+0xb8/0xc0
   el0t_64_sync+0x168/0x170

  Allocated by task 188:
   kasan_save_stack+0x28/0x50
   kasan_set_track+0x28/0x38
   kasan_save_alloc_info+0x20/0x30
   __kasan_slab_alloc+0x6c/0x80
   kmem_cache_alloc+0x15c/0x4a8
   trace_create_new_event+0x84/0x348
   __trace_add_new_event+0x18/0x88
   event_trace_add_tracer+0xc4/0x1a0
   trace_array_create_dir+0x6c/0x100
   trace_array_create+0x2e8/0x568
   instance_mkdir+0x48/0x80
   tracefs_syscall_mkdir+0x90/0xe8
   vfs_mkdir+0x3c4/0x610
   do_mkdirat+0x144/0x200
   __arm64_sys_mkdirat+0x8c/0xc0
   invoke_syscall+0x70/0x260
   el0_svc_common.constprop.0+0xb0/0x280
   do_el0_svc+0x44/0x60
   el0_svc+0x34/0x68
   el0t_64_sync_handler+0xb8/0xc0
   el0t_64_sync+0x168/0x170

  Freed by task 191:
   kasan_save_stack+0x28/0x50
   kasan_set_track+0x28/0x38
   kasan_save_free_info+0x34/0x58
   __kasan_slab_free+0xe4/0x158
   kmem_cache_free+0x19c/0x508
   event_file_put+0xa0/0x120
   remove_event_file_dir+0x180/0x320
   event_trace_del_tracer+0xb0/0x180
   __remove_instance+0x224/0x508
   instance_rmdir+0x44/0x78
   tracefs_syscall_rmdir+0xbc/0x140
   vfs_rmdir+0x1cc/0x4c8
   do_rmdir+0x220/0x2b8
   __arm64_sys_unlinkat+0xc0/0x100
   invoke_syscall+0x70/0x260
   el0_svc_common.constprop.0+0xb0/0x280
   do_el0_svc+0x44/0x60
   el0_svc+0x34/0x68
   el0t_64_sync_handler+0xb8/0xc0
   el0t_64_sync+0x168/0x170

Link: https://lore.kernel.org/linux-trace-kernel/20231214012153.676155-1-zhengyejian1@huawei.com

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  6 ++++++
 kernel/trace/trace.h             |  1 +
 kernel/trace/trace_events_hist.c | 12 ++++++++----
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c79548f9574..199df497db07 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4968,6 +4968,12 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
+{
+	tracing_release_file_tr(inode, filp);
+	return single_release(inode, filp);
+}
+
 static int tracing_mark_open(struct inode *inode, struct file *filp)
 {
 	stream_open(inode, filp);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b7f4ea25a194..0489e72c8169 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -617,6 +617,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic_tr(struct inode *inode, struct file *filp);
 int tracing_open_file_tr(struct inode *inode, struct file *filp);
 int tracing_release_file_tr(struct inode *inode, struct file *filp);
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
 bool tracer_tracing_is_on(struct trace_array *tr);
 void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1abc07fba1b9..5ecf3c8bde20 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5623,10 +5623,12 @@ static int event_hist_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	ret = tracing_open_file_tr(inode, file);
 	if (ret)
 		return ret;
 
+	/* Clear private_data to avoid warning in single_open() */
+	file->private_data = NULL;
 	return single_open(file, hist_show, file);
 }
 
@@ -5634,7 +5636,7 @@ const struct file_operations event_hist_fops = {
 	.open = event_hist_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = tracing_single_release_file_tr,
 };
 
 #ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5900,10 +5902,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	ret = tracing_open_file_tr(inode, file);
 	if (ret)
 		return ret;
 
+	/* Clear private_data to avoid warning in single_open() */
+	file->private_data = NULL;
 	return single_open(file, hist_debug_show, file);
 }
 
@@ -5911,7 +5915,7 @@ const struct file_operations event_hist_debug_fops = {
 	.open = event_hist_debug_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = tracing_single_release_file_tr,
 };
 #endif
 
-- 
cgit v1.2.3


From 8f82583f9527b3be9d70d9a5d1f33435e29d0480 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Thu, 14 Dec 2023 12:30:09 +0800
Subject: bpf: Reduce the scope of rcu_read_lock when updating fd map

There is no rcu-read-lock requirement for ops->map_fd_get_ptr() or
ops->map_fd_put_ptr(), so doesn't use rcu-read-lock for these two
callbacks.

For bpf_fd_array_map_update_elem(), accessing array->ptrs doesn't need
rcu-read-lock because array->ptrs must still be allocated. For
bpf_fd_htab_map_update_elem(), htab_map_update_elem() only requires
rcu-read-lock to be held to avoid the WARN_ON_ONCE(), so only use
rcu_read_lock() during the invocation of htab_map_update_elem().

Acked-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231214043010.3458072-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/hashtab.c | 6 ++++++
 kernel/bpf/syscall.c | 4 ----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5b9146fa825f..ec3bdcc6a3cf 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -2523,7 +2523,13 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
 
+	/* The htab bucket lock is always held during update operations in fd
+	 * htab map, and the following rcu_read_lock() is only used to avoid
+	 * the WARN_ON_ONCE in htab_map_update_elem().
+	 */
+	rcu_read_lock();
 	ret = htab_map_update_elem(map, key, &ptr, map_flags);
+	rcu_read_unlock();
 	if (ret)
 		map->ops->map_fd_put_ptr(map, ptr, false);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d63c1ed42412..3fcf7741146a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -184,15 +184,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 		err = bpf_percpu_cgroup_storage_update(map, key, value,
 						       flags);
 	} else if (IS_FD_ARRAY(map)) {
-		rcu_read_lock();
 		err = bpf_fd_array_map_update_elem(map, map_file, key, value,
 						   flags);
-		rcu_read_unlock();
 	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		rcu_read_lock();
 		err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
 						  flags);
-		rcu_read_unlock();
 	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
 		/* rcu_read_lock() is not needed */
 		err = bpf_fd_reuseport_array_update_elem(map, key, value,
-- 
cgit v1.2.3


From dc68540913ac523b46ebda3843cec179362c7a72 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Thu, 14 Dec 2023 12:30:10 +0800
Subject: bpf: Use GFP_KERNEL in bpf_event_entry_gen()

rcu_read_lock() is no longer held when invoking bpf_event_entry_gen()
which is called by perf_event_fd_array_get_ptr(), so using GFP_KERNEL
instead of GFP_ATOMIC to reduce the possibility of failures due to
out-of-memory.

Acked-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231214043010.3458072-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8d365bda9a8b..b5ec24b3563e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1195,7 +1195,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
 {
 	struct bpf_event_entry *ee;
 
-	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
 	if (ee) {
 		ee->event = perf_file->private_data;
 		ee->perf_file = perf_file;
-- 
cgit v1.2.3


From 59e5791f59dd83e8aa72a4e74217eabb6e8cfd90 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 14 Dec 2023 12:38:15 -0800
Subject: bpf: Fix a race condition between btf_put() and map_free()

When running `./test_progs -j` in my local vm with latest kernel,
I once hit a kasan error like below:

  [ 1887.184724] BUG: KASAN: slab-use-after-free in bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.185599] Read of size 4 at addr ffff888106806910 by task kworker/u12:2/2830
  [ 1887.186498]
  [ 1887.186712] CPU: 3 PID: 2830 Comm: kworker/u12:2 Tainted: G           OEL     6.7.0-rc3-00699-g90679706d486-dirty #494
  [ 1887.188034] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
  [ 1887.189618] Workqueue: events_unbound bpf_map_free_deferred
  [ 1887.190341] Call Trace:
  [ 1887.190666]  <TASK>
  [ 1887.190949]  dump_stack_lvl+0xac/0xe0
  [ 1887.191423]  ? nf_tcp_handle_invalid+0x1b0/0x1b0
  [ 1887.192019]  ? panic+0x3c0/0x3c0
  [ 1887.192449]  print_report+0x14f/0x720
  [ 1887.192930]  ? preempt_count_sub+0x1c/0xd0
  [ 1887.193459]  ? __virt_addr_valid+0xac/0x120
  [ 1887.194004]  ? bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.194572]  kasan_report+0xc3/0x100
  [ 1887.195085]  ? bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.195668]  bpf_rb_root_free+0x1f8/0x2b0
  [ 1887.196183]  ? __bpf_obj_drop_impl+0xb0/0xb0
  [ 1887.196736]  ? preempt_count_sub+0x1c/0xd0
  [ 1887.197270]  ? preempt_count_sub+0x1c/0xd0
  [ 1887.197802]  ? _raw_spin_unlock+0x1f/0x40
  [ 1887.198319]  bpf_obj_free_fields+0x1d4/0x260
  [ 1887.198883]  array_map_free+0x1a3/0x260
  [ 1887.199380]  bpf_map_free_deferred+0x7b/0xe0
  [ 1887.199943]  process_scheduled_works+0x3a2/0x6c0
  [ 1887.200549]  worker_thread+0x633/0x890
  [ 1887.201047]  ? __kthread_parkme+0xd7/0xf0
  [ 1887.201574]  ? kthread+0x102/0x1d0
  [ 1887.202020]  kthread+0x1ab/0x1d0
  [ 1887.202447]  ? pr_cont_work+0x270/0x270
  [ 1887.202954]  ? kthread_blkcg+0x50/0x50
  [ 1887.203444]  ret_from_fork+0x34/0x50
  [ 1887.203914]  ? kthread_blkcg+0x50/0x50
  [ 1887.204397]  ret_from_fork_asm+0x11/0x20
  [ 1887.204913]  </TASK>
  [ 1887.204913]  </TASK>
  [ 1887.205209]
  [ 1887.205416] Allocated by task 2197:
  [ 1887.205881]  kasan_set_track+0x3f/0x60
  [ 1887.206366]  __kasan_kmalloc+0x6e/0x80
  [ 1887.206856]  __kmalloc+0xac/0x1a0
  [ 1887.207293]  btf_parse_fields+0xa15/0x1480
  [ 1887.207836]  btf_parse_struct_metas+0x566/0x670
  [ 1887.208387]  btf_new_fd+0x294/0x4d0
  [ 1887.208851]  __sys_bpf+0x4ba/0x600
  [ 1887.209292]  __x64_sys_bpf+0x41/0x50
  [ 1887.209762]  do_syscall_64+0x4c/0xf0
  [ 1887.210222]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
  [ 1887.210868]
  [ 1887.211074] Freed by task 36:
  [ 1887.211460]  kasan_set_track+0x3f/0x60
  [ 1887.211951]  kasan_save_free_info+0x28/0x40
  [ 1887.212485]  ____kasan_slab_free+0x101/0x180
  [ 1887.213027]  __kmem_cache_free+0xe4/0x210
  [ 1887.213514]  btf_free+0x5b/0x130
  [ 1887.213918]  rcu_core+0x638/0xcc0
  [ 1887.214347]  __do_softirq+0x114/0x37e

The error happens at bpf_rb_root_free+0x1f8/0x2b0:

  00000000000034c0 <bpf_rb_root_free>:
  ; {
    34c0: f3 0f 1e fa                   endbr64
    34c4: e8 00 00 00 00                callq   0x34c9 <bpf_rb_root_free+0x9>
    34c9: 55                            pushq   %rbp
    34ca: 48 89 e5                      movq    %rsp, %rbp
  ...
  ;       if (rec && rec->refcount_off >= 0 &&
    36aa: 4d 85 ed                      testq   %r13, %r13
    36ad: 74 a9                         je      0x3658 <bpf_rb_root_free+0x198>
    36af: 49 8d 7d 10                   leaq    0x10(%r13), %rdi
    36b3: e8 00 00 00 00                callq   0x36b8 <bpf_rb_root_free+0x1f8>
                                        <==== kasan function
    36b8: 45 8b 7d 10                   movl    0x10(%r13), %r15d
                                        <==== use-after-free load
    36bc: 45 85 ff                      testl   %r15d, %r15d
    36bf: 78 8c                         js      0x364d <bpf_rb_root_free+0x18d>

So the problem is at rec->refcount_off in the above.

I did some source code analysis and find the reason.
                                  CPU A                        CPU B
  bpf_map_put:
    ...
    btf_put with rcu callback
    ...
    bpf_map_free_deferred
      with system_unbound_wq
    ...                          ...                           ...
    ...                          btf_free_rcu:                 ...
    ...                          ...                           bpf_map_free_deferred:
    ...                          ...
    ...         --------->       btf_struct_metas_free()
    ...         | race condition ...
    ...         --------->                                     map->ops->map_free()
    ...
    ...                          btf->struct_meta_tab = NULL

In the above, map_free() corresponds to array_map_free() and eventually
calling bpf_rb_root_free() which calls:
  ...
  __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
  ...

Here, 'value_rec' is assigned in btf_check_and_fixup_fields() with following code:

  meta = btf_find_struct_meta(btf, btf_id);
  if (!meta)
    return -EFAULT;
  rec->fields[i].graph_root.value_rec = meta->record;

So basically, 'value_rec' is a pointer to the record in struct_metas_tab.
And it is possible that that particular record has been freed by
btf_struct_metas_free() and hence we have a kasan error here.

Actually it is very hard to reproduce the failure with current bpf/bpf-next
code, I only got the above error once. To increase reproducibility, I added
a delay in bpf_map_free_deferred() to delay map->ops->map_free(), which
significantly increased reproducibility.

  diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
  index 5e43ddd1b83f..aae5b5213e93 100644
  --- a/kernel/bpf/syscall.c
  +++ b/kernel/bpf/syscall.c
  @@ -695,6 +695,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
        struct bpf_map *map = container_of(work, struct bpf_map, work);
        struct btf_record *rec = map->record;

  +     mdelay(100);
        security_bpf_map_free(map);
        bpf_map_release_memcg(map);
        /* implementation dependent freeing */

Hao also provided test cases ([1]) for easily reproducing the above issue.

There are two ways to fix the issue, the v1 of the patch ([2]) moving
btf_put() after map_free callback, and the v5 of the patch ([3]) using
a kptr style fix which tries to get a btf reference during
map_check_btf(). Each approach has its pro and cons. The first approach
delays freeing btf while the second approach needs to acquire reference
depending on context which makes logic not very elegant and may
complicate things with future new data structures. Alexei
suggested in [4] going back to v1 which is what this patch
tries to do.

Rerun './test_progs -j' with the above mdelay() hack for a couple
of times and didn't observe the error for the above rb_root test cases.
Running Hou's test ([1]) is also successful.

  [1] https://lore.kernel.org/bpf/20231207141500.917136-1-houtao@huaweicloud.com/
  [2] v1: https://lore.kernel.org/bpf/20231204173946.3066377-1-yonghong.song@linux.dev/
  [3] v5: https://lore.kernel.org/bpf/20231208041621.2968241-1-yonghong.song@linux.dev/
  [4] v4: https://lore.kernel.org/bpf/CAADnVQJ3FiXUhZJwX_81sjZvSYYKCFB3BT6P8D59RS2Gu+0Z7g@mail.gmail.com/

Cc: Hou Tao <houtao@huaweicloud.com>
Fixes: 958cf2e273f0 ("bpf: Introduce bpf_obj_new")
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231214203815.1469107-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3fcf7741146a..8faa1a20edf8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -692,6 +692,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 	struct btf_record *rec = map->record;
+	struct btf *btf = map->btf;
 
 	security_bpf_map_free(map);
 	bpf_map_release_memcg(map);
@@ -707,6 +708,10 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	 * template bpf_map struct used during verification.
 	 */
 	btf_record_free(rec);
+	/* Delay freeing of btf for maps, as map_free callback may need
+	 * struct_meta info which will be freed with btf_put().
+	 */
+	btf_put(btf);
 }
 
 static void bpf_map_put_uref(struct bpf_map *map)
@@ -747,7 +752,6 @@ void bpf_map_put(struct bpf_map *map)
 	if (atomic64_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map);
-		btf_put(map->btf);
 
 		WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
 		if (READ_ONCE(map->free_after_mult_rcu_gp))
-- 
cgit v1.2.3


From c5707b2146d229691e193d5158ea70b21b8ba180 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 14:50:15 -0800
Subject: bpf: support symbolic BPF FS delegation mount options

Besides already supported special "any" value and hex bit mask, support
string-based parsing of delegation masks based on exact enumerator
names. Utilize BTF information of `enum bpf_cmd`, `enum bpf_map_type`,
`enum bpf_prog_type`, and `enum bpf_attach_type` types to find supported
symbolic names (ignoring __MAX_xxx guard values and stripping repetitive
prefixes like BPF_ for cmd and attach types, BPF_MAP_TYPE_ for maps, and
BPF_PROG_TYPE_ for prog types). The case doesn't matter, but it is
normalized to lower case in mount option output. So "PROG_LOAD",
"prog_load", and "MAP_create" are all valid values to specify for
delegate_cmds options, "array" is among supported for map types, etc.

Besides supporting string values, we also support multiple values
specified at the same time, using colon (':') separator.

There are corresponding changes on bpf_show_options side to use known
values to print them in human-readable format, falling back to hex mask
printing, if there are any unrecognized bits. This shouldn't be
necessary when enum BTF information is present, but in general we should
always be able to fall back to this even if kernel was built without BTF.
As mentioned, emitted symbolic names are normalized to be all lower case.

Example below shows various ways to specify delegate_cmds options
through mount command and how mount options are printed back:

12/14 14:39:07.604
vmuser@archvm:~/local/linux/tools/testing/selftests/bpf
$ mount | rg token

  $ sudo mkdir -p /sys/fs/bpf/token
  $ sudo mount -t bpf bpffs /sys/fs/bpf/token \
               -o delegate_cmds=prog_load:MAP_CREATE \
               -o delegate_progs=kprobe \
               -o delegate_attachs=xdp
  $ mount | grep token
  bpffs on /sys/fs/bpf/token type bpf (rw,relatime,delegate_cmds=map_create:prog_load,delegate_progs=kprobe,delegate_attachs=xdp)

Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231214225016.1209867-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/inode.c | 249 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 211 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0a8e1188ea46..4383b3d13a55 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -595,6 +595,136 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
 }
 EXPORT_SYMBOL(bpf_prog_get_type_path);
 
+struct bpffs_btf_enums {
+	const struct btf *btf;
+	const struct btf_type *cmd_t;
+	const struct btf_type *map_t;
+	const struct btf_type *prog_t;
+	const struct btf_type *attach_t;
+};
+
+static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
+{
+	const struct btf *btf;
+	const struct btf_type *t;
+	const char *name;
+	int i, n;
+
+	memset(info, 0, sizeof(*info));
+
+	btf = bpf_get_btf_vmlinux();
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+	if (!btf)
+		return -ENOENT;
+
+	info->btf = btf;
+
+	for (i = 1, n = btf_nr_types(btf); i < n; i++) {
+		t = btf_type_by_id(btf, i);
+		if (!btf_type_is_enum(t))
+			continue;
+
+		name = btf_name_by_offset(btf, t->name_off);
+		if (!name)
+			continue;
+
+		if (strcmp(name, "bpf_cmd") == 0)
+			info->cmd_t = t;
+		else if (strcmp(name, "bpf_map_type") == 0)
+			info->map_t = t;
+		else if (strcmp(name, "bpf_prog_type") == 0)
+			info->prog_t = t;
+		else if (strcmp(name, "bpf_attach_type") == 0)
+			info->attach_t = t;
+		else
+			continue;
+
+		if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
+			return 0;
+	}
+
+	return -ESRCH;
+}
+
+static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
+				const char *prefix, const char *str, int *value)
+{
+	const struct btf_enum *e;
+	const char *name;
+	int i, n, pfx_len = strlen(prefix);
+
+	*value = 0;
+
+	if (!btf || !enum_t)
+		return false;
+
+	for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+		e = &btf_enum(enum_t)[i];
+
+		name = btf_name_by_offset(btf, e->name_off);
+		if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+			continue;
+
+		/* match symbolic name case insensitive and ignoring prefix */
+		if (strcasecmp(name + pfx_len, str) == 0) {
+			*value = e->val;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static void seq_print_delegate_opts(struct seq_file *m,
+				    const char *opt_name,
+				    const struct btf *btf,
+				    const struct btf_type *enum_t,
+				    const char *prefix,
+				    u64 delegate_msk, u64 any_msk)
+{
+	const struct btf_enum *e;
+	bool first = true;
+	const char *name;
+	u64 msk;
+	int i, n, pfx_len = strlen(prefix);
+
+	delegate_msk &= any_msk; /* clear unknown bits */
+
+	if (delegate_msk == 0)
+		return;
+
+	seq_printf(m, ",%s", opt_name);
+	if (delegate_msk == any_msk) {
+		seq_printf(m, "=any");
+		return;
+	}
+
+	if (btf && enum_t) {
+		for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
+			e = &btf_enum(enum_t)[i];
+			name = btf_name_by_offset(btf, e->name_off);
+			if (!name || strncasecmp(name, prefix, pfx_len) != 0)
+				continue;
+			msk = 1ULL << e->val;
+			if (delegate_msk & msk) {
+				/* emit lower-case name without prefix */
+				seq_printf(m, "%c", first ? '=' : ':');
+				name += pfx_len;
+				while (*name) {
+					seq_printf(m, "%c", tolower(*name));
+					name++;
+				}
+
+				delegate_msk &= ~msk;
+				first = false;
+			}
+		}
+	}
+	if (delegate_msk)
+		seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
+}
+
 /*
  * Display the mount options in /proc/mounts.
  */
@@ -614,29 +744,34 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 
-	mask = (1ULL << __MAX_BPF_CMD) - 1;
-	if ((opts->delegate_cmds & mask) == mask)
-		seq_printf(m, ",delegate_cmds=any");
-	else if (opts->delegate_cmds)
-		seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds);
-
-	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
-	if ((opts->delegate_maps & mask) == mask)
-		seq_printf(m, ",delegate_maps=any");
-	else if (opts->delegate_maps)
-		seq_printf(m, ",delegate_maps=0x%llx", opts->delegate_maps);
-
-	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
-	if ((opts->delegate_progs & mask) == mask)
-		seq_printf(m, ",delegate_progs=any");
-	else if (opts->delegate_progs)
-		seq_printf(m, ",delegate_progs=0x%llx", opts->delegate_progs);
-
-	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
-	if ((opts->delegate_attachs & mask) == mask)
-		seq_printf(m, ",delegate_attachs=any");
-	else if (opts->delegate_attachs)
-		seq_printf(m, ",delegate_attachs=0x%llx", opts->delegate_attachs);
+	if (opts->delegate_cmds || opts->delegate_maps ||
+	    opts->delegate_progs || opts->delegate_attachs) {
+		struct bpffs_btf_enums info;
+
+		/* ignore errors, fallback to hex */
+		(void)find_bpffs_btf_enums(&info);
+
+		mask = (1ULL << __MAX_BPF_CMD) - 1;
+		seq_print_delegate_opts(m, "delegate_cmds",
+					info.btf, info.cmd_t, "BPF_",
+					opts->delegate_cmds, mask);
+
+		mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
+		seq_print_delegate_opts(m, "delegate_maps",
+					info.btf, info.map_t, "BPF_MAP_TYPE_",
+					opts->delegate_maps, mask);
+
+		mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
+		seq_print_delegate_opts(m, "delegate_progs",
+					info.btf, info.prog_t, "BPF_PROG_TYPE_",
+					opts->delegate_progs, mask);
+
+		mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
+		seq_print_delegate_opts(m, "delegate_attachs",
+					info.btf, info.attach_t, "BPF_",
+					opts->delegate_attachs, mask);
+	}
+
 	return 0;
 }
 
@@ -686,7 +821,6 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	kuid_t uid;
 	kgid_t gid;
 	int opt, err;
-	u64 msk;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
 	if (opt < 0) {
@@ -741,24 +875,63 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	case OPT_DELEGATE_CMDS:
 	case OPT_DELEGATE_MAPS:
 	case OPT_DELEGATE_PROGS:
-	case OPT_DELEGATE_ATTACHS:
-		if (strcmp(param->string, "any") == 0) {
-			msk = ~0ULL;
-		} else {
-			err = kstrtou64(param->string, 0, &msk);
-			if (err)
-				return err;
+	case OPT_DELEGATE_ATTACHS: {
+		struct bpffs_btf_enums info;
+		const struct btf_type *enum_t;
+		const char *enum_pfx;
+		u64 *delegate_msk, msk = 0;
+		char *p;
+		int val;
+
+		/* ignore errors, fallback to hex */
+		(void)find_bpffs_btf_enums(&info);
+
+		switch (opt) {
+		case OPT_DELEGATE_CMDS:
+			delegate_msk = &opts->delegate_cmds;
+			enum_t = info.cmd_t;
+			enum_pfx = "BPF_";
+			break;
+		case OPT_DELEGATE_MAPS:
+			delegate_msk = &opts->delegate_maps;
+			enum_t = info.map_t;
+			enum_pfx = "BPF_MAP_TYPE_";
+			break;
+		case OPT_DELEGATE_PROGS:
+			delegate_msk = &opts->delegate_progs;
+			enum_t = info.prog_t;
+			enum_pfx = "BPF_PROG_TYPE_";
+			break;
+		case OPT_DELEGATE_ATTACHS:
+			delegate_msk = &opts->delegate_attachs;
+			enum_t = info.attach_t;
+			enum_pfx = "BPF_";
+			break;
+		default:
+			return -EINVAL;
 		}
+
+		while ((p = strsep(&param->string, ":"))) {
+			if (strcmp(p, "any") == 0) {
+				msk |= ~0ULL;
+			} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
+				msk |= 1ULL << val;
+			} else {
+				err = kstrtou64(p, 0, &msk);
+				if (err)
+					return err;
+			}
+		}
+
 		/* Setting delegation mount options requires privileges */
 		if (msk && !capable(CAP_SYS_ADMIN))
 			return -EPERM;
-		switch (opt) {
-		case OPT_DELEGATE_CMDS: opts->delegate_cmds |= msk; break;
-		case OPT_DELEGATE_MAPS: opts->delegate_maps |= msk; break;
-		case OPT_DELEGATE_PROGS: opts->delegate_progs |= msk; break;
-		case OPT_DELEGATE_ATTACHS: opts->delegate_attachs |= msk; break;
-		default: return -EINVAL;
-		}
+
+		*delegate_msk |= msk;
+		break;
+	}
+	default:
+		/* ignore unknown mount options */
 		break;
 	}
 
-- 
cgit v1.2.3


From 7489723c2e26504573dbb49b66bbc59092840008 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Thu, 14 Dec 2023 15:56:25 -0700
Subject: bpf: xdp: Register generic_kfunc_set with XDP programs

Registering generic_kfunc_set with XDP programs enables some of the
newer BPF features inside XDP -- namely tree based data structures and
BPF exceptions.

The current motivation for this commit is to enable assertions inside
XDP bpf progs. Assertions are a standard and useful tool to encode
intent.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/d07d4614b81ca6aada44fcb89bb6b618fb66e4ca.1702594357.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b3be5742d6f1..b0b485126a76 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2630,6 +2630,7 @@ static int __init kfunc_init(void)
 
 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
 						  ARRAY_SIZE(generic_dtors),
-- 
cgit v1.2.3


From 4ad4c1f394b84f9941a10aa8aaf11102478a390b Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 24 Nov 2023 18:10:03 +0000
Subject: dma-mapping: don't store redundant offsets

A bus_dma_region necessarily stores both CPU and DMA base addresses for
a range, so there's no need to also store the difference between them.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/acpi/scan.c        |  1 -
 drivers/of/address.c       |  1 -
 include/linux/dma-direct.h | 19 ++++++++++++-------
 kernel/dma/direct.c        |  1 -
 4 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 02bb2cce423f..ee88a727f200 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1532,7 +1532,6 @@ int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map)
 			r->cpu_start = rentry->res->start;
 			r->dma_start = rentry->res->start - rentry->offset;
 			r->size = resource_size(rentry->res);
-			r->offset = rentry->offset;
 			r++;
 		}
 	}
diff --git a/drivers/of/address.c b/drivers/of/address.c
index b59956310f66..ae46a3605904 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -955,7 +955,6 @@ int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map)
 		r->cpu_start = range.cpu_addr;
 		r->dma_start = range.bus_addr;
 		r->size = range.size;
-		r->offset = range.cpu_addr - range.bus_addr;
 		r++;
 	}
 out:
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 18aade195884..3eb3589ff43e 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -21,7 +21,6 @@ struct bus_dma_region {
 	phys_addr_t	cpu_start;
 	dma_addr_t	dma_start;
 	u64		size;
-	u64		offset;
 };
 
 static inline dma_addr_t translate_phys_to_dma(struct device *dev,
@@ -29,9 +28,12 @@ static inline dma_addr_t translate_phys_to_dma(struct device *dev,
 {
 	const struct bus_dma_region *m;
 
-	for (m = dev->dma_range_map; m->size; m++)
-		if (paddr >= m->cpu_start && paddr - m->cpu_start < m->size)
-			return (dma_addr_t)paddr - m->offset;
+	for (m = dev->dma_range_map; m->size; m++) {
+		u64 offset = paddr - m->cpu_start;
+
+		if (paddr >= m->cpu_start && offset < m->size)
+			return m->dma_start + offset;
+	}
 
 	/* make sure dma_capable fails when no translation is available */
 	return DMA_MAPPING_ERROR;
@@ -42,9 +44,12 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev,
 {
 	const struct bus_dma_region *m;
 
-	for (m = dev->dma_range_map; m->size; m++)
-		if (dma_addr >= m->dma_start && dma_addr - m->dma_start < m->size)
-			return (phys_addr_t)dma_addr + m->offset;
+	for (m = dev->dma_range_map; m->size; m++) {
+		u64 offset = dma_addr - m->dma_start;
+
+		if (dma_addr >= m->dma_start && offset < m->size)
+			return m->cpu_start + offset;
+	}
 
 	return (phys_addr_t)-1;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 73c95815789a..98b2e192fd69 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -677,7 +677,6 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
 		return -ENOMEM;
 	map[0].cpu_start = cpu_start;
 	map[0].dma_start = dma_start;
-	map[0].offset = offset;
 	map[0].size = size;
 	dev->dma_range_map = map;
 	return 0;
-- 
cgit v1.2.3


From 55c543865b76830f524ce5ce1243266a93d06f84 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Date: Fri, 1 Dec 2023 13:13:52 +0100
Subject: swiotlb: reduce area lock contention for non-primary IO TLB pools

If multiple areas and multiple IO TLB pools exist, first iterate the
current CPU specific area in all pools. Then move to the next area index.

This is best illustrated by a diagram:

        area 0 |  area 1 | ... | area M |
pool 0    A         B              C
pool 1    D         E
...
pool N    F         G              H

Currently, each pool is searched before moving on to the next pool,
i.e. the search order is A, B ... C, D, E ... F, G ... H. With this patch,
each area is searched in all pools before moving on to the next area,
i.e. the search order is A, D ... F, B, E ... G ... C ... H.

Note that preemption is not disabled, and raw_smp_processor_id() may not
return a stable result, but it is called only once to determine the initial
area index. The search will iterate over all areas eventually, even if the
current task is preempted.

Next, some pools may have less (but not more) areas than default_nareas.
Skip such pools if the distance from the initial area index is greater than
pool->nareas. This logic ensures that for every pool the search starts in
the initial CPU's own area and never tries any area twice.

To verify performance impact, I booted the kernel with a minimum pool
size ("swiotlb=512,4,force"), so multiple pools get allocated, and I ran
these benchmarks:

- small: single-threaded I/O of 4 KiB blocks,
- big: single-threaded I/O of 64 KiB blocks,
- 4way: 4-way parallel I/O of 4 KiB blocks.

The "var" column in the tables below is the coefficient of variance over 5
runs of the test, the "diff" column is the relative difference against base
in read-write I/O bandwidth (MiB/s).

Tested on an x86 VM against a QEMU virtio SATA driver backed by a RAM-based
block device on the host:

	base	   patched
	var	var	diff
small	0.69%	0.62%	+25.4%
big	2.14%	2.27%	+25.7%
4way	2.65%	1.70%	+23.6%

Tested on a Raspberry Pi against a class-10 A1 microSD card:

	base	   patched
	var	var	diff
small	0.53%	1.96%	-0.3%
big	0.02%	0.57%	+0.8%
4way	6.17%	0.40%	+0.3%

These results confirm that there is significant performance boost in the
software IO TLB slot allocation itself. Where performance is dominated by
actual hardware, there is no measurable change.

Signed-off-by: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Reviewed-by: Mirsad Todorovac <mirsad.todorovac@alu.unizg.hr>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 90 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 33d942615be5..e20b856255ef 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -957,7 +957,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
 #endif /* CONFIG_DEBUG_FS */
 
 /**
- * swiotlb_area_find_slots() - search for slots in one IO TLB memory area
+ * swiotlb_search_pool_area() - search one memory area in one pool
  * @dev:	Device which maps the buffer.
  * @pool:	Memory pool to be searched.
  * @area_index:	Index of the IO TLB memory area to be searched.
@@ -972,7 +972,7 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots)
  *
  * Return: Index of the first allocated slot, or -1 on error.
  */
-static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool,
+static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool,
 		int area_index, phys_addr_t orig_addr, size_t alloc_size,
 		unsigned int alloc_align_mask)
 {
@@ -1066,41 +1066,50 @@ found:
 	return slot_index;
 }
 
+#ifdef CONFIG_SWIOTLB_DYNAMIC
+
 /**
- * swiotlb_pool_find_slots() - search for slots in one memory pool
+ * swiotlb_search_area() - search one memory area in all pools
  * @dev:	Device which maps the buffer.
- * @pool:	Memory pool to be searched.
+ * @start_cpu:	Start CPU number.
+ * @cpu_offset:	Offset from @start_cpu.
  * @orig_addr:	Original (non-bounced) IO buffer address.
  * @alloc_size: Total requested size of the bounce buffer,
  *		including initial alignment padding.
  * @alloc_align_mask:	Required alignment of the allocated buffer.
+ * @retpool:	Used memory pool, updated on return.
  *
- * Search through one memory pool to find a sequence of slots that match the
+ * Search one memory area in all pools for a sequence of slots that match the
  * allocation constraints.
  *
  * Return: Index of the first allocated slot, or -1 on error.
  */
-static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool,
-		phys_addr_t orig_addr, size_t alloc_size,
-		unsigned int alloc_align_mask)
+static int swiotlb_search_area(struct device *dev, int start_cpu,
+		int cpu_offset, phys_addr_t orig_addr, size_t alloc_size,
+		unsigned int alloc_align_mask, struct io_tlb_pool **retpool)
 {
-	int start = raw_smp_processor_id() & (pool->nareas - 1);
-	int i = start, index;
-
-	do {
-		index = swiotlb_area_find_slots(dev, pool, i, orig_addr,
-						alloc_size, alloc_align_mask);
-		if (index >= 0)
-			return index;
-		if (++i >= pool->nareas)
-			i = 0;
-	} while (i != start);
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	struct io_tlb_pool *pool;
+	int area_index;
+	int index = -1;
 
-	return -1;
+	rcu_read_lock();
+	list_for_each_entry_rcu(pool, &mem->pools, node) {
+		if (cpu_offset >= pool->nareas)
+			continue;
+		area_index = (start_cpu + cpu_offset) & (pool->nareas - 1);
+		index = swiotlb_search_pool_area(dev, pool, area_index,
+						 orig_addr, alloc_size,
+						 alloc_align_mask);
+		if (index >= 0) {
+			*retpool = pool;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return index;
 }
 
-#ifdef CONFIG_SWIOTLB_DYNAMIC
-
 /**
  * swiotlb_find_slots() - search for slots in the whole swiotlb
  * @dev:	Device which maps the buffer.
@@ -1124,18 +1133,17 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	unsigned long nslabs;
 	unsigned long flags;
 	u64 phys_limit;
+	int cpu, i;
 	int index;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(pool, &mem->pools, node) {
-		index = swiotlb_pool_find_slots(dev, pool, orig_addr,
-						alloc_size, alloc_align_mask);
-		if (index >= 0) {
-			rcu_read_unlock();
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < default_nareas; ++i) {
+		index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
+					    alloc_align_mask, &pool);
+		if (index >= 0)
 			goto found;
-		}
 	}
-	rcu_read_unlock();
+
 	if (!mem->can_grow)
 		return -1;
 
@@ -1148,8 +1156,8 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	if (!pool)
 		return -1;
 
-	index = swiotlb_pool_find_slots(dev, pool, orig_addr,
-					alloc_size, alloc_align_mask);
+	index = swiotlb_search_pool_area(dev, pool, 0, orig_addr,
+					 alloc_size, alloc_align_mask);
 	if (index < 0) {
 		swiotlb_dyn_free(&pool->rcu);
 		return -1;
@@ -1192,9 +1200,21 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 		size_t alloc_size, unsigned int alloc_align_mask,
 		struct io_tlb_pool **retpool)
 {
-	*retpool = &dev->dma_io_tlb_mem->defpool;
-	return swiotlb_pool_find_slots(dev, *retpool,
-				       orig_addr, alloc_size, alloc_align_mask);
+	struct io_tlb_pool *pool;
+	int start, i;
+	int index;
+
+	*retpool = pool = &dev->dma_io_tlb_mem->defpool;
+	i = start = raw_smp_processor_id() & (pool->nareas - 1);
+	do {
+		index = swiotlb_search_pool_area(dev, pool, i, orig_addr,
+						 alloc_size, alloc_align_mask);
+		if (index >= 0)
+			return index;
+		if (++i >= pool->nareas)
+			i = 0;
+	} while (i != start);
+	return -1;
 }
 
 #endif /* CONFIG_SWIOTLB_DYNAMIC */
-- 
cgit v1.2.3


From b07bc2347672cc8c7293c64499f1488278c5ca3d Mon Sep 17 00:00:00 2001
From: Joakim Zhang <joakim.zhang@cixtech.com>
Date: Thu, 14 Dec 2023 16:25:26 +0800
Subject: dma-mapping: clear dev->dma_mem to NULL after freeing it

Reproduced with below sequence:
dma_declare_coherent_memory()->dma_release_coherent_memory()
->dma_declare_coherent_memory()->"return -EBUSY" error

It will return -EBUSY from the dma_assign_coherent_memory()
in dma_declare_coherent_memory(), the reason is that dev->dma_mem
pointer has not been set to NULL after it's freed.

Fixes: cf65a0f6f6ff ("dma-mapping: move all DMA mapping code to kernel/dma")
Signed-off-by: Joakim Zhang <joakim.zhang@cixtech.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/coherent.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index c21abc77c53e..ff5683a57f77 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -132,8 +132,10 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 
 void dma_release_coherent_memory(struct device *dev)
 {
-	if (dev)
+	if (dev) {
 		_dma_release_coherent_memory(dev->dma_mem);
+		dev->dma_mem = NULL;
+	}
 }
 
 static void *__dma_alloc_from_coherent(struct device *dev,
-- 
cgit v1.2.3


From 7e2c1e4b34f07d9aa8937fab88359d4a0fce468e Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 15 Dec 2023 11:24:50 +0000
Subject: perf: Fix perf_event_validate_size() lockdep splat

When lockdep is enabled, the for_each_sibling_event(sibling, event)
macro checks that event->ctx->mutex is held. When creating a new group
leader event, we call perf_event_validate_size() on a partially
initialized event where event->ctx is NULL, and so when
for_each_sibling_event() attempts to check event->ctx->mutex, we get a
splat, as reported by Lucas De Marchi:

  WARNING: CPU: 8 PID: 1471 at kernel/events/core.c:1950 __do_sys_perf_event_open+0xf37/0x1080

This only happens for a new event which is its own group_leader, and in
this case there cannot be any sibling events. Thus it's safe to skip the
check for siblings, which avoids having to make invasive and ugly
changes to for_each_sibling_event().

Avoid the splat by bailing out early when the new event is its own
group_leader.

Fixes: 382c27f4ed28f803 ("perf: Fix perf_event_validate_size()")
Closes: https://lore.kernel.org/lkml/20231214000620.3081018-1-lucas.demarchi@intel.com/
Closes: https://lore.kernel.org/lkml/ZXpm6gQ%2Fd59jGsuW@xpf.sh.intel.com/
Reported-by: Lucas De Marchi <lucas.demarchi@intel.com>
Reported-by: Pengfei Xu <pengfei.xu@intel.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20231215112450.3972309-1-mark.rutland@arm.com
---
 kernel/events/core.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c9d123e13b57..9efd0d7775e7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1947,6 +1947,16 @@ static bool perf_event_validate_size(struct perf_event *event)
 				   group_leader->nr_siblings + 1) > 16*1024)
 		return false;
 
+	/*
+	 * When creating a new group leader, group_leader->ctx is initialized
+	 * after the size has been validated, but we cannot safely use
+	 * for_each_sibling_event() until group_leader->ctx is set. A new group
+	 * leader cannot have any siblings yet, so we can safely skip checking
+	 * the non-existent siblings.
+	 */
+	if (event == group_leader)
+		return true;
+
 	for_each_sibling_event(sibling, group_leader) {
 		if (__perf_event_read_size(sibling->attr.read_format,
 					   group_leader->nr_siblings + 1) > 16*1024)
-- 
cgit v1.2.3


From 0c4cae1bc00d31c78858c184ede351baea232bdb Mon Sep 17 00:00:00 2001
From: Chris Feng <chris.feng@mediatek.com>
Date: Wed, 13 Dec 2023 16:32:51 +0800
Subject: PM: hibernate: Avoid missing wakeup events during hibernation

Wakeup events that occur in the hibernation process's
hibernation_platform_enter() cannot wake up the system. Although the
current hibernation framework will execute part of the recovery process
after a wakeup event occurs, it ultimately performs a shutdown operation
because the system does not check the return value of
hibernation_platform_enter(). In short, if a wakeup event occurs before
putting the system into the final low-power state, it will be missed.

To solve this problem, check the return value of
hibernation_platform_enter(). When it returns -EAGAIN or -EBUSY (indicate
the occurrence of a wakeup event), execute the hibernation recovery
process, discard the previously saved image, and ultimately return to the
working state.

Signed-off-by: Chris Feng <chris.feng@mediatek.com>
[ rjw: Rephrase the message printed when going back to the working state ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 10 ++++++++--
 kernel/power/power.h     |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index dee341ae4ace..4b0b7cf2e019 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -642,9 +642,9 @@ int hibernation_platform_enter(void)
  */
 static void power_down(void)
 {
-#ifdef CONFIG_SUSPEND
 	int error;
 
+#ifdef CONFIG_SUSPEND
 	if (hibernation_mode == HIBERNATION_SUSPEND) {
 		error = suspend_devices_and_enter(mem_sleep_current);
 		if (error) {
@@ -667,7 +667,13 @@ static void power_down(void)
 		kernel_restart(NULL);
 		break;
 	case HIBERNATION_PLATFORM:
-		hibernation_platform_enter();
+		error = hibernation_platform_enter();
+		if (error == -EAGAIN || error == -EBUSY) {
+			swsusp_unmark();
+			events_check_enabled = false;
+			pr_info("Wakeup event detected during hibernation, rolling back.\n");
+			return;
+		}
 		fallthrough;
 	case HIBERNATION_SHUTDOWN:
 		if (kernel_can_power_off())
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 17fd9aaaf084..8499a39c62f4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -175,6 +175,8 @@ extern int swsusp_write(unsigned int flags);
 void swsusp_close(void);
 #ifdef CONFIG_SUSPEND
 extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
 #endif
 
 struct __kernel_old_timeval;
-- 
cgit v1.2.3


From 71cd7e80cfde548959952eac7063aeaea1f2e1c6 Mon Sep 17 00:00:00 2001
From: Hongchen Zhang <zhanghongchen@loongson.cn>
Date: Thu, 16 Nov 2023 08:56:09 +0800
Subject: PM: hibernate: Enforce ordering during image
 compression/decompression

An S4 (suspend to disk) test on the LoongArch 3A6000 platform sometimes
fails with the following error messaged in the dmesg log:

	Invalid LZO compressed length

That happens because when compressing/decompressing the image, the
synchronization between the control thread and the compress/decompress/crc
thread is based on a relaxed ordering interface, which is unreliable, and the
following situation may occur:

CPU 0					CPU 1
save_image_lzo				lzo_compress_threadfn
					  atomic_set(&d->stop, 1);
  atomic_read(&data[thr].stop)
  data[thr].cmp = data[thr].cmp_len;
	  				  WRITE data[thr].cmp_len

Then CPU0 gets a stale cmp_len and writes it to disk. During resume from S4,
wrong cmp_len is loaded.

To maintain data consistency between the two threads, use the acquire/release
variants of atomic set and read operations.

Fixes: 081a9d043c98 ("PM / Hibernate: Improve performance of LZO/plain hibernation, checksum image")
Cc: All applicable <stable@vger.kernel.org>
Signed-off-by: Hongchen Zhang <zhanghongchen@loongson.cn>
Co-developed-by: Weihao Li <liweihao@loongson.cn>
Signed-off-by: Weihao Li <liweihao@loongson.cn>
[ rjw: Subject rewrite and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 68973ca2cf07..975e7195573b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -606,11 +606,11 @@ static int crc32_threadfn(void *data)
 	unsigned i;
 
 	while (1) {
-		wait_event(d->go, atomic_read(&d->ready) ||
+		wait_event(d->go, atomic_read_acquire(&d->ready) ||
 		                  kthread_should_stop());
 		if (kthread_should_stop()) {
 			d->thr = NULL;
-			atomic_set(&d->stop, 1);
+			atomic_set_release(&d->stop, 1);
 			wake_up(&d->done);
 			break;
 		}
@@ -619,7 +619,7 @@ static int crc32_threadfn(void *data)
 		for (i = 0; i < d->run_threads; i++)
 			*d->crc32 = crc32_le(*d->crc32,
 			                     d->unc[i], *d->unc_len[i]);
-		atomic_set(&d->stop, 1);
+		atomic_set_release(&d->stop, 1);
 		wake_up(&d->done);
 	}
 	return 0;
@@ -649,12 +649,12 @@ static int lzo_compress_threadfn(void *data)
 	struct cmp_data *d = data;
 
 	while (1) {
-		wait_event(d->go, atomic_read(&d->ready) ||
+		wait_event(d->go, atomic_read_acquire(&d->ready) ||
 		                  kthread_should_stop());
 		if (kthread_should_stop()) {
 			d->thr = NULL;
 			d->ret = -1;
-			atomic_set(&d->stop, 1);
+			atomic_set_release(&d->stop, 1);
 			wake_up(&d->done);
 			break;
 		}
@@ -663,7 +663,7 @@ static int lzo_compress_threadfn(void *data)
 		d->ret = lzo1x_1_compress(d->unc, d->unc_len,
 		                          d->cmp + LZO_HEADER, &d->cmp_len,
 		                          d->wrk);
-		atomic_set(&d->stop, 1);
+		atomic_set_release(&d->stop, 1);
 		wake_up(&d->done);
 	}
 	return 0;
@@ -798,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 			data[thr].unc_len = off;
 
-			atomic_set(&data[thr].ready, 1);
+			atomic_set_release(&data[thr].ready, 1);
 			wake_up(&data[thr].go);
 		}
 
@@ -806,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			break;
 
 		crc->run_threads = thr;
-		atomic_set(&crc->ready, 1);
+		atomic_set_release(&crc->ready, 1);
 		wake_up(&crc->go);
 
 		for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
 			wait_event(data[thr].done,
-			           atomic_read(&data[thr].stop));
+				atomic_read_acquire(&data[thr].stop));
 			atomic_set(&data[thr].stop, 0);
 
 			ret = data[thr].ret;
@@ -850,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			}
 		}
 
-		wait_event(crc->done, atomic_read(&crc->stop));
+		wait_event(crc->done, atomic_read_acquire(&crc->stop));
 		atomic_set(&crc->stop, 0);
 	}
 
@@ -1132,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data)
 	struct dec_data *d = data;
 
 	while (1) {
-		wait_event(d->go, atomic_read(&d->ready) ||
+		wait_event(d->go, atomic_read_acquire(&d->ready) ||
 		                  kthread_should_stop());
 		if (kthread_should_stop()) {
 			d->thr = NULL;
 			d->ret = -1;
-			atomic_set(&d->stop, 1);
+			atomic_set_release(&d->stop, 1);
 			wake_up(&d->done);
 			break;
 		}
@@ -1150,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data)
 			flush_icache_range((unsigned long)d->unc,
 					   (unsigned long)d->unc + d->unc_len);
 
-		atomic_set(&d->stop, 1);
+		atomic_set_release(&d->stop, 1);
 		wake_up(&d->done);
 	}
 	return 0;
@@ -1335,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		}
 
 		if (crc->run_threads) {
-			wait_event(crc->done, atomic_read(&crc->stop));
+			wait_event(crc->done, atomic_read_acquire(&crc->stop));
 			atomic_set(&crc->stop, 0);
 			crc->run_threads = 0;
 		}
@@ -1371,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 					pg = 0;
 			}
 
-			atomic_set(&data[thr].ready, 1);
+			atomic_set_release(&data[thr].ready, 1);
 			wake_up(&data[thr].go);
 		}
 
@@ -1390,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 		for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
 			wait_event(data[thr].done,
-			           atomic_read(&data[thr].stop));
+				atomic_read_acquire(&data[thr].stop));
 			atomic_set(&data[thr].stop, 0);
 
 			ret = data[thr].ret;
@@ -1421,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 				ret = snapshot_write_next(snapshot);
 				if (ret <= 0) {
 					crc->run_threads = thr + 1;
-					atomic_set(&crc->ready, 1);
+					atomic_set_release(&crc->ready, 1);
 					wake_up(&crc->go);
 					goto out_finish;
 				}
@@ -1429,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		}
 
 		crc->run_threads = thr;
-		atomic_set(&crc->ready, 1);
+		atomic_set_release(&crc->ready, 1);
 		wake_up(&crc->go);
 	}
 
 out_finish:
 	if (crc->run_threads) {
-		wait_event(crc->done, atomic_read(&crc->stop));
+		wait_event(crc->done, atomic_read_acquire(&crc->stop));
 		atomic_set(&crc->stop, 0);
 	}
 	stop = ktime_get();
-- 
cgit v1.2.3


From dd939425707898da992e59ab0fcfae4652546910 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 14 Dec 2023 22:29:21 -0500
Subject: ring-buffer: Do not try to put back write_stamp

If an update to an event is interrupted by another event between the time
the initial event allocated its buffer and where it wrote to the
write_stamp, the code try to reset the write stamp back to the what it had
just overwritten. It knows that it was overwritten via checking the
before_stamp, and if it didn't match what it wrote to the before_stamp
before it allocated its space, it knows it was overwritten.

To put back the write_stamp, it uses the before_stamp it read. The problem
here is that by writing the before_stamp to the write_stamp it makes the
two equal again, which means that the write_stamp can be considered valid
as the last timestamp written to the ring buffer. But this is not
necessarily true. The event that interrupted the event could have been
interrupted in a way that it was interrupted as well, and can end up
leaving with an invalid write_stamp. But if this happens and returns to
this context that uses the before_stamp to update the write_stamp again,
it can possibly incorrectly make it valid, causing later events to have in
correct time stamps.

As it is OK to leave this function with an invalid write_stamp (one that
doesn't match the before_stamp), there's no reason to try to make it valid
again in this case. If this race happens, then just leave with the invalid
write_stamp and the next event to come along will just add a absolute
timestamp and validate everything again.

Bonus points: This gets rid of another cmpxchg64!

Link: https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 29 ++++++-----------------------
 1 file changed, 6 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d9caee7f542..2668dde23343 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3612,14 +3612,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	if (likely(tail == w)) {
-		u64 save_before;
-		bool s_ok;
-
 		/* Nothing interrupted us between A and C */
  /*D*/		rb_time_set(&cpu_buffer->write_stamp, info->ts);
-		barrier();
- /*E*/		s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
-		RB_WARN_ON(cpu_buffer, !s_ok);
+		/*
+		 * If something came in between C and D, the write stamp
+		 * may now not be in sync. But that's fine as the before_stamp
+		 * will be different and then next event will just be forced
+		 * to use an absolute timestamp.
+		 */
 		if (likely(!(info->add_timestamp &
 			     (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
 			/* This did not interrupt any time update */
@@ -3627,24 +3627,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		else
 			/* Just use full timestamp for interrupting event */
 			info->delta = info->ts;
-		barrier();
 		check_buffer(cpu_buffer, info, tail);
-		if (unlikely(info->ts != save_before)) {
-			/* SLOW PATH - Interrupted between C and E */
-
-			a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-			RB_WARN_ON(cpu_buffer, !a_ok);
-
-			/* Write stamp must only go forward */
-			if (save_before > info->after) {
-				/*
-				 * We do not care about the result, only that
-				 * it gets updated atomically.
-				 */
-				(void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
-						      info->after, save_before);
-			}
-		}
 	} else {
 		u64 ts;
 		/* SLOW PATH - Interrupted between A and C */
-- 
cgit v1.2.3


From 083e9f65bd215582bf8f6a920db729fadf16704f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 15 Dec 2023 08:18:10 -0500
Subject: ring-buffer: Remove useless update to write_stamp in
 rb_try_to_discard()

When filtering is enabled, a temporary buffer is created to place the
content of the trace event output so that the filter logic can decide
from the trace event output if the trace event should be filtered out or
not. If it is to be filtered out, the content in the temporary buffer is
simply discarded, otherwise it is written into the trace buffer.

But if an interrupt were to come in while a previous event was using that
temporary buffer, the event written by the interrupt would actually go
into the ring buffer itself to prevent corrupting the data on the
temporary buffer. If the event is to be filtered out, the event in the
ring buffer is discarded, or if it fails to discard because another event
were to have already come in, it is turned into padding.

The update to the write_stamp in the rb_try_to_discard() happens after a
fix was made to force the next event after the discard to use an absolute
timestamp by setting the before_stamp to zero so it does not match the
write_stamp (which causes an event to use the absolute timestamp).

But there's an effort in rb_try_to_discard() to put back the write_stamp
to what it was before the event was added. But this is useless and
wasteful because nothing is going to be using that write_stamp for
calculations as it still will not match the before_stamp.

Remove this useless update, and in doing so, we remove another
cmpxchg64()!

Also update the comments to reflect this change as well as remove some
extra white space in another comment.

Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Vincent Donnefort   <vdonnefort@google.com>
Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 47 +++++++++++-----------------------------------
 1 file changed, 11 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2668dde23343..ad4af0cba159 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2983,25 +2983,6 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
-static u64 rb_time_delta(struct ring_buffer_event *event)
-{
-	switch (event->type_len) {
-	case RINGBUF_TYPE_PADDING:
-		return 0;
-
-	case RINGBUF_TYPE_TIME_EXTEND:
-		return rb_event_time_stamp(event);
-
-	case RINGBUF_TYPE_TIME_STAMP:
-		return 0;
-
-	case RINGBUF_TYPE_DATA:
-		return event->time_delta;
-	default:
-		return 0;
-	}
-}
-
 static inline bool
 rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		  struct ring_buffer_event *event)
@@ -3009,8 +2990,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned long new_index, old_index;
 	struct buffer_page *bpage;
 	unsigned long addr;
-	u64 write_stamp;
-	u64 delta;
 
 	new_index = rb_event_index(event);
 	old_index = new_index + rb_event_ts_length(event);
@@ -3019,14 +2998,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 
 	bpage = READ_ONCE(cpu_buffer->tail_page);
 
-	delta = rb_time_delta(event);
-
-	if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
-		return false;
-
-	/* Make sure the write stamp is read before testing the location */
-	barrier();
-
+	/*
+	 * Make sure the tail_page is still the same and
+	 * the next write location is the end of this event
+	 */
 	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		unsigned long write_mask =
 			local_read(&bpage->write) & ~RB_WRITE_MASK;
@@ -3037,20 +3012,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 		 * to make sure that the next event adds an absolute
 		 * value and does not rely on the saved write stamp, which
 		 * is now going to be bogus.
+		 *
+		 * By setting the before_stamp to zero, the next event
+		 * is not going to use the write_stamp and will instead
+		 * create an absolute timestamp. This means there's no
+		 * reason to update the wirte_stamp!
 		 */
 		rb_time_set(&cpu_buffer->before_stamp, 0);
 
-		/* Something came in, can't discard */
-		if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
-				       write_stamp, write_stamp - delta))
-			return false;
-
 		/*
 		 * If an event were to come in now, it would see that the
 		 * write_stamp and the before_stamp are different, and assume
 		 * that this event just added itself before updating
 		 * the write stamp. The interrupting event will fix the
-		 * write stamp for us, and use the before stamp as its delta.
+		 * write stamp for us, and use an absolute timestamp.
 		 */
 
 		/*
@@ -3487,7 +3462,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 		return;
 
 	/*
-	 * If this interrupted another event, 
+	 * If this interrupted another event,
 	 */
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
-- 
cgit v1.2.3


From fff88fa0fbc7067ba46dde570912d63da42c59a9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 11:53:01 -0500
Subject: ring-buffer: Fix a race in rb_time_cmpxchg() for 32 bit archs

Mathieu Desnoyers pointed out an issue in the rb_time_cmpxchg() for 32 bit
architectures. That is:

 static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 {
	unsigned long cnt, top, bottom, msb;
	unsigned long cnt2, top2, bottom2, msb2;
	u64 val;

	/* The cmpxchg always fails if it interrupted an update */
	 if (!__rb_time_read(t, &val, &cnt2))
		 return false;

	 if (val != expect)
		 return false;

<<<< interrupted here!

	 cnt = local_read(&t->cnt);

The problem is that the synchronization counter in the rb_time_t is read
*after* the value of the timestamp is read. That means if an interrupt
were to come in between the value being read and the counter being read,
it can change the value and the counter and the interrupted process would
be clueless about it!

The counter needs to be read first and then the value. That way it is easy
to tell if the value is stale or not. If the counter hasn't been updated,
then the value is still good.

Link: https://lore.kernel.org/linux-trace-kernel/20231211201324.652870-1-mathieu.desnoyers@efficios.com/
Link: https://lore.kernel.org/linux-trace-kernel/20231212115301.7a9c9a64@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Fixes: 10464b4aa605e ("ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit")
Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ad4af0cba159..b8ab0557bd1b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -706,6 +706,9 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 	unsigned long cnt2, top2, bottom2, msb2;
 	u64 val;
 
+	/* Any interruptions in this function should cause a failure */
+	cnt = local_read(&t->cnt);
+
 	/* The cmpxchg always fails if it interrupted an update */
 	 if (!__rb_time_read(t, &val, &cnt2))
 		 return false;
@@ -713,7 +716,6 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 	 if (val != expect)
 		 return false;
 
-	 cnt = local_read(&t->cnt);
 	 if ((cnt & 3) != cnt2)
 		 return false;
 
-- 
cgit v1.2.3


From dec890089bf79a4954b61482715ee2d084364856 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Tue, 12 Dec 2023 14:30:49 -0500
Subject: ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg()

The following race can cause rb_time_read() to observe a corrupted time
stamp:

rb_time_cmpxchg()
[...]
        if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
                return false;
        if (!rb_time_read_cmpxchg(&t->top, top, top2))
                return false;
<interrupted before updating bottom>
__rb_time_read()
[...]
        do {
                c = local_read(&t->cnt);
                top = local_read(&t->top);
                bottom = local_read(&t->bottom);
                msb = local_read(&t->msb);
        } while (c != local_read(&t->cnt));

        *cnt = rb_time_cnt(top);

        /* If top and msb counts don't match, this interrupted a write */
        if (*cnt != rb_time_cnt(msb))
                return false;
          ^ this check fails to catch that "bottom" is still not updated.

So the old "bottom" value is returned, which is wrong.

Fix this by checking that all three of msb, top, and bottom 2-bit cnt
values match.

The reason to favor checking all three fields over requiring a specific
update order for both rb_time_set() and rb_time_cmpxchg() is because
checking all three fields is more robust to handle partial failures of
rb_time_cmpxchg() when interrupted by nested rb_time_set().

Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/
Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com

Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b8ab0557bd1b..f22a849da179 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 
 	*cnt = rb_time_cnt(top);
 
-	/* If top and msb counts don't match, this interrupted a write */
-	if (*cnt != rb_time_cnt(msb))
+	/* If top, msb or bottom counts don't match, this interrupted a write */
+	if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
 		return false;
 
 	/* The shift to msb will lose its cnt bits */
-- 
cgit v1.2.3


From 0aa0e5289cfe984a8a9fdd79ccf46ccf080151f7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 15 Dec 2023 08:41:14 -0500
Subject: ring-buffer: Have rb_time_cmpxchg() set the msb counter too

The rb_time_cmpxchg() on 32-bit architectures requires setting three
32-bit words to represent the 64-bit timestamp, with some salt for
synchronization. Those are: msb, top, and bottom

The issue is, the rb_time_cmpxchg() did not properly salt the msb portion,
and the msb that was written was stale.

Link: https://lore.kernel.org/linux-trace-kernel/20231215084114.20899342@rorschach.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: f03f2abce4f39 ("ring-buffer: Have 32 bit time stamps use all 64 bits")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f22a849da179..f4679013289b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -722,10 +722,12 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
 	 cnt2 = cnt + 1;
 
 	 rb_time_split(val, &top, &bottom, &msb);
+	 msb = rb_time_val_cnt(msb, cnt);
 	 top = rb_time_val_cnt(top, cnt);
 	 bottom = rb_time_val_cnt(bottom, cnt);
 
 	 rb_time_split(set, &top2, &bottom2, &msb2);
+	 msb2 = rb_time_val_cnt(msb2, cnt);
 	 top2 = rb_time_val_cnt(top2, cnt2);
 	 bottom2 = rb_time_val_cnt(bottom2, cnt2);
 
-- 
cgit v1.2.3


From 712292308af2265cd9b126aedfa987f10f452a33 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 13 Dec 2023 17:54:03 -0500
Subject: ring-buffer: Do not record in NMI if the arch does not support
 cmpxchg in NMI

As the ring buffer recording requires cmpxchg() to work, if the
architecture does not support cmpxchg in NMI, then do not do any recording
within an NMI.

Link: https://lore.kernel.org/linux-trace-kernel/20231213175403.6fc18540@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f4679013289b..5a114e752f11 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3674,6 +3674,12 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	int nr_loops = 0;
 	int add_ts_default;
 
+	/* ring buffer does cmpxchg, make sure it is safe in NMI context */
+	if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+	    (unlikely(in_nmi()))) {
+		return NULL;
+	}
+
 	rb_start_commit(cpu_buffer);
 	/* The commit page can not change after this */
 
-- 
cgit v1.2.3


From fe3de0102bc830e78d80dbf6e2dd29c520d68575 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Fri, 8 Dec 2023 10:33:09 +0100
Subject: kernel/cgroup: use kernfs_create_dir_ns()

By passing the fsugid to kernfs_create_dir_ns(), we don't need
cgroup_kn_set_ugid() any longer.  That function was added for exactly
this purpose by commit 49957f8e2a43 ("cgroup: newly created dirs and
files should be owned by the creator").

Eliminating this piece of duplicate code means we benefit from future
improvements to kernfs_create_dir_ns(); for example, both are lacking
S_ISGID support currently, which my next patch will add to
kernfs_create_dir_ns().  It cannot (easily) be added to
cgroup_kn_set_ugid() because we can't dereference struct kernfs_iattrs
from there.

--
v1 -> v2: 12-digit commit id

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lore.kernel.org/r/20231208093310.297233-1-max.kellermann@ionos.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/cgroup/cgroup.c | 31 ++++---------------------------
 1 file changed, 4 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4b9ff41ca603..a844b421fd83 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4169,20 +4169,6 @@ static struct kernfs_ops cgroup_kf_ops = {
 	.seq_show		= cgroup_seqfile_show,
 };
 
-/* set uid and gid of cgroup dirs and files to that of the creator */
-static int cgroup_kn_set_ugid(struct kernfs_node *kn)
-{
-	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
-			       .ia_uid = current_fsuid(),
-			       .ia_gid = current_fsgid(), };
-
-	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
-	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
-		return 0;
-
-	return kernfs_setattr(kn, &iattr);
-}
-
 static void cgroup_file_notify_timer(struct timer_list *timer)
 {
 	cgroup_file_notify(container_of(timer, struct cgroup_file,
@@ -4195,25 +4181,18 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 	char name[CGROUP_FILE_NAME_MAX];
 	struct kernfs_node *kn;
 	struct lock_class_key *key = NULL;
-	int ret;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	key = &cft->lockdep_key;
 #endif
 	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
 				  cgroup_file_mode(cft),
-				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				  current_fsuid(), current_fsgid(),
 				  0, cft->kf_ops, cft,
 				  NULL, key);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
 
-	ret = cgroup_kn_set_ugid(kn);
-	if (ret) {
-		kernfs_remove(kn);
-		return ret;
-	}
-
 	if (cft->file_offset) {
 		struct cgroup_file *cfile = (void *)css + cft->file_offset;
 
@@ -5616,7 +5595,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 		goto out_cancel_ref;
 
 	/* create the directory */
-	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+	kn = kernfs_create_dir_ns(parent->kn, name, mode,
+				  current_fsuid(), current_fsgid(),
+				  cgrp, NULL);
 	if (IS_ERR(kn)) {
 		ret = PTR_ERR(kn);
 		goto out_stat_exit;
@@ -5761,10 +5742,6 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 	 */
 	kernfs_get(cgrp->kn);
 
-	ret = cgroup_kn_set_ugid(cgrp->kn);
-	if (ret)
-		goto out_destroy;
-
 	ret = css_populate_dir(&cgrp->self);
 	if (ret)
 		goto out_destroy;
-- 
cgit v1.2.3


From ff6d413b0b59466e5acf2e42f294b1842ae130a1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 12 Dec 2023 13:17:40 -0800
Subject: kernfs: Convert kernfs_path_from_node_locked() from strlcpy() to
 strscpy()

One of the last remaining users of strlcpy() in the kernel is
kernfs_path_from_node_locked(), which passes back the problematic "length
we _would_ have copied" return value to indicate truncation.  Convert the
chain of all callers to use the negative return value (some of which
already doing this explicitly). All callers were already also checking
for negative return values, so the risk to missed checks looks very low.

In this analysis, it was found that cgroup1_release_agent() actually
didn't handle the "too large" condition, so this is technically also a
bug fix. :)

Here's the chain of callers, and resolution identifying each one as now
handling the correct return value:

kernfs_path_from_node_locked()
        kernfs_path_from_node()
                pr_cont_kernfs_path()
                        returns void
                kernfs_path()
                        sysfs_warn_dup()
                                return value ignored
                        cgroup_path()
                                blkg_path()
                                        bfq_bic_update_cgroup()
                                                return value ignored
                                TRACE_IOCG_PATH()
                                        return value ignored
                                TRACE_CGROUP_PATH()
                                        return value ignored
                                perf_event_cgroup()
                                        return value ignored
                                task_group_path()
                                        return value ignored
                                damon_sysfs_memcg_path_eq()
                                        return value ignored
                                get_mm_memcg_path()
                                        return value ignored
                                lru_gen_seq_show()
                                        return value ignored
                        cgroup_path_from_kernfs_id()
                                return value ignored
                cgroup_show_path()
                        already converted "too large" error to negative value
                cgroup_path_ns_locked()
                        cgroup_path_ns()
                                bpf_iter_cgroup_show_fdinfo()
                                        return value ignored
                                cgroup1_release_agent()
                                        wasn't checking "too large" error
                        proc_cgroup_show()
                                already converted "too large" to negative value

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc:  <cgroups@vger.kernel.org>
Co-developed-by: Azeem Shaikh <azeemshaikh38@gmail.com>
Signed-off-by: Azeem Shaikh <azeemshaikh38@gmail.com>
Link: https://lore.kernel.org/r/20231116192127.1558276-3-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20231212211741.164376-3-keescook@chromium.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/dir.c           | 34 +++++++++++++++++-----------------
 kernel/cgroup/cgroup-v1.c |  2 +-
 kernel/cgroup/cgroup.c    |  4 ++--
 kernel/cgroup/cpuset.c    |  2 +-
 4 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index d23bf7883b08..bce1d7ac95ca 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
  *
  * [3] when @kn_to is %NULL result will be "(null)"
  *
- * Return: the length of the full path.  If the full length is equal to or
+ * Return: the length of the constructed path.  If the path would have been
  * greater than @buflen, @buf contains the truncated path with the trailing
  * '\0'.  On error, -errno is returned.
  */
@@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
 	struct kernfs_node *kn, *common;
 	const char parent_str[] = "/..";
 	size_t depth_from, depth_to, len = 0;
+	ssize_t copied;
 	int i, j;
 
 	if (!kn_to)
-		return strlcpy(buf, "(null)", buflen);
+		return strscpy(buf, "(null)", buflen);
 
 	if (!kn_from)
 		kn_from = kernfs_root(kn_to)->kn;
 
 	if (kn_from == kn_to)
-		return strlcpy(buf, "/", buflen);
+		return strscpy(buf, "/", buflen);
 
 	common = kernfs_common_ancestor(kn_from, kn_to);
 	if (WARN_ON(!common))
@@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
 
 	buf[0] = '\0';
 
-	for (i = 0; i < depth_from; i++)
-		len += strlcpy(buf + len, parent_str,
-			       len < buflen ? buflen - len : 0);
+	for (i = 0; i < depth_from; i++) {
+		copied = strscpy(buf + len, parent_str, buflen - len);
+		if (copied < 0)
+			return copied;
+		len += copied;
+	}
 
 	/* Calculate how many bytes we need for the rest */
 	for (i = depth_to - 1; i >= 0; i--) {
 		for (kn = kn_to, j = 0; j < i; j++)
 			kn = kn->parent;
-		len += strlcpy(buf + len, "/",
-			       len < buflen ? buflen - len : 0);
-		len += strlcpy(buf + len, kn->name,
-			       len < buflen ? buflen - len : 0);
+
+		len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
 	}
 
 	return len;
@@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
  * path (which includes '..'s) as needed to reach from @from to @to is
  * returned.
  *
- * Return: the length of the full path.  If the full length is equal to or
+ * Return: the length of the constructed path.  If the path would have been
  * greater than @buflen, @buf contains the truncated path with the trailing
  * '\0'.  On error, -errno is returned.
  */
@@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
 	sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
 				   sizeof(kernfs_pr_cont_buf));
 	if (sz < 0) {
-		pr_cont("(error)");
-		goto out;
-	}
-
-	if (sz >= sizeof(kernfs_pr_cont_buf)) {
-		pr_cont("(name too long)");
+		if (sz == -E2BIG)
+			pr_cont("(name too long)");
+		else
+			pr_cont("(error)");
 		goto out;
 	}
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 76db6c67e39a..9cb00ebe9ac6 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -802,7 +802,7 @@ void cgroup1_release_agent(struct work_struct *work)
 		goto out_free;
 
 	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	if (ret < 0 || ret >= PATH_MAX)
+	if (ret < 0)
 		goto out_free;
 
 	argv[0] = agentbuf;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a844b421fd83..4bc8183b669f 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1893,7 +1893,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
 	spin_unlock_irq(&css_set_lock);
 
-	if (len >= PATH_MAX)
+	if (len == -E2BIG)
 		len = -ERANGE;
 	else if (len > 0) {
 		seq_escape(sf, buf, " \t\n\\");
@@ -6278,7 +6278,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
 			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
 						current->nsproxy->cgroup_ns);
-			if (retval >= PATH_MAX)
+			if (retval == -E2BIG)
 				retval = -ENAMETOOLONG;
 			if (retval < 0)
 				goto out_unlock;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 615daaf87f1f..fb29158ae825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4941,7 +4941,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
 				current->nsproxy->cgroup_ns);
 	css_put(css);
-	if (retval >= PATH_MAX)
+	if (retval == -E2BIG)
 		retval = -ENAMETOOLONG;
 	if (retval < 0)
 		goto out_free;
-- 
cgit v1.2.3


From 8b2efe51ba85ca83460941672afac6fca4199df6 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 15 Dec 2023 18:07:04 +0800
Subject: bpf: Limit the number of uprobes when attaching program to multiple
 uprobes

An abnormally big cnt may be passed to link_create.uprobe_multi.cnt,
and it will trigger the following warning in kvmalloc_node():

	if (unlikely(size > INT_MAX)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

Fix the warning by limiting the maximal number of uprobes in
bpf_uprobe_multi_link_attach(). If the number of uprobes is greater than
MAX_UPROBE_MULTI_CNT, the attachment will return -E2BIG.

Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link")
Reported-by: Xingwei Lee <xrivendell7@gmail.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Closes: https://lore.kernel.org/bpf/CABOYnLwwJY=yFAGie59LFsUsBAgHfroVqbzZ5edAXbFE3YiNVA@mail.gmail.com
Link: https://lore.kernel.org/bpf/20231215100708.2265609-2-houtao@huaweicloud.com
---
 kernel/trace/bpf_trace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 774cf476a892..75c05aea9fd9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -42,6 +42,8 @@
 #define bpf_event_rcu_dereference(p)					\
 	rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
 
+#define MAX_UPROBE_MULTI_CNT (1U << 20)
+
 #ifdef CONFIG_MODULES
 struct bpf_trace_module {
 	struct module *module;
@@ -3344,6 +3346,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 
 	if (!upath || !uoffsets || !cnt)
 		return -EINVAL;
+	if (cnt > MAX_UPROBE_MULTI_CNT)
+		return -E2BIG;
 
 	uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
 	ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
-- 
cgit v1.2.3


From d6d1e6c17cab2dcb7b8530c599f00e7de906d380 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 15 Dec 2023 18:07:05 +0800
Subject: bpf: Limit the number of kprobes when attaching program to multiple
 kprobes

An abnormally big cnt may also be assigned to kprobe_multi.cnt when
attaching multiple kprobes. It will trigger the following warning in
kvmalloc_node():

	if (unlikely(size > INT_MAX)) {
	    WARN_ON_ONCE(!(flags & __GFP_NOWARN));
	    return NULL;
	}

Fix the warning by limiting the maximal number of kprobes in
bpf_kprobe_multi_link_attach(). If the number of kprobes is greater than
MAX_KPROBE_MULTI_CNT, the attachment will fail and return -E2BIG.

Fixes: 0dcac2725406 ("bpf: Add multi kprobe link")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231215100708.2265609-3-houtao@huaweicloud.com
---
 kernel/trace/bpf_trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 75c05aea9fd9..97c0c49c40a0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -43,6 +43,7 @@
 	rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
 
 #define MAX_UPROBE_MULTI_CNT (1U << 20)
+#define MAX_KPROBE_MULTI_CNT (1U << 20)
 
 #ifdef CONFIG_MODULES
 struct bpf_trace_module {
@@ -2972,6 +2973,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	cnt = attr->link_create.kprobe_multi.cnt;
 	if (!cnt)
 		return -EINVAL;
+	if (cnt > MAX_KPROBE_MULTI_CNT)
+		return -E2BIG;
 
 	size = cnt * sizeof(*addrs);
 	addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
-- 
cgit v1.2.3


From f8fa5d76925991976b3e7076f9d1052515ec1fca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Dec 2023 13:24:10 -0700
Subject: cred: switch to using atomic_long_t

There are multiple ways to grab references to credentials, and the only
protection we have against overflowing it is the memory required to do
so.

With memory sizes only moving in one direction, let's bump the reference
count to 64-bit and move it outside the realm of feasibly overflowing.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cred.h |  8 +++----
 kernel/cred.c        | 64 ++++++++++++++++++++++++++--------------------------
 2 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index af8d353a4b86..a3383f8efb8f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -109,7 +109,7 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp)
  * same context as task->real_cred.
  */
 struct cred {
-	atomic_t	usage;
+	atomic_long_t	usage;
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	atomic_t	subscribers;	/* number of processes subscribed */
 	void		*put_addr;
@@ -229,7 +229,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
  */
 static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
 {
-	atomic_add(nr, &cred->usage);
+	atomic_long_add(nr, &cred->usage);
 	return cred;
 }
 
@@ -288,7 +288,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return NULL;
-	if (!atomic_inc_not_zero(&nonconst_cred->usage))
+	if (!atomic_long_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
 	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
@@ -313,7 +313,7 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
 
 	if (cred) {
 		validate_creds(cred);
-		if (atomic_sub_and_test(nr, &cred->usage))
+		if (atomic_long_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
 }
diff --git a/kernel/cred.c b/kernel/cred.c
index 3c714cb31660..4a6cd0f0fef5 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -102,17 +102,17 @@ static void put_cred_rcu(struct rcu_head *rcu)
 
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	if (cred->magic != CRED_MAGIC_DEAD ||
-	    atomic_read(&cred->usage) != 0 ||
+	    atomic_long_read(&cred->usage) != 0 ||
 	    read_cred_subscribers(cred) != 0)
 		panic("CRED: put_cred_rcu() sees %p with"
-		      " mag %x, put %p, usage %d, subscr %d\n",
+		      " mag %x, put %p, usage %ld, subscr %d\n",
 		      cred, cred->magic, cred->put_addr,
-		      atomic_read(&cred->usage),
+		      atomic_long_read(&cred->usage),
 		      read_cred_subscribers(cred));
 #else
-	if (atomic_read(&cred->usage) != 0)
-		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
-		      cred, atomic_read(&cred->usage));
+	if (atomic_long_read(&cred->usage) != 0)
+		panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+		      cred, atomic_long_read(&cred->usage));
 #endif
 
 	security_cred_free(cred);
@@ -137,11 +137,11 @@ static void put_cred_rcu(struct rcu_head *rcu)
  */
 void __put_cred(struct cred *cred)
 {
-	kdebug("__put_cred(%p{%d,%d})", cred,
-	       atomic_read(&cred->usage),
+	kdebug("__put_cred(%p{%ld,%d})", cred,
+	       atomic_long_read(&cred->usage),
 	       read_cred_subscribers(cred));
 
-	BUG_ON(atomic_read(&cred->usage) != 0);
+	BUG_ON(atomic_long_read(&cred->usage) != 0);
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	BUG_ON(read_cred_subscribers(cred) != 0);
 	cred->magic = CRED_MAGIC_DEAD;
@@ -164,8 +164,8 @@ void exit_creds(struct task_struct *tsk)
 {
 	struct cred *real_cred, *cred;
 
-	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
-	       atomic_read(&tsk->cred->usage),
+	kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_long_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
 	real_cred = (struct cred *) tsk->real_cred;
@@ -230,7 +230,7 @@ struct cred *cred_alloc_blank(void)
 	if (!new)
 		return NULL;
 
-	atomic_set(&new->usage, 1);
+	atomic_long_set(&new->usage, 1);
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	new->magic = CRED_MAGIC;
 #endif
@@ -276,7 +276,7 @@ struct cred *prepare_creds(void)
 	memcpy(new, old, sizeof(struct cred));
 
 	new->non_rcu = 0;
-	atomic_set(&new->usage, 1);
+	atomic_long_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
@@ -363,8 +363,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	    ) {
 		p->real_cred = get_cred_many(p->cred, 2);
 		alter_cred_subscribers(p->cred, 2);
-		kdebug("share_creds(%p{%d,%d})",
-		       p->cred, atomic_read(&p->cred->usage),
+		kdebug("share_creds(%p{%ld,%d})",
+		       p->cred, atomic_long_read(&p->cred->usage),
 		       read_cred_subscribers(p->cred));
 		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 		return 0;
@@ -457,8 +457,8 @@ int commit_creds(struct cred *new)
 	struct task_struct *task = current;
 	const struct cred *old = task->real_cred;
 
-	kdebug("commit_creds(%p{%d,%d})", new,
-	       atomic_read(&new->usage),
+	kdebug("commit_creds(%p{%ld,%d})", new,
+	       atomic_long_read(&new->usage),
 	       read_cred_subscribers(new));
 
 	BUG_ON(task->cred != old);
@@ -467,7 +467,7 @@ int commit_creds(struct cred *new)
 	validate_creds(old);
 	validate_creds(new);
 #endif
-	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_long_read(&new->usage) < 1);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
 
@@ -539,14 +539,14 @@ EXPORT_SYMBOL(commit_creds);
  */
 void abort_creds(struct cred *new)
 {
-	kdebug("abort_creds(%p{%d,%d})", new,
-	       atomic_read(&new->usage),
+	kdebug("abort_creds(%p{%ld,%d})", new,
+	       atomic_long_read(&new->usage),
 	       read_cred_subscribers(new));
 
 #ifdef CONFIG_DEBUG_CREDENTIALS
 	BUG_ON(read_cred_subscribers(new) != 0);
 #endif
-	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_long_read(&new->usage) < 1);
 	put_cred(new);
 }
 EXPORT_SYMBOL(abort_creds);
@@ -562,8 +562,8 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	kdebug("override_creds(%p{%d,%d})", new,
-	       atomic_read(&new->usage),
+	kdebug("override_creds(%p{%ld,%d})", new,
+	       atomic_long_read(&new->usage),
 	       read_cred_subscribers(new));
 
 	validate_creds(old);
@@ -585,8 +585,8 @@ const struct cred *override_creds(const struct cred *new)
 	rcu_assign_pointer(current->cred, new);
 	alter_cred_subscribers(old, -1);
 
-	kdebug("override_creds() = %p{%d,%d}", old,
-	       atomic_read(&old->usage),
+	kdebug("override_creds() = %p{%ld,%d}", old,
+	       atomic_long_read(&old->usage),
 	       read_cred_subscribers(old));
 	return old;
 }
@@ -603,8 +603,8 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
-	kdebug("revert_creds(%p{%d,%d})", old,
-	       atomic_read(&old->usage),
+	kdebug("revert_creds(%p{%ld,%d})", old,
+	       atomic_long_read(&old->usage),
 	       read_cred_subscribers(old));
 
 	validate_creds(old);
@@ -735,7 +735,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 
 	*new = *old;
 	new->non_rcu = 0;
-	atomic_set(&new->usage, 1);
+	atomic_long_set(&new->usage, 1);
 	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
@@ -849,8 +849,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
 	       cred == tsk->cred ? "[eff]" : "");
 	pr_err("->magic=%x, put_addr=%p\n",
 	       cred->magic, cred->put_addr);
-	pr_err("->usage=%d, subscr=%d\n",
-	       atomic_read(&cred->usage),
+	pr_err("->usage=%ld, subscr=%d\n",
+	       atomic_long_read(&cred->usage),
 	       read_cred_subscribers(cred));
 	pr_err("->*uid = { %d,%d,%d,%d }\n",
 		from_kuid_munged(&init_user_ns, cred->uid),
@@ -922,9 +922,9 @@ EXPORT_SYMBOL(__validate_process_creds);
  */
 void validate_creds_for_do_exit(struct task_struct *tsk)
 {
-	kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+	kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})",
 	       tsk->real_cred, tsk->cred,
-	       atomic_read(&tsk->cred->usage),
+	       atomic_long_read(&tsk->cred->usage),
 	       read_cred_subscribers(tsk->cred));
 
 	__validate_process_creds(tsk, __FILE__, __LINE__);
-- 
cgit v1.2.3


From ae1914174a63a558113e80d24ccac2773f9f7b2b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Dec 2023 13:40:57 -0700
Subject: cred: get rid of CONFIG_DEBUG_CREDENTIALS

This code is rarely (never?) enabled by distros, and it hasn't caught
anything in decades. Let's kill off this legacy debug code.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/configs/skiroot_defconfig    |   1 -
 arch/s390/configs/debug_defconfig         |   1 -
 fs/nfsd/auth.c                            |   4 -
 fs/nfsd/nfssvc.c                          |   1 -
 fs/nfsd/vfs.c                             |   9 +-
 fs/open.c                                 |   3 -
 include/linux/cred.h                      |  50 -------
 kernel/cred.c                             | 231 +++---------------------------
 kernel/exit.c                             |   3 -
 lib/Kconfig.debug                         |  15 --
 net/sunrpc/auth.c                         |   3 -
 security/selinux/hooks.c                  |   6 -
 tools/objtool/noreturns.h                 |   1 -
 tools/testing/selftests/bpf/config.x86_64 |   1 -
 tools/testing/selftests/hid/config.common |   1 -
 15 files changed, 17 insertions(+), 313 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 8d3eacb50d56..9d44e6630908 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -301,7 +301,6 @@ CONFIG_WQ_WATCHDOG=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
 CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_DEBUG_CREDENTIALS=y
 # CONFIG_FTRACE is not set
 CONFIG_XMON=y
 # CONFIG_RUNTIME_TESTING_MENU is not set
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 438cd92e6080..dd0608629310 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -834,7 +834,6 @@ CONFIG_DEBUG_IRQFLAGS=y
 CONFIG_DEBUG_LIST=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
-CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_RCU_TORTURE_TEST=m
 CONFIG_RCU_REF_SCALE_TEST=m
 CONFIG_RCU_CPU_STALL_TIMEOUT=300
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index fdf2aad73470..e6beaaf4f170 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -26,8 +26,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	int i;
 	int flags = nfsexp_flags(rqstp, exp);
 
-	validate_process_creds();
-
 	/* discard any old override before preparing the new set */
 	revert_creds(get_cred(current_real_cred()));
 	new = prepare_creds();
@@ -81,10 +79,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
-	validate_process_creds();
 	put_cred(override_creds(new));
 	put_cred(new);
-	validate_process_creds();
 	return 0;
 
 oom:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index fe61d9bbcc1f..5014ab87d313 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -955,7 +955,6 @@ nfsd(void *vrqstp)
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
 		svc_recv(rqstp);
-		validate_process_creds();
 	}
 
 	atomic_dec(&nfsdstats.th_cnt);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fbbea7498f02..e01e4e2acbd9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -901,7 +901,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 	int host_err;
 	bool retried = false;
 
-	validate_process_creds();
 	/*
 	 * If we get here, then the client has already done an "open",
 	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -926,7 +925,6 @@ retry:
 		}
 		err = nfserrno(host_err);
 	}
-	validate_process_creds();
 	return err;
 }
 
@@ -943,12 +941,7 @@ int
 nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
 		   struct file **filp)
 {
-	int err;
-
-	validate_process_creds();
-	err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
-	validate_process_creds();
-	return err;
+	return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
 }
 
 /*
diff --git a/fs/open.c b/fs/open.c
index 02dc608d40d8..3494a9cd8046 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1088,8 +1088,6 @@ struct file *dentry_open(const struct path *path, int flags,
 	int error;
 	struct file *f;
 
-	validate_creds(cred);
-
 	/* We must always pass in a valid mount pointer. */
 	BUG_ON(!path->mnt);
 
@@ -1128,7 +1126,6 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 	struct file *f;
 	int error;
 
-	validate_creds(cred);
 	f = alloc_empty_file(flags, cred);
 	if (IS_ERR(f))
 		return f;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index a3383f8efb8f..2976f534a7a3 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -110,13 +110,6 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp)
  */
 struct cred {
 	atomic_long_t	usage;
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	atomic_t	subscribers;	/* number of processes subscribed */
-	void		*put_addr;
-	unsigned	magic;
-#define CRED_MAGIC	0x43736564
-#define CRED_MAGIC_DEAD	0x44656144
-#endif
 	kuid_t		uid;		/* real UID of the task */
 	kgid_t		gid;		/* real GID of the task */
 	kuid_t		suid;		/* saved UID of the task */
@@ -172,46 +165,6 @@ extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
 extern int set_cred_ucounts(struct cred *);
 
-/*
- * check for validity of credentials
- */
-#ifdef CONFIG_DEBUG_CREDENTIALS
-extern void __noreturn __invalid_creds(const struct cred *, const char *, unsigned);
-extern void __validate_process_creds(struct task_struct *,
-				     const char *, unsigned);
-
-extern bool creds_are_invalid(const struct cred *cred);
-
-static inline void __validate_creds(const struct cred *cred,
-				    const char *file, unsigned line)
-{
-	if (unlikely(creds_are_invalid(cred)))
-		__invalid_creds(cred, file, line);
-}
-
-#define validate_creds(cred)				\
-do {							\
-	__validate_creds((cred), __FILE__, __LINE__);	\
-} while(0)
-
-#define validate_process_creds()				\
-do {								\
-	__validate_process_creds(current, __FILE__, __LINE__);	\
-} while(0)
-
-extern void validate_creds_for_do_exit(struct task_struct *);
-#else
-static inline void validate_creds(const struct cred *cred)
-{
-}
-static inline void validate_creds_for_do_exit(struct task_struct *tsk)
-{
-}
-static inline void validate_process_creds(void)
-{
-}
-#endif
-
 static inline bool cap_ambient_invariant_ok(const struct cred *cred)
 {
 	return cap_issubset(cred->cap_ambient,
@@ -264,7 +217,6 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
 	struct cred *nonconst_cred = (struct cred *) cred;
 	if (!cred)
 		return cred;
-	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
 	return get_new_cred_many(nonconst_cred, nr);
 }
@@ -290,7 +242,6 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred)
 		return NULL;
 	if (!atomic_long_inc_not_zero(&nonconst_cred->usage))
 		return NULL;
-	validate_creds(cred);
 	nonconst_cred->non_rcu = 0;
 	return cred;
 }
@@ -312,7 +263,6 @@ static inline void put_cred_many(const struct cred *_cred, int nr)
 	struct cred *cred = (struct cred *) _cred;
 
 	if (cred) {
-		validate_creds(cred);
 		if (atomic_long_sub_and_test(nr, &cred->usage))
 			__put_cred(cred);
 	}
diff --git a/kernel/cred.c b/kernel/cred.c
index 4a6cd0f0fef5..c033a201c808 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -43,10 +43,6 @@ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
  */
 struct cred init_cred = {
 	.usage			= ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	.subscribers		= ATOMIC_INIT(2),
-	.magic			= CRED_MAGIC,
-#endif
 	.uid			= GLOBAL_ROOT_UID,
 	.gid			= GLOBAL_ROOT_GID,
 	.suid			= GLOBAL_ROOT_UID,
@@ -66,31 +62,6 @@ struct cred init_cred = {
 	.ucounts		= &init_ucounts,
 };
 
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	return atomic_read(&cred->subscribers);
-#else
-	return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	struct cred *cred = (struct cred *) _cred;
-
-	atomic_add(n, &cred->subscribers);
-#endif
-}
-
 /*
  * The RCU callback to actually dispose of a set of credentials
  */
@@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
 
 	kdebug("put_cred_rcu(%p)", cred);
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	if (cred->magic != CRED_MAGIC_DEAD ||
-	    atomic_long_read(&cred->usage) != 0 ||
-	    read_cred_subscribers(cred) != 0)
-		panic("CRED: put_cred_rcu() sees %p with"
-		      " mag %x, put %p, usage %ld, subscr %d\n",
-		      cred, cred->magic, cred->put_addr,
-		      atomic_long_read(&cred->usage),
-		      read_cred_subscribers(cred));
-#else
 	if (atomic_long_read(&cred->usage) != 0)
 		panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
 		      cred, atomic_long_read(&cred->usage));
-#endif
 
 	security_cred_free(cred);
 	key_put(cred->session_keyring);
@@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
  */
 void __put_cred(struct cred *cred)
 {
-	kdebug("__put_cred(%p{%ld,%d})", cred,
-	       atomic_long_read(&cred->usage),
-	       read_cred_subscribers(cred));
+	kdebug("__put_cred(%p{%ld})", cred,
+	       atomic_long_read(&cred->usage));
 
 	BUG_ON(atomic_long_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	BUG_ON(read_cred_subscribers(cred) != 0);
-	cred->magic = CRED_MAGIC_DEAD;
-	cred->put_addr = __builtin_return_address(0);
-#endif
 	BUG_ON(cred == current->cred);
 	BUG_ON(cred == current->real_cred);
 
@@ -164,9 +118,8 @@ void exit_creds(struct task_struct *tsk)
 {
 	struct cred *real_cred, *cred;
 
-	kdebug("exit_creds(%u,%p,%p,{%ld,%d})", tsk->pid, tsk->real_cred, tsk->cred,
-	       atomic_long_read(&tsk->cred->usage),
-	       read_cred_subscribers(tsk->cred));
+	kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_long_read(&tsk->cred->usage));
 
 	real_cred = (struct cred *) tsk->real_cred;
 	tsk->real_cred = NULL;
@@ -174,15 +127,10 @@ void exit_creds(struct task_struct *tsk)
 	cred = (struct cred *) tsk->cred;
 	tsk->cred = NULL;
 
-	validate_creds(cred);
 	if (real_cred == cred) {
-		alter_cred_subscribers(cred, -2);
 		put_cred_many(cred, 2);
 	} else {
-		validate_creds(real_cred);
-		alter_cred_subscribers(real_cred, -1);
 		put_cred(real_cred);
-		alter_cred_subscribers(cred, -1);
 		put_cred(cred);
 	}
 
@@ -231,9 +179,6 @@ struct cred *cred_alloc_blank(void)
 		return NULL;
 
 	atomic_long_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	new->magic = CRED_MAGIC;
-#endif
 	if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
 
@@ -264,8 +209,6 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	validate_process_creds();
-
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
 		return NULL;
@@ -277,7 +220,6 @@ struct cred *prepare_creds(void)
 
 	new->non_rcu = 0;
 	atomic_long_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
@@ -300,7 +242,6 @@ struct cred *prepare_creds(void)
 	if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
 		goto error;
 
-	validate_creds(new);
 	return new;
 
 error:
@@ -362,10 +303,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 		clone_flags & CLONE_THREAD
 	    ) {
 		p->real_cred = get_cred_many(p->cred, 2);
-		alter_cred_subscribers(p->cred, 2);
-		kdebug("share_creds(%p{%ld,%d})",
-		       p->cred, atomic_long_read(&p->cred->usage),
-		       read_cred_subscribers(p->cred));
+		kdebug("share_creds(%p{%ld})",
+		       p->cred, atomic_long_read(&p->cred->usage));
 		inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
 		return 0;
 	}
@@ -404,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 
 	p->cred = p->real_cred = get_cred(new);
 	inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
-	alter_cred_subscribers(new, 2);
-	validate_creds(new);
 	return 0;
 
 error_put:
@@ -457,16 +394,10 @@ int commit_creds(struct cred *new)
 	struct task_struct *task = current;
 	const struct cred *old = task->real_cred;
 
-	kdebug("commit_creds(%p{%ld,%d})", new,
-	       atomic_long_read(&new->usage),
-	       read_cred_subscribers(new));
+	kdebug("commit_creds(%p{%ld})", new,
+	       atomic_long_read(&new->usage));
 
 	BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	BUG_ON(read_cred_subscribers(old) < 2);
-	validate_creds(old);
-	validate_creds(new);
-#endif
 	BUG_ON(atomic_long_read(&new->usage) < 1);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
@@ -502,14 +433,12 @@ int commit_creds(struct cred *new)
 	 * RLIMIT_NPROC limits on user->processes have already been checked
 	 * in set_user().
 	 */
-	alter_cred_subscribers(new, 2);
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user || new->user_ns != old->user_ns)
 		dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
-	alter_cred_subscribers(old, -2);
 
 	/* send notifications */
 	if (!uid_eq(new->uid,   old->uid)  ||
@@ -539,13 +468,9 @@ EXPORT_SYMBOL(commit_creds);
  */
 void abort_creds(struct cred *new)
 {
-	kdebug("abort_creds(%p{%ld,%d})", new,
-	       atomic_long_read(&new->usage),
-	       read_cred_subscribers(new));
+	kdebug("abort_creds(%p{%ld})", new,
+	       atomic_long_read(&new->usage));
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	BUG_ON(read_cred_subscribers(new) != 0);
-#endif
 	BUG_ON(atomic_long_read(&new->usage) < 1);
 	put_cred(new);
 }
@@ -562,12 +487,8 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	kdebug("override_creds(%p{%ld,%d})", new,
-	       atomic_long_read(&new->usage),
-	       read_cred_subscribers(new));
-
-	validate_creds(old);
-	validate_creds(new);
+	kdebug("override_creds(%p{%ld})", new,
+	       atomic_long_read(&new->usage));
 
 	/*
 	 * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
@@ -576,18 +497,12 @@ const struct cred *override_creds(const struct cred *new)
 	 * we are only installing the cred into the thread-synchronous
 	 * '->cred' pointer, not the '->real_cred' pointer that is
 	 * visible to other threads under RCU.
-	 *
-	 * Also note that we did validate_creds() manually, not depending
-	 * on the validation in 'get_cred()'.
 	 */
 	get_new_cred((struct cred *)new);
-	alter_cred_subscribers(new, 1);
 	rcu_assign_pointer(current->cred, new);
-	alter_cred_subscribers(old, -1);
 
-	kdebug("override_creds() = %p{%ld,%d}", old,
-	       atomic_long_read(&old->usage),
-	       read_cred_subscribers(old));
+	kdebug("override_creds() = %p{%ld}", old,
+	       atomic_long_read(&old->usage));
 	return old;
 }
 EXPORT_SYMBOL(override_creds);
@@ -603,15 +518,10 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
-	kdebug("revert_creds(%p{%ld,%d})", old,
-	       atomic_long_read(&old->usage),
-	       read_cred_subscribers(old));
+	kdebug("revert_creds(%p{%ld})", old,
+	       atomic_long_read(&old->usage));
 
-	validate_creds(old);
-	validate_creds(override);
-	alter_cred_subscribers(old, 1);
 	rcu_assign_pointer(current->cred, old);
-	alter_cred_subscribers(override, -1);
 	put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);
@@ -731,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	kdebug("prepare_kernel_cred() alloc %p", new);
 
 	old = get_task_cred(daemon);
-	validate_creds(old);
 
 	*new = *old;
 	new->non_rcu = 0;
 	atomic_long_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_user_ns(new->user_ns);
 	get_group_info(new->group_info);
@@ -760,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 		goto error;
 
 	put_cred(old);
-	validate_creds(new);
 	return new;
 
 error:
@@ -825,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
-	if (cred->magic != CRED_MAGIC)
-		return true;
-	return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
-			       const struct task_struct *tsk)
-{
-	pr_err("%s credentials: %p %s%s%s\n",
-	       label, cred,
-	       cred == &init_cred ? "[init]" : "",
-	       cred == tsk->real_cred ? "[real]" : "",
-	       cred == tsk->cred ? "[eff]" : "");
-	pr_err("->magic=%x, put_addr=%p\n",
-	       cred->magic, cred->put_addr);
-	pr_err("->usage=%ld, subscr=%d\n",
-	       atomic_long_read(&cred->usage),
-	       read_cred_subscribers(cred));
-	pr_err("->*uid = { %d,%d,%d,%d }\n",
-		from_kuid_munged(&init_user_ns, cred->uid),
-		from_kuid_munged(&init_user_ns, cred->euid),
-		from_kuid_munged(&init_user_ns, cred->suid),
-		from_kuid_munged(&init_user_ns, cred->fsuid));
-	pr_err("->*gid = { %d,%d,%d,%d }\n",
-		from_kgid_munged(&init_user_ns, cred->gid),
-		from_kgid_munged(&init_user_ns, cred->egid),
-		from_kgid_munged(&init_user_ns, cred->sgid),
-		from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
-	pr_err("->security is %p\n", cred->security);
-	if ((unsigned long) cred->security >= PAGE_SIZE &&
-	    (((unsigned long) cred->security & 0xffffff00) !=
-	     (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
-		pr_err("->security {%x, %x}\n",
-		       ((u32*)cred->security)[0],
-		       ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
-	pr_err("Invalid credentials\n");
-	pr_err("At %s:%u\n", file, line);
-	dump_invalid_creds(cred, "Specified", current);
-	BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
-			      const char *file, unsigned line)
-{
-	if (tsk->cred == tsk->real_cred) {
-		if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
-			     creds_are_invalid(tsk->cred)))
-			goto invalid_creds;
-	} else {
-		if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
-			     read_cred_subscribers(tsk->cred) < 1 ||
-			     creds_are_invalid(tsk->real_cred) ||
-			     creds_are_invalid(tsk->cred)))
-			goto invalid_creds;
-	}
-	return;
-
-invalid_creds:
-	pr_err("Invalid process credentials\n");
-	pr_err("At %s:%u\n", file, line);
-
-	dump_invalid_creds(tsk->real_cred, "Real", tsk);
-	if (tsk->cred != tsk->real_cred)
-		dump_invalid_creds(tsk->cred, "Effective", tsk);
-	else
-		pr_err("Effective creds == Real creds\n");
-	BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
-	kdebug("validate_creds_for_do_exit(%p,%p{%ld,%d})",
-	       tsk->real_cred, tsk->cred,
-	       atomic_long_read(&tsk->cred->usage),
-	       read_cred_subscribers(tsk->cred));
-
-	__validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..aedc0832c9f4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -824,8 +824,6 @@ void __noreturn do_exit(long code)
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 	user_events_exit(tsk);
 
-	validate_creds_for_do_exit(tsk);
-
 	io_uring_files_cancel();
 	exit_signals(tsk);  /* sets PF_EXITING */
 
@@ -909,7 +907,6 @@ void __noreturn do_exit(long code)
 	if (tsk->task_frag.page)
 		put_page(tsk->task_frag.page);
 
-	validate_creds_for_do_exit(tsk);
 	exit_task_stack_account(tsk);
 
 	check_stack_usage();
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cc7d53d9dc01..4405f81248fb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1739,21 +1739,6 @@ config DEBUG_MAPLE_TREE
 
 endmenu
 
-config DEBUG_CREDENTIALS
-	bool "Debug credential management"
-	depends on DEBUG_KERNEL
-	help
-	  Enable this to turn on some debug checking for credential
-	  management.  The additional code keeps track of the number of
-	  pointers from task_structs to any given cred struct, and checks to
-	  see that this number never exceeds the usage count of the cred
-	  struct.
-
-	  Furthermore, if SELinux is enabled, this also checks that the
-	  security pointer in the cred struct is never seen to be invalid.
-
-	  If unsure, say N.
-
 source "kernel/rcu/Kconfig.debug"
 
 config DEBUG_WQ_FORCE_RR_CPU
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 7bfe7d9a32aa..04534ea537c8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -40,9 +40,6 @@ static unsigned long number_cred_unused;
 
 static struct cred machine_cred = {
 	.usage = ATOMIC_INIT(1),
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	.magic = CRED_MAGIC,
-#endif
 };
 
 /*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..340b2bbbb2dd 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1660,8 +1660,6 @@ static int inode_has_perm(const struct cred *cred,
 	struct inode_security_struct *isec;
 	u32 sid;
 
-	validate_creds(cred);
-
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
 
@@ -3056,8 +3054,6 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
 	struct inode_security_struct *isec;
 	u32 sid;
 
-	validate_creds(cred);
-
 	ad.type = LSM_AUDIT_DATA_DENTRY;
 	ad.u.dentry = dentry;
 	sid = cred_sid(cred);
@@ -3101,8 +3097,6 @@ static int selinux_inode_permission(struct inode *inode, int mask)
 	if (!mask)
 		return 0;
 
-	validate_creds(cred);
-
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
 
diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h
index 649ebdef9c3f..1685d7ea6a9f 100644
--- a/tools/objtool/noreturns.h
+++ b/tools/objtool/noreturns.h
@@ -6,7 +6,6 @@
  *
  * Yes, this is unfortunate.  A better solution is in the works.
  */
-NORETURN(__invalid_creds)
 NORETURN(__kunit_abort)
 NORETURN(__module_put_and_kthread_exit)
 NORETURN(__reiserfs_panic)
diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
index 2e70a6048278..49a29dbc1910 100644
--- a/tools/testing/selftests/bpf/config.x86_64
+++ b/tools/testing/selftests/bpf/config.x86_64
@@ -50,7 +50,6 @@ CONFIG_CRYPTO_SEQIV=y
 CONFIG_CRYPTO_XXHASH=y
 CONFIG_DCB=y
 CONFIG_DEBUG_ATOMIC_SLEEP=y
-CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_DEBUG_INFO_BTF=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_DEBUG_MEMORY_INIT=y
diff --git a/tools/testing/selftests/hid/config.common b/tools/testing/selftests/hid/config.common
index 0617275d93cc..0f456dbab62f 100644
--- a/tools/testing/selftests/hid/config.common
+++ b/tools/testing/selftests/hid/config.common
@@ -46,7 +46,6 @@ CONFIG_CRYPTO_SEQIV=y
 CONFIG_CRYPTO_XXHASH=y
 CONFIG_DCB=y
 CONFIG_DEBUG_ATOMIC_SLEEP=y
-CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_DEFAULT_FQ_CODEL=y
-- 
cgit v1.2.3


From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:18 +0100
Subject: x86/cfi,bpf: Fix BPF JIT call

The current BPF call convention is __nocfi, except when it calls !JIT things,
then it calls regular C functions.

It so happens that with FineIBT the __nocfi and C calling conventions are
incompatible. Specifically __nocfi will call at func+0, while FineIBT will have
endbr-poison there, which is not a valid indirect target. Causing #CP.

Notably this only triggers on IBT enabled hardware, which is probably why this
hasn't been reported (also, most people will have JIT on anyway).

Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for
x86.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/include/asm/cfi.h    | 110 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/alternative.c |  47 +++++++++++++++---
 arch/x86/net/bpf_jit_comp.c   |  82 +++++++++++++++++++++++++++++--
 include/linux/bpf.h           |  12 ++++-
 include/linux/cfi.h           |   7 +++
 kernel/bpf/core.c             |  25 ++++++++++
 6 files changed, 269 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 2a494643089d..7a7b0b823a98 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -9,15 +9,125 @@
  */
 #include <linux/bug.h>
 
+/*
+ * An overview of the various calling conventions...
+ *
+ * Traditional:
+ *
+ * foo:
+ *   ... code here ...
+ *   ret
+ *
+ * direct caller:
+ *   call foo
+ *
+ * indirect caller:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   call *%r11
+ *
+ *
+ * IBT:
+ *
+ * foo:
+ *   endbr64
+ *   ... code here ...
+ *   ret
+ *
+ * direct caller:
+ *   call foo / call foo+4
+ *
+ * indirect caller:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   call *%r11
+ *
+ *
+ * kCFI:
+ *
+ * __cfi_foo:
+ *   movl $0x12345678, %eax
+ *				# 11 nops when CONFIG_CALL_PADDING
+ * foo:
+ *   endbr64			# when IBT
+ *   ... code here ...
+ *   ret
+ *
+ * direct call:
+ *   call foo			# / call foo+4 when IBT
+ *
+ * indirect call:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   movl $(-0x12345678), %r10d
+ *   addl -4(%r11), %r10d	# -15 when CONFIG_CALL_PADDING
+ *   jz   1f
+ *   ud2
+ * 1:call *%r11
+ *
+ *
+ * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into):
+ *
+ * __cfi_foo:
+ *   endbr64
+ *   subl 0x12345678, %r10d
+ *   jz   foo
+ *   ud2
+ *   nop
+ * foo:
+ *   osp nop3			# was endbr64
+ *   ... code here ...
+ *   ret
+ *
+ * direct caller:
+ *   call foo / call foo+4
+ *
+ * indirect caller:
+ *   lea foo(%rip), %r11
+ *   ...
+ *   movl $0x12345678, %r10d
+ *   subl $16, %r11
+ *   nop4
+ *   call *%r11
+ *
+ */
+enum cfi_mode {
+	CFI_DEFAULT,	/* FineIBT if hardware has IBT, otherwise kCFI */
+	CFI_OFF,	/* Taditional / IBT depending on .config */
+	CFI_KCFI,	/* Optionally CALL_PADDING, IBT, RETPOLINE */
+	CFI_FINEIBT,	/* see arch/x86/kernel/alternative.c */
+};
+
+extern enum cfi_mode cfi_mode;
+
 struct pt_regs;
 
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+extern u32 cfi_bpf_hash;
+
+static inline int cfi_get_offset(void)
+{
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		return 16;
+	case CFI_KCFI:
+		if (IS_ENABLED(CONFIG_CALL_PADDING))
+			return 16;
+		return 5;
+	default:
+		return 0;
+	}
+}
+#define cfi_get_offset cfi_get_offset
+
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
 	return BUG_TRAP_TYPE_NONE;
 }
+#define cfi_bpf_hash 0U
 #endif /* CONFIG_CFI_CLANG */
 
 #endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..d808d3aaec7e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
 #include <asm/fixmap.h>
 #include <asm/paravirt.h>
 #include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
 
 int __read_mostly alternatives_patched;
 
@@ -832,15 +833,43 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
 #endif /* CONFIG_X86_KERNEL_IBT */
 
 #ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT	CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT	CFI_KCFI
+#else
+#define __CFI_DEFAULT	CFI_OFF
+#endif
 
-enum cfi_mode {
-	CFI_DEFAULT,
-	CFI_OFF,
-	CFI_KCFI,
-	CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+				    const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
+"	.type	cfi_bpf_hash,@object				\n"
+"	.globl	cfi_bpf_hash					\n"
+"	.p2align	2, 0x0					\n"
+"cfi_bpf_hash:							\n"
+"	.long	__kcfi_typeid___bpf_prog_runX			\n"
+"	.size	cfi_bpf_hash, 4					\n"
+"	.popsection						\n"
+);
+#endif
+
+#ifdef CONFIG_FINEIBT
 
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
 static bool cfi_rand __ro_after_init = true;
 static u32  cfi_seed __ro_after_init;
 
@@ -1149,8 +1178,10 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
 		goto err;
 
 	if (cfi_rand) {
-		if (builtin)
+		if (builtin) {
 			cfi_seed = get_random_u32();
+			cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+		}
 
 		ret = cfi_rand_preamble(start_cfi, end_cfi);
 		if (ret)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index af4a5de7d93a..5d5b967b111d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,6 +17,7 @@
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
 #include <asm/unwind.h>
+#include <asm/cfi.h>
 
 static bool all_callee_regs_used[4] = {true, true, true, true};
 
@@ -51,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
 
 #ifdef CONFIG_X86_KERNEL_IBT
-#define EMIT_ENDBR()	EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR()		EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR_POISON()	EMIT(gen_endbr_poison(), 4)
 #else
 #define EMIT_ENDBR()
+#define EMIT_ENDBR_POISON()
 #endif
 
 static bool is_imm8(int value)
@@ -304,6 +307,69 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
 	*pprog = prog;
 }
 
+/*
+ * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
+static void emit_fineibt(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT_ENDBR();
+	EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash);	/* subl $hash, %r10d	*/
+	EMIT2(0x74, 0x07);				/* jz.d8 +7		*/
+	EMIT2(0x0f, 0x0b);				/* ud2			*/
+	EMIT1(0x90);					/* nop			*/
+	EMIT_ENDBR_POISON();
+
+	*pprog = prog;
+}
+
+static void emit_kcfi(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT1_off32(0xb8, cfi_bpf_hash);		/* movl $hash, %eax	*/
+#ifdef CONFIG_CALL_PADDING
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+	EMIT1(0x90);
+#endif
+	EMIT_ENDBR();
+
+	*pprog = prog;
+}
+
+static void emit_cfi(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		emit_fineibt(&prog);
+		break;
+
+	case CFI_KCFI:
+		emit_kcfi(&prog);
+		break;
+
+	default:
+		EMIT_ENDBR();
+		break;
+	}
+
+	*pprog = prog;
+}
+
 /*
  * Emit x86-64 prologue code for BPF program.
  * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -315,10 +381,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 {
 	u8 *prog = *pprog;
 
+	emit_cfi(&prog);
 	/* BPF trampoline can be made to work without these nops,
 	 * but let's waste 5 bytes for now and optimize later
 	 */
-	EMIT_ENDBR();
 	memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
 	prog += X86_PATCH_SIZE;
 	if (!ebpf_from_cbpf) {
@@ -3013,9 +3079,16 @@ out_image:
 			jit_data->header = header;
 			jit_data->rw_header = rw_header;
 		}
-		prog->bpf_func = (void *)image;
+		/*
+		 * ctx.prog_offset is used when CFI preambles put code *before*
+		 * the function. See emit_cfi(). For FineIBT specifically this code
+		 * can also be executed and bpf_prog_kallsyms_add() will
+		 * generate an additional symbol to cover this, hence also
+		 * decrement proglen.
+		 */
+		prog->bpf_func = (void *)image + cfi_get_offset();
 		prog->jited = 1;
-		prog->jited_len = proglen;
+		prog->jited_len = proglen - cfi_get_offset();
 	} else {
 		prog = orig_prog;
 	}
@@ -3070,6 +3143,7 @@ void bpf_jit_free(struct bpf_prog *prog)
 			kvfree(jit_data->addrs);
 			kfree(jit_data);
 		}
+		prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
 		hdr = bpf_jit_binary_pack_hdr(prog);
 		bpf_jit_binary_pack_free(hdr, NULL);
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c87c608a3689..9d84c376851a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -29,6 +29,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/static_call.h>
 #include <linux/memcontrol.h>
+#include <linux/cfi.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -1211,7 +1212,11 @@ struct bpf_dispatcher {
 #endif
 };
 
-static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
+#ifndef __bpfcall
+#define __bpfcall __nocfi
+#endif
+
+static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func(
 	const void *ctx,
 	const struct bpf_insn *insnsi,
 	bpf_func_t bpf_func)
@@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 
 #define DEFINE_BPF_DISPATCHER(name)					\
 	__BPF_DISPATCHER_SC(name);					\
-	noinline __nocfi unsigned int bpf_dispatcher_##name##_func(	\
+	noinline __bpfcall unsigned int bpf_dispatcher_##name##_func(	\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
 		bpf_func_t bpf_func)					\
@@ -1453,6 +1458,9 @@ struct bpf_prog_aux {
 	struct bpf_kfunc_desc_tab *kfunc_tab;
 	struct bpf_kfunc_btf_tab *kfunc_btf_tab;
 	u32 size_poke_tab;
+#ifdef CONFIG_FINEIBT
+	struct bpf_ksym ksym_prefix;
+#endif
 	struct bpf_ksym ksym;
 	const struct bpf_prog_ops *ops;
 	struct bpf_map **used_maps;
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 2309d74e77e6..1ed2d96c0cfc 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -11,6 +11,13 @@
 #include <linux/module.h>
 #include <asm/cfi.h>
 
+#ifndef cfi_get_offset
+static inline int cfi_get_offset(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
 				      unsigned long *target, u32 type);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c34513d645c4..5aa6863ac33b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -121,6 +121,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 #endif
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+#ifdef CONFIG_FINEIBT
+	INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
+#endif
 	mutex_init(&fp->aux->used_maps_mutex);
 	mutex_init(&fp->aux->dst_mutex);
 
@@ -683,6 +686,23 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 	fp->aux->ksym.prog = true;
 
 	bpf_ksym_add(&fp->aux->ksym);
+
+#ifdef CONFIG_FINEIBT
+	/*
+	 * When FineIBT, code in the __cfi_foo() symbols can get executed
+	 * and hence unwinder needs help.
+	 */
+	if (cfi_mode != CFI_FINEIBT)
+		return;
+
+	snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN,
+		 "__cfi_%s", fp->aux->ksym.name);
+
+	fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16;
+	fp->aux->ksym_prefix.end   = (unsigned long) fp->bpf_func;
+
+	bpf_ksym_add(&fp->aux->ksym_prefix);
+#endif
 }
 
 void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -691,6 +711,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 		return;
 
 	bpf_ksym_del(&fp->aux->ksym);
+#ifdef CONFIG_FINEIBT
+	if (cfi_mode != CFI_FINEIBT)
+		return;
+	bpf_ksym_del(&fp->aux->ksym_prefix);
+#endif
 }
 
 static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
-- 
cgit v1.2.3


From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:20 +0100
Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI

BPF struct_ops uses __arch_prepare_bpf_trampoline() to write
trampolines for indirect function calls. These tramplines much have
matching CFI.

In order to obtain the correct CFI hash for the various methods, add a
matching structure that contains stub functions, the compiler will
generate correct CFI which we can pilfer for the trampolines.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/include/asm/cfi.h     |  6 ++++
 arch/x86/kernel/alternative.c  | 22 ++++++++++++++
 arch/x86/net/bpf_jit_comp.c    | 66 ++++++++++++++++++++++++++--------------
 include/linux/bpf.h            | 13 ++++++++
 kernel/bpf/bpf_struct_ops.c    | 16 +++++-----
 net/bpf/bpf_dummy_struct_ops.c | 31 ++++++++++++++++++-
 net/ipv4/bpf_tcp_ca.c          | 69 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 191 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 8779abd217b7..1a50b2cd4713 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -123,6 +123,8 @@ static inline int cfi_get_offset(void)
 }
 #define cfi_get_offset cfi_get_offset
 
+extern u32 cfi_get_func_hash(void *func);
+
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
@@ -130,6 +132,10 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 }
 #define cfi_bpf_hash 0U
 #define cfi_bpf_subprog_hash 0U
+static inline u32 cfi_get_func_hash(void *func)
+{
+	return 0;
+}
 #endif /* CONFIG_CFI_CLANG */
 
 #endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index cb393efd5ccd..49c2a62ba5e4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -883,6 +883,28 @@ asm (
 "	.size	cfi_bpf_subprog_hash, 4				\n"
 "	.popsection						\n"
 );
+
+u32 cfi_get_func_hash(void *func)
+{
+	u32 hash;
+
+	func -= cfi_get_offset();
+	switch (cfi_mode) {
+	case CFI_FINEIBT:
+		func += 7;
+		break;
+	case CFI_KCFI:
+		func += 1;
+		break;
+	default:
+		return 0;
+	}
+
+	if (get_kernel_nofault(hash, func))
+		return 0;
+
+	return hash;
+}
 #endif
 
 #ifdef CONFIG_FINEIBT
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 43b8c08fdf8d..c89a4abdd726 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -312,9 +312,8 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
  * in arch/x86/kernel/alternative.c
  */
 
-static void emit_fineibt(u8 **pprog, bool is_subprog)
+static void emit_fineibt(u8 **pprog, u32 hash)
 {
-	u32 hash = is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash;
 	u8 *prog = *pprog;
 
 	EMIT_ENDBR();
@@ -327,9 +326,8 @@ static void emit_fineibt(u8 **pprog, bool is_subprog)
 	*pprog = prog;
 }
 
-static void emit_kcfi(u8 **pprog, bool is_subprog)
+static void emit_kcfi(u8 **pprog, u32 hash)
 {
-	u32 hash = is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash;
 	u8 *prog = *pprog;
 
 	EMIT1_off32(0xb8, hash);			/* movl $hash, %eax	*/
@@ -351,17 +349,17 @@ static void emit_kcfi(u8 **pprog, bool is_subprog)
 	*pprog = prog;
 }
 
-static void emit_cfi(u8 **pprog, bool is_subprog)
+static void emit_cfi(u8 **pprog, u32 hash)
 {
 	u8 *prog = *pprog;
 
 	switch (cfi_mode) {
 	case CFI_FINEIBT:
-		emit_fineibt(&prog, is_subprog);
+		emit_fineibt(&prog, hash);
 		break;
 
 	case CFI_KCFI:
-		emit_kcfi(&prog, is_subprog);
+		emit_kcfi(&prog, hash);
 		break;
 
 	default:
@@ -383,7 +381,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
 {
 	u8 *prog = *pprog;
 
-	emit_cfi(&prog, is_subprog);
+	emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
 	/* BPF trampoline can be made to work without these nops,
 	 * but let's waste 5 bytes for now and optimize later
 	 */
@@ -2510,10 +2508,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 	u8 *prog;
 	bool save_ret;
 
+	/*
+	 * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
+	 * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
+	 * because @func_addr.
+	 */
+	WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
+		     (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
+
 	/* extra registers for struct arguments */
-	for (i = 0; i < m->nr_args; i++)
+	for (i = 0; i < m->nr_args; i++) {
 		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
 			nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+	}
 
 	/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
 	 * are passed through regs, the remains are through stack.
@@ -2596,20 +2603,27 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	prog = rw_image;
 
-	EMIT_ENDBR();
-	/*
-	 * This is the direct-call trampoline, as such it needs accounting
-	 * for the __fentry__ call.
-	 */
-	x86_call_depth_emit_accounting(&prog, NULL);
+	if (flags & BPF_TRAMP_F_INDIRECT) {
+		/*
+		 * Indirect call for bpf_struct_ops
+		 */
+		emit_cfi(&prog, cfi_get_func_hash(func_addr));
+	} else {
+		/*
+		 * Direct-call fentry stub, as such it needs accounting for the
+		 * __fentry__ call.
+		 */
+		x86_call_depth_emit_accounting(&prog, NULL);
+	}
 	EMIT1(0x55);		 /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
-	if (!is_imm8(stack_size))
+	if (!is_imm8(stack_size)) {
 		/* sub rsp, stack_size */
 		EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
-	else
+	} else {
 		/* sub rsp, stack_size */
 		EMIT4(0x48, 0x83, 0xEC, stack_size);
+	}
 	if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
 		EMIT1(0x50);		/* push rax */
 	/* mov QWORD PTR [rbp - rbx_off], rbx */
@@ -2643,10 +2657,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		}
 	}
 
-	if (fentry->nr_links)
+	if (fentry->nr_links) {
 		if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
 			       flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
 			return -EINVAL;
+	}
 
 	if (fmod_ret->nr_links) {
 		branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
@@ -2665,11 +2680,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		restore_regs(m, &prog, regs_off);
 		save_args(m, &prog, arg_stack_off, true);
 
-		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+		if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
 			/* Before calling the original function, restore the
 			 * tail_call_cnt from stack to rax.
 			 */
 			RESTORE_TAIL_CALL_CNT(stack_size);
+		}
 
 		if (flags & BPF_TRAMP_F_ORIG_STACK) {
 			emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
@@ -2698,17 +2714,19 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		/* Update the branches saved in invoke_bpf_mod_ret with the
 		 * aligned address of do_fexit.
 		 */
-		for (i = 0; i < fmod_ret->nr_links; i++)
+		for (i = 0; i < fmod_ret->nr_links; i++) {
 			emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
 					    image + (branches[i] - (u8 *)rw_image), X86_JNE);
+		}
 	}
 
-	if (fexit->nr_links)
+	if (fexit->nr_links) {
 		if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
 			       false, image, rw_image)) {
 			ret = -EINVAL;
 			goto cleanup;
 		}
+	}
 
 	if (flags & BPF_TRAMP_F_RESTORE_REGS)
 		restore_regs(m, &prog, regs_off);
@@ -2725,11 +2743,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 			ret = -EINVAL;
 			goto cleanup;
 		}
-	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+	} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
 		/* Before running the original function, restore the
 		 * tail_call_cnt from stack to rax.
 		 */
 		RESTORE_TAIL_CALL_CNT(stack_size);
+	}
 
 	/* restore return value of orig_call or fentry prog back into RAX */
 	if (save_ret)
@@ -2737,9 +2756,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
 	EMIT1(0xC9); /* leave */
-	if (flags & BPF_TRAMP_F_SKIP_FRAME)
+	if (flags & BPF_TRAMP_F_SKIP_FRAME) {
 		/* skip our return address and return to parent */
 		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
+	}
 	emit_return(&prog, image + (prog - (u8 *)rw_image));
 	/* Make sure the trampoline generation logic doesn't overflow */
 	if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9d84c376851a..db46b3359bf5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1060,6 +1060,17 @@ struct btf_func_model {
  */
 #define BPF_TRAMP_F_TAIL_CALL_CTX	BIT(7)
 
+/*
+ * Indicate the trampoline should be suitable to receive indirect calls;
+ * without this indirectly calling the generated code can result in #UD/#CP,
+ * depending on the CFI options.
+ *
+ * Used by bpf_struct_ops.
+ *
+ * Incompatible with FENTRY usage, overloads @func_addr argument.
+ */
+#define BPF_TRAMP_F_INDIRECT		BIT(8)
+
 /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
  * bytes on x86.
  */
@@ -1697,6 +1708,7 @@ struct bpf_struct_ops {
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
 	u32 type_id;
 	u32 value_id;
+	void *cfi_stubs;
 };
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
@@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
+				      void *stub_func,
 				      void *image, void *image_end);
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 4d53c53fc5aa..02068bd0e4d9 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -352,17 +352,16 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = {
 int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 				      struct bpf_tramp_link *link,
 				      const struct btf_func_model *model,
-				      void *image, void *image_end)
+				      void *stub_func, void *image, void *image_end)
 {
-	u32 flags;
+	u32 flags = BPF_TRAMP_F_INDIRECT;
 	int size;
 
 	tlinks[BPF_TRAMP_FENTRY].links[0] = link;
 	tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
-	/* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
-	 * and it must be used alone.
-	 */
-	flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+
+	if (model->ret_size > 0)
+		flags |= BPF_TRAMP_F_RET_FENTRY_RET;
 
 	size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
 	if (size < 0)
@@ -370,7 +369,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 	if (size > (unsigned long)image_end - (unsigned long)image)
 		return -E2BIG;
 	return arch_prepare_bpf_trampoline(NULL, image, image_end,
-					   model, flags, tlinks, NULL);
+					   model, flags, tlinks, stub_func);
 }
 
 static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
@@ -504,11 +503,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 
 		err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 							&st_ops->func_models[i],
+							*(void **)(st_ops->cfi_stubs + moff),
 							image, image_end);
 		if (err < 0)
 			goto reset_unlock;
 
-		*(void **)(kdata + moff) = image;
+		*(void **)(kdata + moff) = image + cfi_get_offset();
 		image += err;
 
 		/* put prog_id to udata */
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 2748f9d77b18..8906f7bdf4a9 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -12,6 +12,11 @@ extern struct bpf_struct_ops bpf_bpf_dummy_ops;
 /* A common type for test_N with return value in bpf_dummy_ops */
 typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...);
 
+static int dummy_ops_test_ret_function(struct bpf_dummy_ops_state *state, ...)
+{
+	return 0;
+}
+
 struct bpf_dummy_ops_test_args {
 	u64 args[MAX_BPF_FUNC_ARGS];
 	struct bpf_dummy_ops_state state;
@@ -62,7 +67,7 @@ static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args)
 
 static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args)
 {
-	dummy_ops_test_ret_fn test = (void *)image;
+	dummy_ops_test_ret_fn test = (void *)image + cfi_get_offset();
 	struct bpf_dummy_ops_state *state = NULL;
 
 	/* state needs to be NULL if args[0] is 0 */
@@ -119,6 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	op_idx = prog->expected_attach_type;
 	err = bpf_struct_ops_prepare_trampoline(tlinks, link,
 						&st_ops->func_models[op_idx],
+						&dummy_ops_test_ret_function,
 						image, image + PAGE_SIZE);
 	if (err < 0)
 		goto out;
@@ -219,6 +225,28 @@ static void bpf_dummy_unreg(void *kdata)
 {
 }
 
+static int bpf_dummy_test_1(struct bpf_dummy_ops_state *cb)
+{
+	return 0;
+}
+
+static int bpf_dummy_test_2(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
+			    char a3, unsigned long a4)
+{
+	return 0;
+}
+
+static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb)
+{
+	return 0;
+}
+
+static struct bpf_dummy_ops __bpf_bpf_dummy_ops = {
+	.test_1 = bpf_dummy_test_1,
+	.test_2 = bpf_dummy_test_2,
+	.test_sleepable = bpf_dummy_test_sleepable,
+};
+
 struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.verifier_ops = &bpf_dummy_verifier_ops,
 	.init = bpf_dummy_init,
@@ -227,4 +255,5 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.reg = bpf_dummy_reg,
 	.unreg = bpf_dummy_unreg,
 	.name = "bpf_dummy_ops",
+	.cfi_stubs = &__bpf_bpf_dummy_ops,
 };
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index c7bbd8f3c708..634cfafa583d 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -271,6 +271,74 @@ static int bpf_tcp_ca_validate(void *kdata)
 	return tcp_validate_congestion_control(kdata);
 }
 
+static u32 bpf_tcp_ca_ssthresh(struct sock *sk)
+{
+	return 0;
+}
+
+static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+}
+
+static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state)
+{
+}
+
+static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+}
+
+static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags)
+{
+}
+
+static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample)
+{
+}
+
+static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+{
+	return 0;
+}
+
+static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
+{
+}
+
+static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk)
+{
+	return 0;
+}
+
+static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk)
+{
+	return 0;
+}
+
+static void __bpf_tcp_ca_init(struct sock *sk)
+{
+}
+
+static void __bpf_tcp_ca_release(struct sock *sk)
+{
+}
+
+static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
+	.ssthresh = bpf_tcp_ca_ssthresh,
+	.cong_avoid = bpf_tcp_ca_cong_avoid,
+	.set_state = bpf_tcp_ca_set_state,
+	.cwnd_event = bpf_tcp_ca_cwnd_event,
+	.in_ack_event = bpf_tcp_ca_in_ack_event,
+	.pkts_acked = bpf_tcp_ca_pkts_acked,
+	.min_tso_segs = bpf_tcp_ca_min_tso_segs,
+	.cong_control = bpf_tcp_ca_cong_control,
+	.undo_cwnd = bpf_tcp_ca_undo_cwnd,
+	.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
+
+	.init = __bpf_tcp_ca_init,
+	.release = __bpf_tcp_ca_release,
+};
+
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
@@ -281,6 +349,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.init = bpf_tcp_ca_init,
 	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
+	.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
 };
 
 static int __init bpf_tcp_ca_kfunc_init(void)
-- 
cgit v1.2.3


From e4c00339891c074c76f626ac82981963cbba5332 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 15 Dec 2023 10:12:22 +0100
Subject: bpf: Fix dtor CFI

Ensure the various dtor functions match their prototype and retain
their CFI signatures, since they don't have their address taken, they
are prone to not getting CFI, making them impossible to call
indirectly.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20231215092707.799451071@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cpumask.c |  8 +++++++-
 kernel/bpf/helpers.c | 16 ++++++++++++++--
 net/bpf/test_run.c   | 15 +++++++++++++--
 3 files changed, 34 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 7499b7d8c06f..2e73533a3811 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -96,6 +96,12 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
 	migrate_enable();
 }
 
+__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+{
+	bpf_cpumask_release(cpumask);
+}
+CFI_NOSEAL(bpf_cpumask_release_dtor);
+
 /**
  * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask.
  * @cpumask: The cpumask being queried.
@@ -453,7 +459,7 @@ static const struct btf_kfunc_id_set cpumask_kfunc_set = {
 
 BTF_ID_LIST(cpumask_dtor_ids)
 BTF_ID(struct, bpf_cpumask)
-BTF_ID(func, bpf_cpumask_release)
+BTF_ID(func, bpf_cpumask_release_dtor)
 
 static int __init cpumask_kfunc_init(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b0b485126a76..e0c0e3676df8 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2150,6 +2150,12 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
 	put_task_struct_rcu_user(p);
 }
 
+__bpf_kfunc void bpf_task_release_dtor(void *p)
+{
+	put_task_struct_rcu_user(p);
+}
+CFI_NOSEAL(bpf_task_release_dtor);
+
 #ifdef CONFIG_CGROUPS
 /**
  * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
@@ -2174,6 +2180,12 @@ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
 	cgroup_put(cgrp);
 }
 
+__bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp)
+{
+	cgroup_put(cgrp);
+}
+CFI_NOSEAL(bpf_cgroup_release_dtor);
+
 /**
  * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
  * array. A cgroup returned by this kfunc which is not subsequently stored in a
@@ -2570,10 +2582,10 @@ static const struct btf_kfunc_id_set generic_kfunc_set = {
 
 BTF_ID_LIST(generic_dtor_ids)
 BTF_ID(struct, task_struct)
-BTF_ID(func, bpf_task_release)
+BTF_ID(func, bpf_task_release_dtor)
 #ifdef CONFIG_CGROUPS
 BTF_ID(struct, cgroup)
-BTF_ID(func, bpf_cgroup_release)
+BTF_ID(func, bpf_cgroup_release_dtor)
 #endif
 
 BTF_SET8_START(common_btf_ids)
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 711cf5d59816..dfd919374017 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -600,10 +600,21 @@ __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
 	refcount_dec(&p->cnt);
 }
 
+__bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p)
+{
+	bpf_kfunc_call_test_release(p);
+}
+CFI_NOSEAL(bpf_kfunc_call_test_release_dtor);
+
 __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
 {
 }
 
+__bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p)
+{
+}
+CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor);
+
 __bpf_kfunc_end_defs();
 
 BTF_SET8_START(bpf_test_modify_return_ids)
@@ -1671,9 +1682,9 @@ static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
 
 BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids)
 BTF_ID(struct, prog_test_ref_kfunc)
-BTF_ID(func, bpf_kfunc_call_test_release)
+BTF_ID(func, bpf_kfunc_call_test_release_dtor)
 BTF_ID(struct, prog_test_member)
-BTF_ID(func, bpf_kfunc_call_memb_release)
+BTF_ID(func, bpf_kfunc_call_memb_release_dtor)
 
 static int __init bpf_prog_test_run_init(void)
 {
-- 
cgit v1.2.3


From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Fri, 15 Dec 2023 10:12:23 +0100
Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature

As per the earlier patches, BPF sub-programs have bpf_callback_t
signature and CFI expects callers to have matching signature. This is
violated by bpf_prog_aux::bpf_exception_cb().

[peterz: Changelog]
Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com
Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 2 +-
 kernel/bpf/helpers.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index db46b3359bf5..5e694934cf37 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1484,7 +1484,7 @@ struct bpf_prog_aux {
 	int cgroup_atype; /* enum cgroup_bpf_attach_type */
 	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 	char name[BPF_OBJ_NAME_LEN];
-	unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp);
+	u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64);
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e0c0e3676df8..07fd4b5704f3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2537,7 +2537,7 @@ __bpf_kfunc void bpf_throw(u64 cookie)
 	 * which skips compiler generated instrumentation to do the same.
 	 */
 	kasan_unpoison_task_stack_below((void *)(long)ctx.sp);
-	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp);
+	ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0);
 	WARN(1, "A call to BPF exception callback should never return\n");
 }
 
-- 
cgit v1.2.3


From 9c556b7c3f520d42c435c0d78b25c719c060f8a1 Mon Sep 17 00:00:00 2001
From: Naveen N Rao <naveen@kernel.org>
Date: Thu, 14 Dec 2023 10:47:02 +0530
Subject: trace/kprobe: Display the actual notrace function when rejecting a
 probe

Trying to probe update_sd_lb_stats() using perf results in the below
message in the kernel log:
	trace_kprobe: Could not probe notrace function _text

This is because 'perf probe' specifies the kprobe location as an offset
from '_text':
	$ sudo perf probe -D update_sd_lb_stats
	p:probe/update_sd_lb_stats _text+1830728

However, the error message is misleading and doesn't help convey the
actual notrace function that is being probed. Fix this by looking up the
actual function name that is being probed. With this fix, we now get the
below message in the kernel log:
	trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0

Link: https://lore.kernel.org/all/20231214051702.1687300-1-naveen@kernel.org/

Signed-off-by: Naveen N Rao <naveen@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_kprobe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 52f8b537dd0a..c4c6e0e0068b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
 		return -EINVAL;
 
 	if (within_notrace_func(tk)) {
-		pr_warn("Could not probe notrace function %s\n",
-			trace_kprobe_symbol(tk));
+		pr_warn("Could not probe notrace function %ps\n",
+			(void *)trace_kprobe_address(tk));
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 3983c00281d96af2ba611254d679107b5c390627 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sun, 17 Dec 2023 22:55:37 +0100
Subject: bpf: Fail uprobe multi link with negative offset

Currently the __uprobe_register will return 0 (success) when called with
negative offset. The reason is that the call to register_for_each_vma and
then build_map_info won't return error for negative offset. They just won't
do anything - no matching vma is found so there's no registered breakpoint
for the uprobe.

I don't think we can change the behaviour of __uprobe_register and fail
for negative uprobe offset, because apps might depend on that already.

But I think we can still make the change and check for it on bpf multi
link syscall level.

Also moving the __get_user call and check for the offsets to the top of
loop, to fail early without extra __get_user calls for ref_ctr_offset
and cookie arrays.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20231217215538.3361991-2-jolsa@kernel.org
---
 kernel/trace/bpf_trace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 97c0c49c40a0..492d60e9c480 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3391,15 +3391,19 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 		goto error_free;
 
 	for (i = 0; i < cnt; i++) {
-		if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+		if (__get_user(uprobes[i].offset, uoffsets + i)) {
 			err = -EFAULT;
 			goto error_free;
 		}
+		if (uprobes[i].offset < 0) {
+			err = -EINVAL;
+			goto error_free;
+		}
 		if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
 			err = -EFAULT;
 			goto error_free;
 		}
-		if (__get_user(uprobes[i].offset, uoffsets + i)) {
+		if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
 			err = -EFAULT;
 			goto error_free;
 		}
-- 
cgit v1.2.3


From d81f0d7b8b23ec79f80be602ed6129ded27862e8 Mon Sep 17 00:00:00 2001
From: Rae Moar <rmoar@google.com>
Date: Wed, 13 Dec 2023 19:44:17 +0000
Subject: kunit: add KUNIT_INIT_TABLE to init linker section

Add KUNIT_INIT_TABLE to the INIT_DATA linker section.

Alter the KUnit macros to create init tests:
kunit_test_init_section_suites

Update lib/kunit/executor.c to run both the suites in KUNIT_TABLE and
KUNIT_INIT_TABLE.

Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Rae Moar <rmoar@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/asm-generic/vmlinux.lds.h |  9 +++++-
 include/kunit/test.h              | 30 +++++++++++-------
 include/linux/module.h            |  2 ++
 kernel/module/main.c              |  3 ++
 lib/kunit/executor.c              | 64 +++++++++++++++++++++++++++++++++++----
 lib/kunit/test.c                  | 26 +++++++++++-----
 6 files changed, 109 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1107905d37fc..5dd3a61d673d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -700,7 +700,8 @@
 	THERMAL_TABLE(governor)						\
 	EARLYCON_TABLE()						\
 	LSM_TABLE()							\
-	EARLY_LSM_TABLE()
+	EARLY_LSM_TABLE()						\
+	KUNIT_INIT_TABLE()
 
 #define INIT_TEXT							\
 	*(.init.text .init.text.*)					\
@@ -926,6 +927,12 @@
 		. = ALIGN(8);						\
 		BOUNDED_SECTION_POST_LABEL(.kunit_test_suites, __kunit_suites, _start, _end)
 
+/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
+#define KUNIT_INIT_TABLE()						\
+		. = ALIGN(8);						\
+		BOUNDED_SECTION_POST_LABEL(.kunit_init_test_suites, \
+				__kunit_init_suites, _start, _end)
+
 #ifdef CONFIG_BLK_DEV_INITRD
 #define INIT_RAM_FS							\
 	. = ALIGN(4);							\
diff --git a/include/kunit/test.h b/include/kunit/test.h
index 20ed9f9275c9..fe79cd736e94 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -337,6 +337,9 @@ void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites);
 void kunit_exec_run_tests(struct kunit_suite_set *suite_set, bool builtin);
 void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr);
 
+struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set,
+		struct kunit_suite_set suite_set);
+
 #if IS_BUILTIN(CONFIG_KUNIT)
 int kunit_run_all_tests(void);
 #else
@@ -371,6 +374,11 @@ static inline int kunit_run_all_tests(void)
 
 #define kunit_test_suite(suite)	kunit_test_suites(&suite)
 
+#define __kunit_init_test_suites(unique_array, ...)			       \
+	static struct kunit_suite *unique_array[]			       \
+	__aligned(sizeof(struct kunit_suite *))				       \
+	__used __section(".kunit_init_test_suites") = { __VA_ARGS__ }
+
 /**
  * kunit_test_init_section_suites() - used to register one or more &struct
  *				      kunit_suite containing init functions or
@@ -378,21 +386,21 @@ static inline int kunit_run_all_tests(void)
  *
  * @__suites: a statically allocated list of &struct kunit_suite.
  *
- * This functions identically as kunit_test_suites() except that it suppresses
- * modpost warnings for referencing functions marked __init or data marked
- * __initdata; this is OK because currently KUnit only runs tests upon boot
- * during the init phase or upon loading a module during the init phase.
+ * This functions similar to kunit_test_suites() except that it compiles the
+ * list of suites during init phase.
+ *
+ * This macro also suffixes the array and suite declarations it makes with
+ * _probe; so that modpost suppresses warnings about referencing init data
+ * for symbols named in this manner.
  *
- * NOTE TO KUNIT DEVS: If we ever allow KUnit tests to be run after boot, these
- * tests must be excluded.
+ * Note: these init tests are not able to be run after boot so there is no
+ * "run" debugfs file generated for these tests.
  *
- * The only thing this macro does that's different from kunit_test_suites is
- * that it suffixes the array and suite declarations it makes with _probe;
- * modpost suppresses warnings about referencing init data for symbols named in
- * this manner.
+ * Also, do not mark the suite or test case structs with __initdata because
+ * they will be used after the init phase with debugfs.
  */
 #define kunit_test_init_section_suites(__suites...)			\
-	__kunit_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe),	\
+	__kunit_init_test_suites(CONCATENATE(__UNIQUE_ID(array), _probe), \
 			    ##__suites)
 
 #define kunit_test_init_section_suite(suite)	\
diff --git a/include/linux/module.h b/include/linux/module.h
index a98e188cf37b..9cd0009bd050 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -540,6 +540,8 @@ struct module {
 	struct static_call_site *static_call_sites;
 #endif
 #if IS_ENABLED(CONFIG_KUNIT)
+	int num_kunit_init_suites;
+	struct kunit_suite **kunit_init_suites;
 	int num_kunit_suites;
 	struct kunit_suite **kunit_suites;
 #endif
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 98fedfdb8db5..36681911c05a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	mod->kunit_suites = section_objs(info, ".kunit_test_suites",
 					      sizeof(*mod->kunit_suites),
 					      &mod->num_kunit_suites);
+	mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+					      sizeof(*mod->kunit_init_suites),
+					      &mod->num_kunit_init_suites);
 #endif
 
 	mod->extable = section_objs(info, "__ex_table",
diff --git a/lib/kunit/executor.c b/lib/kunit/executor.c
index 1236b3cd2fbb..847329c51e91 100644
--- a/lib/kunit/executor.c
+++ b/lib/kunit/executor.c
@@ -12,6 +12,8 @@
  */
 extern struct kunit_suite * const __kunit_suites_start[];
 extern struct kunit_suite * const __kunit_suites_end[];
+extern struct kunit_suite * const __kunit_init_suites_start[];
+extern struct kunit_suite * const __kunit_init_suites_end[];
 
 static char *action_param;
 
@@ -292,6 +294,33 @@ void kunit_exec_list_tests(struct kunit_suite_set *suite_set, bool include_attr)
 	}
 }
 
+struct kunit_suite_set kunit_merge_suite_sets(struct kunit_suite_set init_suite_set,
+		struct kunit_suite_set suite_set)
+{
+	struct kunit_suite_set total_suite_set = {NULL, NULL};
+	struct kunit_suite **total_suite_start = NULL;
+	size_t init_num_suites, num_suites, suite_size;
+
+	init_num_suites = init_suite_set.end - init_suite_set.start;
+	num_suites = suite_set.end - suite_set.start;
+	suite_size = sizeof(suite_set.start);
+
+	/* Allocate memory for array of all kunit suites */
+	total_suite_start = kmalloc_array(init_num_suites + num_suites, suite_size, GFP_KERNEL);
+	if (!total_suite_start)
+		return total_suite_set;
+
+	/* Append init suites and then all other kunit suites */
+	memcpy(total_suite_start, init_suite_set.start, init_num_suites * suite_size);
+	memcpy(total_suite_start + init_num_suites, suite_set.start, num_suites * suite_size);
+
+	/* Set kunit suite set start and end */
+	total_suite_set.start = total_suite_start;
+	total_suite_set.end = total_suite_start + (init_num_suites + num_suites);
+
+	return total_suite_set;
+}
+
 #if IS_BUILTIN(CONFIG_KUNIT)
 
 static char *kunit_shutdown;
@@ -313,21 +342,41 @@ static void kunit_handle_shutdown(void)
 
 int kunit_run_all_tests(void)
 {
-	struct kunit_suite_set suite_set = {
+	struct kunit_suite_set suite_set = {NULL, NULL};
+	struct kunit_suite_set filtered_suite_set = {NULL, NULL};
+	struct kunit_suite_set init_suite_set = {
+		__kunit_init_suites_start, __kunit_init_suites_end,
+	};
+	struct kunit_suite_set normal_suite_set = {
 		__kunit_suites_start, __kunit_suites_end,
 	};
+	size_t init_num_suites = init_suite_set.end - init_suite_set.start;
 	int err = 0;
+
+	if (init_num_suites > 0) {
+		suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set);
+		if (!suite_set.start)
+			goto out;
+	} else
+		suite_set = normal_suite_set;
+
 	if (!kunit_enabled()) {
 		pr_info("kunit: disabled\n");
-		goto out;
+		goto free_out;
 	}
 
 	if (filter_glob_param || filter_param) {
-		suite_set = kunit_filter_suites(&suite_set, filter_glob_param,
+		filtered_suite_set = kunit_filter_suites(&suite_set, filter_glob_param,
 				filter_param, filter_action_param, &err);
+
+		/* Free original suite set before using filtered suite set */
+		if (init_num_suites > 0)
+			kfree(suite_set.start);
+		suite_set = filtered_suite_set;
+
 		if (err) {
 			pr_err("kunit executor: error filtering suites: %d\n", err);
-			goto out;
+			goto free_out;
 		}
 	}
 
@@ -340,9 +389,12 @@ int kunit_run_all_tests(void)
 	else
 		pr_err("kunit executor: unknown action '%s'\n", action_param);
 
-	if (filter_glob_param || filter_param) { /* a copy was made of each suite */
+free_out:
+	if (filter_glob_param || filter_param)
 		kunit_free_suite_set(suite_set);
-	}
+	else if (init_num_suites > 0)
+		/* Don't use kunit_free_suite_set because suites aren't individually allocated */
+		kfree(suite_set.start);
 
 out:
 	kunit_handle_shutdown();
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 7deee3701d20..6b60d85ce108 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -742,28 +742,40 @@ EXPORT_SYMBOL_GPL(__kunit_test_suites_exit);
 #ifdef CONFIG_MODULES
 static void kunit_module_init(struct module *mod)
 {
-	struct kunit_suite_set suite_set = {
+	struct kunit_suite_set suite_set, filtered_set;
+	struct kunit_suite_set normal_suite_set = {
 		mod->kunit_suites, mod->kunit_suites + mod->num_kunit_suites,
 	};
+	struct kunit_suite_set init_suite_set = {
+		mod->kunit_init_suites, mod->kunit_init_suites + mod->num_kunit_init_suites,
+	};
 	const char *action = kunit_action();
 	int err = 0;
 
-	suite_set = kunit_filter_suites(&suite_set,
+	if (mod->num_kunit_init_suites > 0)
+		suite_set = kunit_merge_suite_sets(init_suite_set, normal_suite_set);
+	else
+		suite_set = normal_suite_set;
+
+	filtered_set = kunit_filter_suites(&suite_set,
 					kunit_filter_glob() ?: "*.*",
 					kunit_filter(), kunit_filter_action(),
 					&err);
 	if (err)
 		pr_err("kunit module: error filtering suites: %d\n", err);
 
-	mod->kunit_suites = (struct kunit_suite **)suite_set.start;
-	mod->num_kunit_suites = suite_set.end - suite_set.start;
+	mod->kunit_suites = (struct kunit_suite **)filtered_set.start;
+	mod->num_kunit_suites = filtered_set.end - filtered_set.start;
+
+	if (mod->num_kunit_init_suites > 0)
+		kfree(suite_set.start);
 
 	if (!action)
-		kunit_exec_run_tests(&suite_set, false);
+		kunit_exec_run_tests(&filtered_set, false);
 	else if (!strcmp(action, "list"))
-		kunit_exec_list_tests(&suite_set, false);
+		kunit_exec_list_tests(&filtered_set, false);
 	else if (!strcmp(action, "list_attr"))
-		kunit_exec_list_tests(&suite_set, true);
+		kunit_exec_list_tests(&filtered_set, true);
 	else
 		pr_err("kunit: unknown action '%s'\n", action);
 }
-- 
cgit v1.2.3


From 8e432e6197cef6250dfd6fdffd41c06613c874ca Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 18 Dec 2023 09:36:01 -0800
Subject: bpf: Ensure precise is reset to false in __mark_reg_const_zero()

It is safe to always start with imprecise SCALAR_VALUE register.
Previously __mark_reg_const_zero() relied on caller to reset precise
mark, but it's very error prone and we already missed it in a few
places. So instead make __mark_reg_const_zero() reset precision always,
as it's a safe default for SCALAR_VALUE. Explanation is basically the
same as for why we are resetting (or rather not setting) precision in
current state. If necessary, precision propagation will set it to
precise correctly.

As such, also remove a big comment about forward precision propagation
in mark_reg_stack_read() and avoid unnecessarily setting precision to
true after reading from STACK_ZERO stack. Again, precision propagation
will correctly handle this, if that SCALAR_VALUE register will ever be
needed to be precise.

Reported-by: Maxim Mikityanskiy <maxtram95@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Maxim Mikityanskiy <maxtram95@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20231218173601.53047-1-andrii@kernel.org
---
 kernel/bpf/verifier.c                              | 29 ++++++++--------------
 .../selftests/bpf/progs/verifier_spill_fill.c      | 10 ++++++--
 2 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1863826a4ac3..9456ee0ad129 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1777,10 +1777,14 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
 	__mark_reg_known(reg, 0);
 }
 
-static void __mark_reg_const_zero(struct bpf_reg_state *reg)
+static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	__mark_reg_known(reg, 0);
 	reg->type = SCALAR_VALUE;
+	/* all scalars are assumed imprecise initially (unless unprivileged,
+	 * in which case everything is forced to be precise)
+	 */
+	reg->precise = !env->bpf_capable;
 }
 
 static void mark_reg_known_zero(struct bpf_verifier_env *env,
@@ -4706,21 +4710,10 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
 		zeros++;
 	}
 	if (zeros == max_off - min_off) {
-		/* any access_size read into register is zero extended,
-		 * so the whole register == const_zero
-		 */
-		__mark_reg_const_zero(&state->regs[dst_regno]);
-		/* backtracking doesn't support STACK_ZERO yet,
-		 * so mark it precise here, so that later
-		 * backtracking can stop here.
-		 * Backtracking may not need this if this register
-		 * doesn't participate in pointer adjustment.
-		 * Forward propagation of precise flag is not
-		 * necessary either. This mark is only to stop
-		 * backtracking. Any register that contributed
-		 * to const 0 was marked precise before spill.
+		/* Any access_size read into register is zero extended,
+		 * so the whole register == const_zero.
 		 */
-		state->regs[dst_regno].precise = true;
+		__mark_reg_const_zero(env, &state->regs[dst_regno]);
 	} else {
 		/* have read misc data from the stack */
 		mark_reg_unknown(env, state->regs, dst_regno);
@@ -4803,11 +4796,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 
 				if (spill_cnt == size &&
 				    tnum_is_const(reg->var_off) && reg->var_off.value == 0) {
-					__mark_reg_const_zero(&state->regs[dst_regno]);
+					__mark_reg_const_zero(env, &state->regs[dst_regno]);
 					/* this IS register fill, so keep insn_flags */
 				} else if (zero_cnt == size) {
 					/* similarly to mark_reg_stack_read(), preserve zeroes */
-					__mark_reg_const_zero(&state->regs[dst_regno]);
+					__mark_reg_const_zero(env, &state->regs[dst_regno]);
 					insn_flags = 0; /* not restoring original register state */
 				} else {
 					mark_reg_unknown(env, state->regs, dst_regno);
@@ -7963,7 +7956,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
 	/* switch to DRAINED state, but keep the depth unchanged */
 	/* mark current iter state as drained and assume returned NULL */
 	cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
-	__mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+	__mark_reg_const_zero(env, &cur_fr->regs[BPF_REG_0]);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
index 508f5d6c7347..39fe3372e0e0 100644
--- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
+++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c
@@ -499,8 +499,14 @@ __success
 __msg("2: (7a) *(u64 *)(r10 -8) = 0          ; R10=fp0 fp-8_w=00000000")
 /* but fp-16 is spilled IMPRECISE zero const reg */
 __msg("4: (7b) *(u64 *)(r10 -16) = r0        ; R0_w=0 R10=fp0 fp-16_w=0")
-/* and now check that precision propagation works even for such tricky case */
-__msg("10: (71) r2 = *(u8 *)(r10 -9)         ; R2_w=P0 R10=fp0 fp-16_w=0")
+/* validate that assigning R2 from STACK_ZERO doesn't mark register
+ * precise immediately; if necessary, it will be marked precise later
+ */
+__msg("6: (71) r2 = *(u8 *)(r10 -1)          ; R2_w=0 R10=fp0 fp-8_w=00000000")
+/* similarly, when R2 is assigned from spilled register, it is initially
+ * imprecise, but will be marked precise later once it is used in precise context
+ */
+__msg("10: (71) r2 = *(u8 *)(r10 -9)         ; R2_w=0 R10=fp0 fp-16_w=0")
 __msg("11: (0f) r1 += r2")
 __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1")
 __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)")
-- 
cgit v1.2.3


From b803d7c664d55705831729d2f2e29c874bcd62ea Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Mon, 18 Dec 2023 23:07:12 -0500
Subject: ring-buffer: Fix slowpath of interrupted event

To synchronize the timestamps with the ring buffer reservation, there are
two timestamps that are saved in the buffer meta data.

1. before_stamp
2. write_stamp

When the two are equal, the write_stamp is considered valid, as in, it may
be used to calculate the delta of the next event as the write_stamp is the
timestamp of the previous reserved event on the buffer.

This is done by the following:

 /*A*/	w = current position on the ring buffer
	before = before_stamp
	after = write_stamp
	ts = read current timestamp

	if (before != after) {
		write_stamp is not valid, force adding an absolute
		timestamp.
	}

 /*B*/	before_stamp = ts

 /*C*/	write = local_add_return(event length, position on ring buffer)

	if (w == write - event length) {
		/* Nothing interrupted between A and C */
 /*E*/		write_stamp = ts;
		delta = ts - after
		/*
		 * If nothing interrupted again,
		 * before_stamp == write_stamp and write_stamp
		 * can be used to calculate the delta for
		 * events that come in after this one.
		 */
	} else {

		/*
		 * The slow path!
		 * Was interrupted between A and C.
		 */

This is the place that there's a bug. We currently have:

		after = write_stamp
		ts = read current timestamp

 /*F*/		if (write == current position on the ring buffer &&
		    after < ts && cmpxchg(write_stamp, after, ts)) {

			delta = ts - after;

		} else {
			delta = 0;
		}

The assumption is that if the current position on the ring buffer hasn't
moved between C and F, then it also was not interrupted, and that the last
event written has a timestamp that matches the write_stamp. That is the
write_stamp is valid.

But this may not be the case:

If a task context event was interrupted by softirq between B and C.

And the softirq wrote an event that got interrupted by a hard irq between
C and E.

and the hard irq wrote an event (does not need to be interrupted)

We have:

 /*B*/ before_stamp = ts of normal context

   ---> interrupted by softirq

	/*B*/ before_stamp = ts of softirq context

	  ---> interrupted by hardirq

		/*B*/ before_stamp = ts of hard irq context
		/*E*/ write_stamp = ts of hard irq context

		/* matches and write_stamp valid */
	  <----

	/*E*/ write_stamp = ts of softirq context

	/* No longer matches before_stamp, write_stamp is not valid! */

   <---

 w != write - length, go to slow path

// Right now the order of events in the ring buffer is:
//
// |-- softirq event --|-- hard irq event --|-- normal context event --|
//

 after = write_stamp (this is the ts of softirq)
 ts = read current timestamp

 if (write == current position on the ring buffer [true] &&
     after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) {

	delta = ts - after  [Wrong!]

The delta is to be between the hard irq event and the normal context
event, but the above logic made the delta between the softirq event and
the normal context event, where the hard irq event is between the two. This
will shift all the remaining event timestamps on the sub-buffer
incorrectly.

The write_stamp is only valid if it matches the before_stamp. The cmpxchg
does nothing to help this.

Instead, the following logic can be done to fix this:

	before = before_stamp
	ts = read current timestamp
	before_stamp = ts

	after = write_stamp

	if (write == current position on the ring buffer &&
	    after == before && after < ts) {

		delta = ts - after

	} else {
		delta = 0;
	}

The above will only use the write_stamp if it still matches before_stamp
and was tested to not have changed since C.

As a bonus, with this logic we do not need any 64-bit cmpxchg() at all!

This means the 32-bit rb_time_t workaround can finally be removed. But
that's for a later time.

Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 79 ++++++++++++++--------------------------------
 1 file changed, 24 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5a114e752f11..83eab547f1d1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -700,48 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
 	return local_try_cmpxchg(l, &expect, set);
 }
 
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
-	unsigned long cnt, top, bottom, msb;
-	unsigned long cnt2, top2, bottom2, msb2;
-	u64 val;
-
-	/* Any interruptions in this function should cause a failure */
-	cnt = local_read(&t->cnt);
-
-	/* The cmpxchg always fails if it interrupted an update */
-	 if (!__rb_time_read(t, &val, &cnt2))
-		 return false;
-
-	 if (val != expect)
-		 return false;
-
-	 if ((cnt & 3) != cnt2)
-		 return false;
-
-	 cnt2 = cnt + 1;
-
-	 rb_time_split(val, &top, &bottom, &msb);
-	 msb = rb_time_val_cnt(msb, cnt);
-	 top = rb_time_val_cnt(top, cnt);
-	 bottom = rb_time_val_cnt(bottom, cnt);
-
-	 rb_time_split(set, &top2, &bottom2, &msb2);
-	 msb2 = rb_time_val_cnt(msb2, cnt);
-	 top2 = rb_time_val_cnt(top2, cnt2);
-	 bottom2 = rb_time_val_cnt(bottom2, cnt2);
-
-	if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
-		return false;
-	if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
-		return false;
-	if (!rb_time_read_cmpxchg(&t->top, top, top2))
-		return false;
-	if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
-		return false;
-	return true;
-}
-
 #else /* 64 bits */
 
 /* local64_t always succeeds */
@@ -755,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val)
 {
 	local64_set(&t->time, val);
 }
-
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
-	return local64_try_cmpxchg(&t->time, &expect, set);
-}
 #endif
 
 /*
@@ -3610,20 +3563,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	} else {
 		u64 ts;
 		/* SLOW PATH - Interrupted between A and C */
-		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-		/* Was interrupted before here, write_stamp must be valid */
+
+		/* Save the old before_stamp */
+		a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
 		RB_WARN_ON(cpu_buffer, !a_ok);
+
+		/*
+		 * Read a new timestamp and update the before_stamp to make
+		 * the next event after this one force using an absolute
+		 * timestamp. This is in case an interrupt were to come in
+		 * between E and F.
+		 */
 		ts = rb_time_stamp(cpu_buffer->buffer);
+		rb_time_set(&cpu_buffer->before_stamp, ts);
+
+		barrier();
+ /*E*/		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+		/* Was interrupted before here, write_stamp must be valid */
+		RB_WARN_ON(cpu_buffer, !a_ok);
 		barrier();
- /*E*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
-		    info->after < ts &&
-		    rb_time_cmpxchg(&cpu_buffer->write_stamp,
-				    info->after, ts)) {
-			/* Nothing came after this event between C and E */
+ /*F*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+		    info->after == info->before && info->after < ts) {
+			/*
+			 * Nothing came after this event between C and F, it is
+			 * safe to use info->after for the delta as it
+			 * matched info->before and is still valid.
+			 */
 			info->delta = ts - info->after;
 		} else {
 			/*
-			 * Interrupted between C and E:
+			 * Interrupted between C and F:
 			 * Lost the previous events time stamp. Just set the
 			 * delta to zero, and this will be the same time as
 			 * the event this event interrupted. And the events that
-- 
cgit v1.2.3


From d23569979ca1cd139a42c410e0c7b9e6014c3b3a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 13 Dec 2023 09:37:01 -0500
Subject: tracing: Allow creating instances with specified system events

A trace instance may only need to enable specific events. As the eventfs
directory of an instance currently creates all events which adds overhead,
allow internal instances to be created with just the events in systems
that they care about. This currently only deals with systems and not
individual events, but this should bring down the overhead of creating
instances for specific use cases quite bit.

The trace_array_get_by_name() now has another parameter "systems". This
parameter is a const string pointer of a comma/space separated list of
event systems that should be created by the trace_array. (Note if the
trace_array already exists, this parameter is ignored).

The list of systems is saved and if a module is loaded, its events will
not be added unless the system for those events also match the systems
string.

Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Sean Paul <seanpaul@chromium.org>
Cc: Arun Easi   <aeasi@marvell.com>
Cc: Daniel Wagner <dwagner@suse.de>
Tested-by: Dmytro Maluka <dmaluka@chromium.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 drivers/scsi/qla2xxx/qla_os.c       |  2 +-
 include/linux/trace.h               |  4 ++--
 kernel/trace/trace.c                | 23 ++++++++++++++----
 kernel/trace/trace.h                |  1 +
 kernel/trace/trace_boot.c           |  2 +-
 kernel/trace/trace_events.c         | 48 ++++++++++++++++++++++++++++++++++---
 samples/ftrace/sample-trace-array.c |  2 +-
 7 files changed, 70 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 03348f605c2e..dd674378f2f3 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *work)
 static void
 qla_trace_init(void)
 {
-	qla_trc_array = trace_array_get_by_name("qla2xxx");
+	qla_trc_array = trace_array_get_by_name("qla2xxx", NULL);
 	if (!qla_trc_array) {
 		ql_log(ql_log_fatal, NULL, 0x0001,
 		       "Unable to create qla2xxx trace instance, instance logging will be disabled.\n");
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 2a70a447184c..fdcd76b7be83 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
 		       const char *fmt, ...);
 int trace_array_init_printk(struct trace_array *tr);
 void trace_array_put(struct trace_array *tr);
-struct trace_array *trace_array_get_by_name(const char *name);
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems);
 int trace_array_destroy(struct trace_array *tr);
 
 /* For osnoise tracer */
@@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr)
 static inline void trace_array_put(struct trace_array *tr)
 {
 }
-static inline struct trace_array *trace_array_get_by_name(const char *name)
+static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
 {
 	return NULL;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 199df497db07..59e39b652afb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9490,7 +9490,8 @@ static int trace_array_create_dir(struct trace_array *tr)
 	return ret;
 }
 
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
 {
 	struct trace_array *tr;
 	int ret;
@@ -9510,6 +9511,12 @@ static struct trace_array *trace_array_create(const char *name)
 	if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
 		goto out_free_tr;
 
+	if (systems) {
+		tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+		if (!tr->system_names)
+			goto out_free_tr;
+	}
+
 	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
 	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9556,12 +9563,18 @@ static struct trace_array *trace_array_create(const char *name)
 	free_trace_buffers(tr);
 	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
+	kfree_const(tr->system_names);
 	kfree(tr->name);
 	kfree(tr);
 
 	return ERR_PTR(ret);
 }
 
+static struct trace_array *trace_array_create(const char *name)
+{
+	return trace_array_create_systems(name, NULL);
+}
+
 static int instance_mkdir(const char *name)
 {
 	struct trace_array *tr;
@@ -9587,6 +9600,7 @@ out_unlock:
 /**
  * trace_array_get_by_name - Create/Lookup a trace array, given its name.
  * @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
  *
  * Returns pointer to trace array with given name.
  * NULL, if it cannot be created.
@@ -9600,7 +9614,7 @@ out_unlock:
  * trace_array_put() is called, user space can not delete it.
  *
  */
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
 {
 	struct trace_array *tr;
 
@@ -9612,7 +9626,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
 			goto out_unlock;
 	}
 
-	tr = trace_array_create(name);
+	tr = trace_array_create_systems(name, systems);
 
 	if (IS_ERR(tr))
 		tr = NULL;
@@ -9659,6 +9673,7 @@ static int __remove_instance(struct trace_array *tr)
 
 	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
+	kfree_const(tr->system_names);
 	kfree(tr->name);
 	kfree(tr);
 
@@ -10377,7 +10392,7 @@ __init static void enable_instances(void)
 		if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
 			do_allocate_snapshot(tok);
 
-		tr = trace_array_get_by_name(tok);
+		tr = trace_array_get_by_name(tok, NULL);
 		if (!tr) {
 			pr_warn("Failed to create instance buffer %s\n", curr_str);
 			continue;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0489e72c8169..79180aed13ee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,7 @@ struct trace_array {
 	unsigned char		trace_flags_index[TRACE_FLAGS_MAX_SIZE];
 	unsigned int		flags;
 	raw_spinlock_t		start_lock;
+	const char		*system_names;
 	struct list_head	err_log;
 	struct dentry		*dir;
 	struct dentry		*options;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e155b..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
 		if (!p || *p == '\0')
 			continue;
 
-		tr = trace_array_get_by_name(p);
+		tr = trace_array_get_by_name(p, NULL);
 		if (!tr) {
 			pr_err("Failed to get trace instance %s\n", p);
 			continue;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f29e815ca5b2..b70d03818038 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2896,6 +2896,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 	up_write(&trace_event_sem);
 }
 
+static bool event_in_systems(struct trace_event_call *call,
+			     const char *systems)
+{
+	const char *system;
+	const char *p;
+
+	if (!systems)
+		return true;
+
+	system = call->class->system;
+	p = strstr(systems, system);
+	if (!p)
+		return false;
+
+	if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+		return false;
+
+	p += strlen(system);
+	return !*p || isspace(*p) || *p == ',';
+}
+
 static struct trace_event_file *
 trace_create_new_event(struct trace_event_call *call,
 		       struct trace_array *tr)
@@ -2905,9 +2926,12 @@ trace_create_new_event(struct trace_event_call *call,
 	struct trace_event_file *file;
 	unsigned int first;
 
+	if (!event_in_systems(call, tr->system_names))
+		return NULL;
+
 	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	pid_list = rcu_dereference_protected(tr->filtered_pids,
 					     lockdep_is_held(&event_mutex));
@@ -2972,8 +2996,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
 	struct trace_event_file *file;
 
 	file = trace_create_new_event(call, tr);
+	/*
+	 * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+	 * allocation, or NULL if the event is not part of the tr->system_names.
+	 * When the event is not part of the tr->system_names, return zero, not
+	 * an error.
+	 */
 	if (!file)
-		return -ENOMEM;
+		return 0;
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 
 	if (eventdir_initialized)
 		return event_create_dir(tr->event_dir, file);
@@ -3012,8 +3045,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
 	int ret;
 
 	file = trace_create_new_event(call, tr);
+	/*
+	 * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+	 * allocation, or NULL if the event is not part of the tr->system_names.
+	 * When the event is not part of the tr->system_names, return zero, not
+	 * an error.
+	 */
 	if (!file)
-		return -ENOMEM;
+		return 0;
+
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 
 	ret = event_define_fields(call);
 	if (ret)
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c
index 6aba02a31c96..d0ee9001c7b3 100644
--- a/samples/ftrace/sample-trace-array.c
+++ b/samples/ftrace/sample-trace-array.c
@@ -105,7 +105,7 @@ static int __init sample_trace_array_init(void)
 	 * NOTE: This function increments the reference counter
 	 * associated with the trace array - "tr".
 	 */
-	tr = trace_array_get_by_name("sample-instance");
+	tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes");
 
 	if (!tr)
 		return -1;
-- 
cgit v1.2.3


From 0b9036efd83d4adefab197a1ec98c496c9df44f0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Mon, 11 Dec 2023 13:16:23 -0500
Subject: ring-buffer: Add offset of events in dump on mismatch

On bugs that have the ring buffer timestamp get out of sync, the config
CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS, that checks for it and if it is
detected it causes a dump of the bad sub buffer.

It shows each event and their timestamp as well as the delta in the event.
But it's also good to see the offset into the subbuffer for that event to
know if how close to the end it is.

Also print where the last event actually ended compared to where it was
expected to end.

Link: https://lore.kernel.org/linux-trace-kernel/20231211131623.59eaebd2@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 83eab547f1d1..bfe2697a92ee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3358,29 +3358,34 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
 		case RINGBUF_TYPE_TIME_EXTEND:
 			delta = rb_event_time_stamp(event);
 			ts += delta;
-			pr_warn("  [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+			pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+				e, ts, delta);
 			break;
 
 		case RINGBUF_TYPE_TIME_STAMP:
 			delta = rb_event_time_stamp(event);
 			ts = rb_fix_abs_ts(delta, ts);
-			pr_warn("  [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+			pr_warn(" 0x%x:  [%lld] absolute:%lld TIME STAMP\n",
+				e, ts, delta);
 			break;
 
 		case RINGBUF_TYPE_PADDING:
 			ts += event->time_delta;
-			pr_warn("  [%lld] delta:%d PADDING\n", ts, event->time_delta);
+			pr_warn(" 0x%x:  [%lld] delta:%d PADDING\n",
+				e, ts, event->time_delta);
 			break;
 
 		case RINGBUF_TYPE_DATA:
 			ts += event->time_delta;
-			pr_warn("  [%lld] delta:%d\n", ts, event->time_delta);
+			pr_warn(" 0x%x:  [%lld] delta:%d\n",
+				e, ts, event->time_delta);
 			break;
 
 		default:
 			break;
 		}
 	}
+	pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
 }
 
 static DEFINE_PER_CPU(atomic_t, checking);
-- 
cgit v1.2.3


From 8ec90be7f15fac42992ea821be929d3b06cd0fd9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 13:19:01 -0500
Subject: tracing: Allow for max buffer data size trace_marker writes

Allow a trace write to be as big as the ring buffer tracing data will
allow. Currently, it only allows writes of 1KB in size, but there's no
reason that it cannot allow what the ring buffer can hold.

Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 15 +++++++++++++++
 kernel/trace/trace.c        | 31 ++++++++++++++++++++++++-------
 3 files changed, 40 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..b1b03b2c0f08 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
 bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
 
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer);
 
 void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
 void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bfe2697a92ee..16b640d824f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5190,6 +5190,21 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+	/* If abs timestamp is requested, events have a timestamp too */
+	if (ring_buffer_time_stamp_abs(buffer))
+		return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND;
+	return BUF_MAX_DATA_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
 static void rb_clear_buffer_page(struct buffer_page *page)
 {
 	local_set(&page->write, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 59e39b652afb..dba1328e454b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7278,8 +7278,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	enum event_trigger_type tt = ETT_NONE;
 	struct trace_buffer *buffer;
 	struct print_entry *entry;
+	int meta_size;
 	ssize_t written;
-	int size;
+	size_t size;
 	int len;
 
 /* Used in tracing_mark_raw_write() as well */
@@ -7292,12 +7293,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (!(tr->trace_flags & TRACE_ITER_MARKERS))
 		return -EINVAL;
 
-	if (cnt > TRACE_BUF_SIZE)
-		cnt = TRACE_BUF_SIZE;
-
-	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+	if ((ssize_t)cnt < 0)
+		return -EINVAL;
 
-	size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+	meta_size = sizeof(*entry) + 2;  /* add '\0' and possible '\n' */
+ again:
+	size = cnt + meta_size;
 
 	/* If less than "<faulted>", then make sure we can still add that */
 	if (cnt < FAULTED_SIZE)
@@ -7306,9 +7307,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					    tracing_gen_ctx());
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * If the size was greater than what was allowed, then
+		 * make it smaller and try again.
+		 */
+		if (size > ring_buffer_max_event_size(buffer)) {
+			/* cnt < FAULTED size should never be bigger than max */
+			if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+				return -EBADF;
+			cnt = ring_buffer_max_event_size(buffer) - meta_size;
+			/* The above should only happen once */
+			if (WARN_ON_ONCE(cnt + meta_size == size))
+				return -EBADF;
+			goto again;
+		}
+
 		/* Ring buffer disabled, return as if not open for write */
 		return -EBADF;
+	}
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = _THIS_IP_;
-- 
cgit v1.2.3


From 40fc60e36c60ba85b2974e507b67df40c94e9578 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sat, 9 Dec 2023 17:52:20 -0500
Subject: trace_seq: Increase the buffer size to almost two pages

Now that trace_marker can hold more than 1KB string, and can write as much
as the ring buffer can hold, the trace_seq is not big enough to hold
writes:

 ~# a="1234567890"
 ~# cnt=4080
 ~# s=""
 ~# while [ $cnt -gt 10 ]; do
 ~#	s="${s}${a}"
 ~#	cnt=$((cnt-10))
 ~# done
 ~# echo $s > trace_marker
 ~# cat trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 2/2   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
            <...>-860     [002] .....   105.543465: tracing_mark_write[LINE TOO BIG]
            <...>-860     [002] .....   105.543496: tracing_mark_write: 789012345678901234567890

By increasing the trace_seq buffer to almost two pages, it can now print
out the first line.

This also subtracts the rest of the trace_seq fields from the buffer, so
that the entire trace_seq is now PAGE_SIZE aligned.

Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_seq.h | 9 ++++++---
 kernel/trace/trace.c      | 6 +++---
 kernel/trace/trace_seq.c  | 3 ---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 3691e0e76a1a..9ec229dfddaa 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -8,11 +8,14 @@
 
 /*
  * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
+ * to create a string of data to use.
  */
 
+#define TRACE_SEQ_BUFFER_SIZE	(PAGE_SIZE * 2 - \
+	(sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
+
 struct trace_seq {
-	char			buffer[PAGE_SIZE];
+	char			buffer[TRACE_SEQ_BUFFER_SIZE];
 	struct seq_buf		seq;
 	size_t			readpos;
 	int			full;
@@ -21,7 +24,7 @@ struct trace_seq {
 static inline void
 trace_seq_init(struct trace_seq *s)
 {
-	seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
+	seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
 	s->full = 0;
 	s->readpos = 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dba1328e454b..0be30cccabb4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3753,7 +3753,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
 
 	/* OK if part of the temp seq buffer */
 	if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
-	    (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+	    (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
 		return true;
 
 	/* Core rodata can not be freed */
@@ -6932,8 +6932,8 @@ waitagain:
 		goto out;
 	}
 
-	if (cnt >= PAGE_SIZE)
-		cnt = PAGE_SIZE - 1;
+	if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+		cnt = TRACE_SEQ_BUFFER_SIZE - 1;
 
 	/* reset all but tr, trace, and overruns */
 	trace_iterator_reset(iter);
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229ddf8..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
  * trace_seq_init() more than once to reset the trace_seq to start
  * from scratch.
  * 
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
  * A write to the buffer will either succeed or fail. That is, unlike
  * sprintf() there will not be a partial write (well it may write into
  * the buffer but it wont update the pointers). This allows users to
-- 
cgit v1.2.3


From 9482341d9bdaf194552673b6c1c81a824e985e7f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 12 Dec 2023 19:04:22 -0500
Subject: tracing: Have trace_marker break up by lines by size of trace_seq

If a trace_marker write is bigger than what trace_seq can hold, then it
will print "LINE TOO BIG" message and not what was written.

Instead, check if the write is bigger than the trace_seq and break it
up by that size.

Ideally, we could make the trace_seq dynamic that could hold this. But
that's for another time.

Link: https://lore.kernel.org/linux-trace-kernel/20231212190422.1eaf224f@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0be30cccabb4..9cf58383d2fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7304,6 +7304,11 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	if (cnt < FAULTED_SIZE)
 		size += FAULTED_SIZE - cnt;
 
+	if (size > TRACE_SEQ_BUFFER_SIZE) {
+		cnt -= size - TRACE_SEQ_BUFFER_SIZE;
+		goto again;
+	}
+
 	buffer = tr->array_buffer.buffer;
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
 					    tracing_gen_ctx());
-- 
cgit v1.2.3


From 76ca20c748684f63bb985166850049f242797f65 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 13 Dec 2023 10:42:18 -0500
Subject: tracing: Increase size of trace_marker_raw to max ring buffer entry

There's no reason to give an arbitrary limit to the size of a raw trace
marker. Just let it be as big as the size that is allowed by the ring
buffer itself.

And there's also no reason to artificially break up the write to
TRACE_BUF_SIZE, as that's not even used.

Link: https://lore.kernel.org/linux-trace-kernel/20231213104218.2efc70c1@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9cf58383d2fb..55dabee4c78b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7365,9 +7365,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	return written;
 }
 
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
 static ssize_t
 tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 					size_t cnt, loff_t *fpos)
@@ -7389,19 +7386,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 		return -EINVAL;
 
 	/* The marker must at least have a tag id */
-	if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+	if (cnt < sizeof(unsigned int))
 		return -EINVAL;
 
-	if (cnt > TRACE_BUF_SIZE)
-		cnt = TRACE_BUF_SIZE;
-
-	BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
 	size = sizeof(*entry) + cnt;
 	if (cnt < FAULT_SIZE_ID)
 		size += FAULT_SIZE_ID - cnt;
 
 	buffer = tr->array_buffer.buffer;
+
+	if (size > ring_buffer_max_event_size(buffer))
+		return -EINVAL;
+
 	event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
 					    tracing_gen_ctx());
 	if (!event)
-- 
cgit v1.2.3


From c84897c0ff5925090af0c6a8bf61424b71481764 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 07:43:03 -0500
Subject: ring-buffer: Remove 32bit timestamp logic

Each event has a 27 bit timestamp delta that is used to hold the delta
from the last event. If the time between events is greater than 2^27, then
a timestamp is added that holds a 59 bit absolute timestamp.

Until a389d86f7fd09 ("ring-buffer: Have nested events still record running
time stamp"), if an interrupt interrupted an event in progress, all the
events delta would be zero to not deal with the races that need to be
handled. The commit a389d86f7fd09 changed that to handle the races giving
all events, even those that preempt other events, still have an accurate
timestamp.

To handle those races requires performing 64-bit cmpxchg on the
timestamps. But doing 64-bit cmpxchg on 32-bit architectures is considered
very slow. To try to deal with this the timestamp logic was broken into
two and then three 32-bit cmpxchgs, with the thought that two (or three)
32-bit cmpxchgs are still faster than a single 64-bit cmpxchg on 32-bit
architectures.

Part of the problem with this is that I didn't have any 32-bit
architectures to test on. After hitting several subtle bugs in this code,
an effort was made to try and see if three 32-bit cmpxchgs are indeed
faster than a single 64-bit. After a few people brushed off the dust of
their old 32-bit machines, tests were done, and even though 32-bit cmpxchg
was faster than a single 64-bit, it was in the order of 50% at best, not
300%.

After some more refactoring of the code, all 4 64-bit cmpxchg were removed:

 https://lore.kernel.org/linux-trace-kernel/20231211114420.36dde01b@gandalf.local.home
 https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf.local.home
 https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home
 https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home/

With all the 64-bit cmpxchg removed, the complex 32-bit workaround can also be
removed.

The 32-bit and 64-bit logic is now exactly the same.

Link: https://lore.kernel.org/all/20231213214632.15047c40@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20231219074303.28f9abda@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 176 +++------------------------------------------
 1 file changed, 9 insertions(+), 167 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 16b640d824f9..a4ada4f303c4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 
+#include <asm/local64.h>
 #include <asm/local.h>
 
 /*
@@ -463,27 +464,9 @@ enum {
 	RB_CTX_MAX
 };
 
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
-	local_t		cnt;
-	local_t		top;
-	local_t		bottom;
-	local_t		msb;
-};
-#else
-#include <asm/local64.h>
 struct rb_time_struct {
 	local64_t	time;
 };
-#endif
 typedef struct rb_time_struct rb_time_t;
 
 #define MAX_NEST	5
@@ -573,147 +556,14 @@ struct ring_buffer_iter {
 	int				missed_events;
 };
 
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- *  - Reads may fail if it interrupted a modification of the time stamp.
- *      It will succeed if it did not interrupt another write even if
- *      the read itself is interrupted by a write.
- *      It returns whether it was successful or not.
- *
- *  - Writes always succeed and will overwrite other writes and writes
- *      that were done by events interrupting the current write.
- *
- *  - A write followed by a read of the same time stamp will always succeed,
- *      but may not contain the same value.
- *
- *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
- *      Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- *  (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- *  top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT	30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT	 60
-
-static inline int rb_time_cnt(unsigned long val)
-{
-	return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
-	u64 val;
-
-	val = top & RB_TIME_VAL_MASK;
-	val <<= RB_TIME_SHIFT;
-	val |= bottom & RB_TIME_VAL_MASK;
-
-	return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
-	unsigned long top, bottom, msb;
-	unsigned long c;
-
-	/*
-	 * If the read is interrupted by a write, then the cnt will
-	 * be different. Loop until both top and bottom have been read
-	 * without interruption.
-	 */
-	do {
-		c = local_read(&t->cnt);
-		top = local_read(&t->top);
-		bottom = local_read(&t->bottom);
-		msb = local_read(&t->msb);
-	} while (c != local_read(&t->cnt));
-
-	*cnt = rb_time_cnt(top);
-
-	/* If top, msb or bottom counts don't match, this interrupted a write */
-	if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
-		return false;
-
-	/* The shift to msb will lose its cnt bits */
-	*ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
-	return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
-	unsigned long cnt;
-
-	return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
-{
-	return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
-
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
-				 unsigned long *msb)
-{
-	*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
-	*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
-	*msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
-
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
-	val = rb_time_val_cnt(val, cnt);
-	local_set(t, val);
-}
-
-static void rb_time_set(rb_time_t *t, u64 val)
-{
-	unsigned long cnt, top, bottom, msb;
-
-	rb_time_split(val, &top, &bottom, &msb);
-
-	/* Writes always succeed with a valid number even if it gets interrupted. */
-	do {
-		cnt = local_inc_return(&t->cnt);
-		rb_time_val_set(&t->top, top, cnt);
-		rb_time_val_set(&t->bottom, bottom, cnt);
-		rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
-	} while (cnt != local_read(&t->cnt));
-}
-
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
-	return local_try_cmpxchg(l, &expect, set);
-}
-
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
 {
 	*ret = local64_read(&t->time);
-	return true;
 }
 static void rb_time_set(rb_time_t *t, u64 val)
 {
 	local64_set(&t->time, val);
 }
-#endif
 
 /*
  * Enable this to make sure that the event passed to
@@ -820,10 +670,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
 	WARN_ONCE(1, "nest (%d) greater than max", nest);
 
  fail:
-	/* Can only fail on 32 bit */
-	if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
-		/* Screw it, just read the current time */
-		ts = rb_time_stamp(cpu_buffer->buffer);
+	rb_time_read(&cpu_buffer->write_stamp, &ts);
 
 	return ts;
 }
@@ -2820,7 +2667,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  (unsigned long long)info->ts,
 		  (unsigned long long)info->before,
 		  (unsigned long long)info->after,
-		  (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+		  (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
 		  sched_clock_stable() ? "" :
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
@@ -3497,16 +3344,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer_event *event;
 	struct buffer_page *tail_page;
 	unsigned long tail, write, w;
-	bool a_ok;
-	bool b_ok;
 
 	/* Don't let the compiler play games with cpu_buffer->tail_page */
 	tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
 
  /*A*/	w = local_read(&tail_page->write) & RB_WRITE_MASK;
 	barrier();
-	b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-	a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+	rb_time_read(&cpu_buffer->before_stamp, &info->before);
+	rb_time_read(&cpu_buffer->write_stamp, &info->after);
 	barrier();
 	info->ts = rb_time_stamp(cpu_buffer->buffer);
 
@@ -3521,7 +3366,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		if (!w) {
 			/* Use the sub-buffer timestamp */
 			info->delta = 0;
-		} else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
+		} else if (unlikely(info->before != info->after)) {
 			info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
 			info->length += RB_LEN_TIME_EXTEND;
 		} else {
@@ -3570,8 +3415,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		/* SLOW PATH - Interrupted between A and C */
 
 		/* Save the old before_stamp */
-		a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
-		RB_WARN_ON(cpu_buffer, !a_ok);
+		rb_time_read(&cpu_buffer->before_stamp, &info->before);
 
 		/*
 		 * Read a new timestamp and update the before_stamp to make
@@ -3583,9 +3427,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		rb_time_set(&cpu_buffer->before_stamp, ts);
 
 		barrier();
- /*E*/		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
-		/* Was interrupted before here, write_stamp must be valid */
-		RB_WARN_ON(cpu_buffer, !a_ok);
+ /*E*/		rb_time_read(&cpu_buffer->write_stamp, &info->after);
 		barrier();
  /*F*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
 		    info->after == info->before && info->after < ts) {
-- 
cgit v1.2.3


From d40dbb617ae9a8fe57343b26b4ad2bd7bb05c130 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 07:45:42 -0500
Subject: ring-buffer: Add interrupt information to dump of data sub-buffer

When the ring buffer timestamp verifier triggers, it dumps the content of
the sub-buffer. But currently it only dumps the timestamps and the offset
of the data as well as the deltas. It would be even more informative if
the event data also showed the interrupt context level it was in.

That is, if each event showed that the event was written in normal,
softirq, irq or NMI context. Then a better idea about how the events may
have been interrupted from each other.

As the payload of the ring buffer is really a black box of the ring
buffer, just assume that if the payload is larger than a trace entry, that
it is a trace entry. As trace entries have the interrupt context
information saved in a flags field, look at that location and report the
output of the flags.

If the payload is not a trace entry, there's no way to really know, and
the information will be garbage. But that's OK, because this is for
debugging only (this output is not used in production as the buffer check
that calls it causes a huge overhead to the tracing). This information,
when available, is crucial for debugging timestamp issues. If it's
garbage, it will also be pretty obvious that its garbage too.

As this output usually happens in kselftests of the tracing code, the user
will know what the payload is at the time.

Link: https://lore.kernel.org/linux-trace-kernel/20231219074542.6f304601@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Suggested-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 79 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a4ada4f303c4..c0cc45482e1e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3185,6 +3185,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 #define CHECK_FULL_PAGE		1L
 
 #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
+{
+	const char *type[] = {
+		".",	// 0
+		"s",	// 1
+		"h",	// 2
+		"Hs",	// 3
+		"n",	// 4
+		"Ns",	// 5
+		"Nh",	// 6
+		"NHs",	// 7
+	};
+
+	return type[bits];
+}
+
+/* Assume this is an trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+	struct trace_entry *entry;
+	int bits = 0;
+
+	if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+		return "X";
+
+	entry = ring_buffer_event_data(event);
+
+	if (entry->flags & TRACE_FLAG_SOFTIRQ)
+		bits |= 1;
+
+	if (entry->flags & TRACE_FLAG_HARDIRQ)
+		bits |= 2;
+
+	if (entry->flags & TRACE_FLAG_NMI)
+		bits |= 4;
+
+	return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+	struct trace_entry *entry;
+
+	if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+		return "";
+
+	entry = ring_buffer_event_data(event);
+	if (entry->flags & TRACE_FLAG_IRQS_OFF)
+		return "d";
+	return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+	unsigned long pc = preempt_count();
+	unsigned char level = 0;
+
+	if (pc & SOFTIRQ_OFFSET)
+		level |= 1;
+
+	if (pc & HARDIRQ_MASK)
+		level |= 2;
+
+	if (pc & NMI_MASK)
+		level |= 4;
+
+	return show_irq_str(level);
+}
+
 static void dump_buffer_page(struct buffer_data_page *bpage,
 			     struct rb_event_info *info,
 			     unsigned long tail)
@@ -3224,8 +3294,9 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
 
 		case RINGBUF_TYPE_DATA:
 			ts += event->time_delta;
-			pr_warn(" 0x%x:  [%lld] delta:%d\n",
-				e, ts, event->time_delta);
+			pr_warn(" 0x%x:  [%lld] delta:%d %s%s\n",
+				e, ts, event->time_delta,
+				show_flags(event), show_irq(event));
 			break;
 
 		default:
@@ -3316,11 +3387,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 		atomic_inc(&cpu_buffer->record_disabled);
 		/* There's some cases in boot up that this can happen */
 		WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
-		pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
+		pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
 			cpu_buffer->cpu,
 			ts + info->delta, info->ts, info->delta,
 			info->before, info->after,
-			full ? " (full)" : "");
+			full ? " (full)" : "", show_interrupt_level());
 		dump_buffer_page(bpage, info, tail);
 		atomic_dec(&ts_dump);
 		/* Do not re-enable checking */
-- 
cgit v1.2.3


From f50345b49b16f7fe6dbebbec065b9248c38556ff Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 07:47:32 -0500
Subject: ring-buffer: Check if absolute timestamp goes backwards

The check_buffer() which checks the timestamps of the ring buffer
sub-buffer page, when enabled, only checks if the adding of deltas of the
events from the last absolute timestamp or the timestamp of the sub-buffer
page adds up to the current event.

What it does not check is if the absolute timestamp causes the time of the
events to go backwards, as that can cause issues elsewhere.

Test for the timestamp going backwards too.

This also fixes a slight issue where if the warning triggers at boot up
(because of the resetting of the tsc), it will disable all further checks,
even those that are after boot Have it continue checking if the warning
was ignored during boot up.

Link: https://lore.kernel.org/linux-trace-kernel/20231219074732.18b092d4@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c0cc45482e1e..f7dc74e45ebf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3309,6 +3309,23 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
 static DEFINE_PER_CPU(atomic_t, checking);
 static atomic_t ts_dump;
 
+#define buffer_warn_return(fmt, ...)					\
+	do {								\
+		/* If another report is happening, ignore this one */	\
+		if (atomic_inc_return(&ts_dump) != 1) {			\
+			atomic_dec(&ts_dump);				\
+			goto out;					\
+		}							\
+		atomic_inc(&cpu_buffer->record_disabled);		\
+		pr_warn(fmt, ##__VA_ARGS__);				\
+		dump_buffer_page(bpage, info, tail);			\
+		atomic_dec(&ts_dump);					\
+		/* There's some cases in boot up that this can happen */ \
+		if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING))	\
+			/* Do not re-enable checking */			\
+			return;						\
+	} while (0)
+
 /*
  * Check if the current event time stamp matches the deltas on
  * the buffer page.
@@ -3362,7 +3379,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 
 		case RINGBUF_TYPE_TIME_STAMP:
 			delta = rb_event_time_stamp(event);
-			ts = rb_fix_abs_ts(delta, ts);
+			delta = rb_fix_abs_ts(delta, ts);
+			if (delta < ts) {
+				buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+						   cpu_buffer->cpu, ts, delta);
+			}
+			ts = delta;
 			break;
 
 		case RINGBUF_TYPE_PADDING:
@@ -3379,23 +3401,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 	if ((full && ts > info->ts) ||
 	    (!full && ts + info->delta != info->ts)) {
-		/* If another report is happening, ignore this one */
-		if (atomic_inc_return(&ts_dump) != 1) {
-			atomic_dec(&ts_dump);
-			goto out;
-		}
-		atomic_inc(&cpu_buffer->record_disabled);
-		/* There's some cases in boot up that this can happen */
-		WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
-		pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
-			cpu_buffer->cpu,
-			ts + info->delta, info->ts, info->delta,
-			info->before, info->after,
-			full ? " (full)" : "", show_interrupt_level());
-		dump_buffer_page(bpage, info, tail);
-		atomic_dec(&ts_dump);
-		/* Do not re-enable checking */
-		return;
+		buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+				   cpu_buffer->cpu,
+				   ts + info->delta, info->ts, info->delta,
+				   info->before, info->after,
+				   full ? " (full)" : "", show_interrupt_level());
 	}
 out:
 	atomic_dec(this_cpu_ptr(&checking));
-- 
cgit v1.2.3


From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 19 Dec 2023 07:37:35 -0800
Subject: Revert BPF token-related functionality

This patch includes the following revert (one  conflicting BPF FS
patch and three token patch sets, represented by merge commits):
  - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'";
  - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs";
  - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'";
  - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'".

Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 drivers/media/rc/bpf-lirc.c                        |    2 +-
 include/linux/bpf.h                                |   85 +-
 include/linux/filter.h                             |    2 +-
 include/linux/lsm_hook_defs.h                      |   15 +-
 include/linux/security.h                           |   43 +-
 include/uapi/linux/bpf.h                           |   42 -
 kernel/bpf/Makefile                                |    2 +-
 kernel/bpf/arraymap.c                              |    2 +-
 kernel/bpf/bpf_lsm.c                               |   15 +-
 kernel/bpf/cgroup.c                                |    6 +-
 kernel/bpf/core.c                                  |    3 +-
 kernel/bpf/helpers.c                               |    6 +-
 kernel/bpf/inode.c                                 |  326 +------
 kernel/bpf/syscall.c                               |  215 ++--
 kernel/bpf/token.c                                 |  271 -----
 kernel/bpf/verifier.c                              |   13 +-
 kernel/trace/bpf_trace.c                           |    2 +-
 net/core/filter.c                                  |   36 +-
 net/ipv4/bpf_tcp_ca.c                              |    2 +-
 net/netfilter/nf_bpf_link.c                        |    2 +-
 security/security.c                                |  101 +-
 security/selinux/hooks.c                           |   47 +-
 tools/include/uapi/linux/bpf.h                     |   42 -
 tools/lib/bpf/Build                                |    2 +-
 tools/lib/bpf/bpf.c                                |   37 +-
 tools/lib/bpf/bpf.h                                |   35 +-
 tools/lib/bpf/btf.c                                |    7 +-
 tools/lib/bpf/elf.c                                |    2 +
 tools/lib/bpf/features.c                           |  478 ---------
 tools/lib/bpf/libbpf.c                             |  573 ++++++++---
 tools/lib/bpf/libbpf.h                             |   37 +-
 tools/lib/bpf/libbpf.map                           |    1 -
 tools/lib/bpf/libbpf_internal.h                    |   36 +-
 tools/lib/bpf/libbpf_probes.c                      |    8 +-
 tools/lib/bpf/str_error.h                          |    3 -
 .../selftests/bpf/prog_tests/libbpf_probes.c       |    4 -
 .../testing/selftests/bpf/prog_tests/libbpf_str.c  |    6 -
 tools/testing/selftests/bpf/prog_tests/token.c     | 1031 --------------------
 tools/testing/selftests/bpf/progs/priv_map.c       |   13 -
 tools/testing/selftests/bpf/progs/priv_prog.c      |   13 -
 40 files changed, 641 insertions(+), 2925 deletions(-)
 delete mode 100644 kernel/bpf/token.c
 delete mode 100644 tools/lib/bpf/features.c
 delete mode 100644 tools/testing/selftests/bpf/prog_tests/token.c
 delete mode 100644 tools/testing/selftests/bpf/progs/priv_map.c
 delete mode 100644 tools/testing/selftests/bpf/progs/priv_prog.c

(limited to 'kernel')

diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 6d07693c6b9f..fe17c7f98e81 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_trace_printk:
-		if (bpf_token_capable(prog->aux->token, CAP_PERFMON))
+		if (perfmon_capable())
 			return bpf_get_trace_printk_proto();
 		fallthrough;
 	default:
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2f54cc0436c4..7a8d4c81a39a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -52,10 +52,6 @@ struct module;
 struct bpf_func_state;
 struct ftrace_ops;
 struct cgroup;
-struct bpf_token;
-struct user_namespace;
-struct super_block;
-struct inode;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -1488,7 +1484,6 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
-	struct bpf_token *token;
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
@@ -1613,31 +1608,6 @@ struct bpf_link_primer {
 	u32 id;
 };
 
-struct bpf_mount_opts {
-	kuid_t uid;
-	kgid_t gid;
-	umode_t mode;
-
-	/* BPF token-related delegation options */
-	u64 delegate_cmds;
-	u64 delegate_maps;
-	u64 delegate_progs;
-	u64 delegate_attachs;
-};
-
-struct bpf_token {
-	struct work_struct work;
-	atomic64_t refcnt;
-	struct user_namespace *userns;
-	u64 allowed_cmds;
-	u64 allowed_maps;
-	u64 allowed_progs;
-	u64 allowed_attachs;
-#ifdef CONFIG_SECURITY
-	void *security;
-#endif
-};
-
 struct bpf_struct_ops_value;
 struct btf_member;
 
@@ -2097,7 +2067,6 @@ static inline void bpf_enable_instrumentation(void)
 	migrate_enable();
 }
 
-extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
@@ -2232,26 +2201,24 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map)
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-bool bpf_token_capable(const struct bpf_token *token, int cap);
-
-static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token)
+static inline bool bpf_allow_ptr_leaks(void)
 {
-	return bpf_token_capable(token, CAP_PERFMON);
+	return perfmon_capable();
 }
 
-static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
+static inline bool bpf_allow_uninit_stack(void)
 {
-	return bpf_token_capable(token, CAP_PERFMON);
+	return perfmon_capable();
 }
 
-static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
+static inline bool bpf_bypass_spec_v1(void)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
-static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
+static inline bool bpf_bypass_spec_v4(void)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return cpu_mitigations_off() || perfmon_capable();
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
@@ -2268,21 +2235,8 @@ int bpf_link_new_fd(struct bpf_link *link);
 struct bpf_link *bpf_link_get_from_fd(u32 ufd);
 struct bpf_link *bpf_link_get_curr_or_next(u32 *id);
 
-void bpf_token_inc(struct bpf_token *token);
-void bpf_token_put(struct bpf_token *token);
-int bpf_token_create(union bpf_attr *attr);
-struct bpf_token *bpf_token_get_from_fd(u32 ufd);
-
-bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
-bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
-bool bpf_token_allow_prog_type(const struct bpf_token *token,
-			       enum bpf_prog_type prog_type,
-			       enum bpf_attach_type attach_type);
-
 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname);
 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags);
-struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir,
-			    umode_t mode);
 
 #define BPF_ITER_FUNC_PREFIX "bpf_iter_"
 #define DEFINE_BPF_ITER_FUNC(target, args...)			\
@@ -2526,8 +2480,7 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type
 struct bpf_prog *bpf_prog_by_id(u32 id);
 struct bpf_link *bpf_link_by_id(u32 id);
 
-const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id,
-						 const struct bpf_prog *prog);
+const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 void bpf_task_storage_free(struct task_struct *task);
 void bpf_cgrp_storage_free(struct cgroup *cgroup);
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
@@ -2646,24 +2599,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 	return -EOPNOTSUPP;
 }
 
-static inline bool bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
-}
-
-static inline void bpf_token_inc(struct bpf_token *token)
-{
-}
-
-static inline void bpf_token_put(struct bpf_token *token)
-{
-}
-
-static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline void __dev_flush(void)
 {
 }
@@ -2787,7 +2722,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log,
 }
 
 static inline const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	return NULL;
 }
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 12d907f17d36..68fb6c8142fe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF))
+	if (bpf_jit_harden == 1 && bpf_capable())
 		return false;
 
 	return true;
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 3fdd00b452ac..ff217a5ce552 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -398,17 +398,10 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
 LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
 LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
 LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
-	 struct bpf_token *token)
-LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
-LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
-	 struct bpf_token *token)
-LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
-LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
-	 struct path *path)
-LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
-LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
-LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
+LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
+LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
+LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
+LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux)
 #endif /* CONFIG_BPF_SYSCALL */
 
 LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)
diff --git a/include/linux/security.h b/include/linux/security.h
index 00809d2d5c38..1d1df326c881 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,7 +32,6 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/sockptr.h>
-#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -2021,22 +2020,15 @@ static inline void securityfs_remove(struct dentry *dentry)
 union bpf_attr;
 struct bpf_map;
 struct bpf_prog;
-struct bpf_token;
+struct bpf_prog_aux;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
 extern int security_bpf_prog(struct bpf_prog *prog);
-extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-				   struct bpf_token *token);
+extern int security_bpf_map_alloc(struct bpf_map *map);
 extern void security_bpf_map_free(struct bpf_map *map);
-extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-				  struct bpf_token *token);
-extern void security_bpf_prog_free(struct bpf_prog *prog);
-extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path);
-extern void security_bpf_token_free(struct bpf_token *token);
-extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
-extern int security_bpf_token_capable(const struct bpf_token *token, int cap);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
 #else
 static inline int security_bpf(int cmd, union bpf_attr *attr,
 					     unsigned int size)
@@ -2054,8 +2046,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog)
 	return 0;
 }
 
-static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-					  struct bpf_token *token)
+static inline int security_bpf_map_alloc(struct bpf_map *map)
 {
 	return 0;
 }
@@ -2063,33 +2054,13 @@ static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *a
 static inline void security_bpf_map_free(struct bpf_map *map)
 { }
 
-static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-					 struct bpf_token *token)
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
 	return 0;
 }
 
-static inline void security_bpf_prog_free(struct bpf_prog *prog)
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
 { }
-
-static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				     struct path *path)
-{
-	return 0;
-}
-
-static inline void security_bpf_token_free(struct bpf_token *token)
-{ }
-
-static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
-{
-	return 0;
-}
-
-static inline int security_bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	return 0;
-}
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42f4d3090efe..754e68ca8744 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -847,36 +847,6 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
- * BPF_TOKEN_CREATE
- *	Description
- *		Create BPF token with embedded information about what
- *		BPF-related functionality it allows:
- *		- a set of allowed bpf() syscall commands;
- *		- a set of allowed BPF map types to be created with
- *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
- *		- a set of allowed BPF program types and BPF program attach
- *		types to be loaded with BPF_PROG_LOAD command, if
- *		BPF_PROG_LOAD itself is allowed.
- *
- *		BPF token is created (derived) from an instance of BPF FS,
- *		assuming it has necessary delegation mount options specified.
- *		This BPF token can be passed as an extra parameter to various
- *		bpf() syscall commands to grant BPF subsystem functionality to
- *		unprivileged processes.
- *
- *		When created, BPF token is "associated" with the owning
- *		user namespace of BPF FS instance (super block) that it was
- *		derived from, and subsequent BPF operations performed with
- *		BPF token would be performing capabilities checks (i.e.,
- *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
- *		that user namespace. Without BPF token, such capabilities
- *		have to be granted in init user namespace, making bpf()
- *		syscall incompatible with user namespace, for the most part.
- *
- *	Return
- *		A new file descriptor (a nonnegative integer), or -1 if an
- *		error occurred (in which case, *errno* is set appropriately).
- *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -931,8 +901,6 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
-	BPF_TOKEN_CREATE,
-	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -983,7 +951,6 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
-	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1028,7 +995,6 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
-	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1437,7 +1403,6 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
-		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -1507,7 +1472,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
-		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1620,7 +1584,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
-		__u32		btf_token_fd;
 	};
 
 	struct {
@@ -1751,11 +1714,6 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
-	struct { /* struct used by BPF_TOKEN_CREATE command */
-		__u32		flags;
-		__u32		bpffs_fd;
-	} token_create;
-
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4ce95acfcaa7..f526b7573e97 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 13358675ff2e..0bdbbbeab155 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -82,7 +82,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL);
+	bool bypass_spec_v1 = bpf_bypass_spec_v1();
 	u64 array_size, mask64;
 	struct bpf_array *array;
 
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 63b4dc495125..e8e910395bf6 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -260,15 +260,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 BTF_SET_START(sleepable_lsm_hooks)
 BTF_ID(func, bpf_lsm_bpf)
 BTF_ID(func, bpf_lsm_bpf_map)
-BTF_ID(func, bpf_lsm_bpf_map_create)
-BTF_ID(func, bpf_lsm_bpf_map_free)
+BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
 BTF_ID(func, bpf_lsm_bpf_prog)
-BTF_ID(func, bpf_lsm_bpf_prog_load)
-BTF_ID(func, bpf_lsm_bpf_prog_free)
-BTF_ID(func, bpf_lsm_bpf_token_create)
-BTF_ID(func, bpf_lsm_bpf_token_free)
-BTF_ID(func, bpf_lsm_bpf_token_cmd)
-BTF_ID(func, bpf_lsm_bpf_token_capable)
 BTF_ID(func, bpf_lsm_bprm_check_security)
 BTF_ID(func, bpf_lsm_bprm_committed_creds)
 BTF_ID(func, bpf_lsm_bprm_committing_creds)
@@ -363,8 +357,9 @@ BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
 BTF_SET_START(untrusted_lsm_hooks)
-BTF_ID(func, bpf_lsm_bpf_map_free)
-BTF_ID(func, bpf_lsm_bpf_prog_free)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
 BTF_ID(func, bpf_lsm_file_alloc_security)
 BTF_ID(func, bpf_lsm_file_free_security)
 #ifdef CONFIG_SECURITY_NETWORK
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 98e0e3835b28..491d20038cbe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1630,7 +1630,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2191,7 +2191,7 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -2348,7 +2348,7 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 14ace23d517b..ea6843be2616 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -682,7 +682,7 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !bpf_token_capable(fp->aux->token, CAP_BPF))
+	    !bpf_capable())
 		return;
 
 	bpf_prog_ksym_set_addr(fp);
@@ -2779,7 +2779,6 @@ void bpf_prog_free(struct bpf_prog *fp)
 
 	if (aux->dst_prog)
 		bpf_prog_put(aux->dst_prog);
-	bpf_token_put(aux->token);
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 07fd4b5704f3..be72824f32b2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1679,7 +1679,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
 const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
 
 const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_base_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -1730,7 +1730,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		break;
 	}
 
-	if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+	if (!bpf_capable())
 		return NULL;
 
 	switch (func_id) {
@@ -1788,7 +1788,7 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		break;
 	}
 
-	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+	if (!perfmon_capable())
 		return NULL;
 
 	switch (func_id) {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4383b3d13a55..1aafb2ff2e95 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -20,7 +20,6 @@
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
-#include <linux/kstrtox.h>
 #include "preload/bpf_preload.h"
 
 enum bpf_type {
@@ -99,9 +98,9 @@ static const struct inode_operations bpf_prog_iops = { };
 static const struct inode_operations bpf_map_iops  = { };
 static const struct inode_operations bpf_link_iops  = { };
 
-struct inode *bpf_get_inode(struct super_block *sb,
-			    const struct inode *dir,
-			    umode_t mode)
+static struct inode *bpf_get_inode(struct super_block *sb,
+				   const struct inode *dir,
+				   umode_t mode)
 {
 	struct inode *inode;
 
@@ -595,183 +594,15 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
 }
 EXPORT_SYMBOL(bpf_prog_get_type_path);
 
-struct bpffs_btf_enums {
-	const struct btf *btf;
-	const struct btf_type *cmd_t;
-	const struct btf_type *map_t;
-	const struct btf_type *prog_t;
-	const struct btf_type *attach_t;
-};
-
-static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
-{
-	const struct btf *btf;
-	const struct btf_type *t;
-	const char *name;
-	int i, n;
-
-	memset(info, 0, sizeof(*info));
-
-	btf = bpf_get_btf_vmlinux();
-	if (IS_ERR(btf))
-		return PTR_ERR(btf);
-	if (!btf)
-		return -ENOENT;
-
-	info->btf = btf;
-
-	for (i = 1, n = btf_nr_types(btf); i < n; i++) {
-		t = btf_type_by_id(btf, i);
-		if (!btf_type_is_enum(t))
-			continue;
-
-		name = btf_name_by_offset(btf, t->name_off);
-		if (!name)
-			continue;
-
-		if (strcmp(name, "bpf_cmd") == 0)
-			info->cmd_t = t;
-		else if (strcmp(name, "bpf_map_type") == 0)
-			info->map_t = t;
-		else if (strcmp(name, "bpf_prog_type") == 0)
-			info->prog_t = t;
-		else if (strcmp(name, "bpf_attach_type") == 0)
-			info->attach_t = t;
-		else
-			continue;
-
-		if (info->cmd_t && info->map_t && info->prog_t && info->attach_t)
-			return 0;
-	}
-
-	return -ESRCH;
-}
-
-static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
-				const char *prefix, const char *str, int *value)
-{
-	const struct btf_enum *e;
-	const char *name;
-	int i, n, pfx_len = strlen(prefix);
-
-	*value = 0;
-
-	if (!btf || !enum_t)
-		return false;
-
-	for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
-		e = &btf_enum(enum_t)[i];
-
-		name = btf_name_by_offset(btf, e->name_off);
-		if (!name || strncasecmp(name, prefix, pfx_len) != 0)
-			continue;
-
-		/* match symbolic name case insensitive and ignoring prefix */
-		if (strcasecmp(name + pfx_len, str) == 0) {
-			*value = e->val;
-			return true;
-		}
-	}
-
-	return false;
-}
-
-static void seq_print_delegate_opts(struct seq_file *m,
-				    const char *opt_name,
-				    const struct btf *btf,
-				    const struct btf_type *enum_t,
-				    const char *prefix,
-				    u64 delegate_msk, u64 any_msk)
-{
-	const struct btf_enum *e;
-	bool first = true;
-	const char *name;
-	u64 msk;
-	int i, n, pfx_len = strlen(prefix);
-
-	delegate_msk &= any_msk; /* clear unknown bits */
-
-	if (delegate_msk == 0)
-		return;
-
-	seq_printf(m, ",%s", opt_name);
-	if (delegate_msk == any_msk) {
-		seq_printf(m, "=any");
-		return;
-	}
-
-	if (btf && enum_t) {
-		for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
-			e = &btf_enum(enum_t)[i];
-			name = btf_name_by_offset(btf, e->name_off);
-			if (!name || strncasecmp(name, prefix, pfx_len) != 0)
-				continue;
-			msk = 1ULL << e->val;
-			if (delegate_msk & msk) {
-				/* emit lower-case name without prefix */
-				seq_printf(m, "%c", first ? '=' : ':');
-				name += pfx_len;
-				while (*name) {
-					seq_printf(m, "%c", tolower(*name));
-					name++;
-				}
-
-				delegate_msk &= ~msk;
-				first = false;
-			}
-		}
-	}
-	if (delegate_msk)
-		seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
-}
-
 /*
  * Display the mount options in /proc/mounts.
  */
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
-	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
-	struct inode *inode = d_inode(root);
-	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
-	u64 mask;
-
-	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
-		seq_printf(m, ",uid=%u",
-			   from_kuid_munged(&init_user_ns, inode->i_uid));
-	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
-		seq_printf(m, ",gid=%u",
-			   from_kgid_munged(&init_user_ns, inode->i_gid));
+	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
-
-	if (opts->delegate_cmds || opts->delegate_maps ||
-	    opts->delegate_progs || opts->delegate_attachs) {
-		struct bpffs_btf_enums info;
-
-		/* ignore errors, fallback to hex */
-		(void)find_bpffs_btf_enums(&info);
-
-		mask = (1ULL << __MAX_BPF_CMD) - 1;
-		seq_print_delegate_opts(m, "delegate_cmds",
-					info.btf, info.cmd_t, "BPF_",
-					opts->delegate_cmds, mask);
-
-		mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
-		seq_print_delegate_opts(m, "delegate_maps",
-					info.btf, info.map_t, "BPF_MAP_TYPE_",
-					opts->delegate_maps, mask);
-
-		mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
-		seq_print_delegate_opts(m, "delegate_progs",
-					info.btf, info.prog_t, "BPF_PROG_TYPE_",
-					opts->delegate_progs, mask);
-
-		mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
-		seq_print_delegate_opts(m, "delegate_attachs",
-					info.btf, info.attach_t, "BPF_",
-					opts->delegate_attachs, mask);
-	}
-
 	return 0;
 }
 
@@ -786,7 +617,7 @@ static void bpf_free_inode(struct inode *inode)
 	free_inode_nonrcu(inode);
 }
 
-const struct super_operations bpf_super_ops = {
+static const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.show_options	= bpf_show_options,
@@ -794,33 +625,23 @@ const struct super_operations bpf_super_ops = {
 };
 
 enum {
-	OPT_UID,
-	OPT_GID,
 	OPT_MODE,
-	OPT_DELEGATE_CMDS,
-	OPT_DELEGATE_MAPS,
-	OPT_DELEGATE_PROGS,
-	OPT_DELEGATE_ATTACHS,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
-	fsparam_u32	("uid",				OPT_UID),
-	fsparam_u32	("gid",				OPT_GID),
 	fsparam_u32oct	("mode",			OPT_MODE),
-	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
-	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
-	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
-	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
 	{}
 };
 
+struct bpf_mount_opts {
+	umode_t mode;
+};
+
 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	struct bpf_mount_opts *opts = fc->s_fs_info;
+	struct bpf_mount_opts *opts = fc->fs_private;
 	struct fs_parse_result result;
-	kuid_t uid;
-	kgid_t gid;
-	int opt, err;
+	int opt;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
 	if (opt < 0) {
@@ -841,104 +662,12 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	switch (opt) {
-	case OPT_UID:
-		uid = make_kuid(current_user_ns(), result.uint_32);
-		if (!uid_valid(uid))
-			goto bad_value;
-
-		/*
-		 * The requested uid must be representable in the
-		 * filesystem's idmapping.
-		 */
-		if (!kuid_has_mapping(fc->user_ns, uid))
-			goto bad_value;
-
-		opts->uid = uid;
-		break;
-	case OPT_GID:
-		gid = make_kgid(current_user_ns(), result.uint_32);
-		if (!gid_valid(gid))
-			goto bad_value;
-
-		/*
-		 * The requested gid must be representable in the
-		 * filesystem's idmapping.
-		 */
-		if (!kgid_has_mapping(fc->user_ns, gid))
-			goto bad_value;
-
-		opts->gid = gid;
-		break;
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
-	case OPT_DELEGATE_CMDS:
-	case OPT_DELEGATE_MAPS:
-	case OPT_DELEGATE_PROGS:
-	case OPT_DELEGATE_ATTACHS: {
-		struct bpffs_btf_enums info;
-		const struct btf_type *enum_t;
-		const char *enum_pfx;
-		u64 *delegate_msk, msk = 0;
-		char *p;
-		int val;
-
-		/* ignore errors, fallback to hex */
-		(void)find_bpffs_btf_enums(&info);
-
-		switch (opt) {
-		case OPT_DELEGATE_CMDS:
-			delegate_msk = &opts->delegate_cmds;
-			enum_t = info.cmd_t;
-			enum_pfx = "BPF_";
-			break;
-		case OPT_DELEGATE_MAPS:
-			delegate_msk = &opts->delegate_maps;
-			enum_t = info.map_t;
-			enum_pfx = "BPF_MAP_TYPE_";
-			break;
-		case OPT_DELEGATE_PROGS:
-			delegate_msk = &opts->delegate_progs;
-			enum_t = info.prog_t;
-			enum_pfx = "BPF_PROG_TYPE_";
-			break;
-		case OPT_DELEGATE_ATTACHS:
-			delegate_msk = &opts->delegate_attachs;
-			enum_t = info.attach_t;
-			enum_pfx = "BPF_";
-			break;
-		default:
-			return -EINVAL;
-		}
-
-		while ((p = strsep(&param->string, ":"))) {
-			if (strcmp(p, "any") == 0) {
-				msk |= ~0ULL;
-			} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
-				msk |= 1ULL << val;
-			} else {
-				err = kstrtou64(p, 0, &msk);
-				if (err)
-					return err;
-			}
-		}
-
-		/* Setting delegation mount options requires privileges */
-		if (msk && !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		*delegate_msk |= msk;
-		break;
-	}
-	default:
-		/* ignore unknown mount options */
-		break;
 	}
 
 	return 0;
-
-bad_value:
-	return invalfc(fc, "Bad value for '%s'", param->key);
 }
 
 struct bpf_preload_ops *bpf_preload_ops;
@@ -1010,14 +739,10 @@ out:
 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	static const struct tree_descr bpf_rfiles[] = { { "" } };
-	struct bpf_mount_opts *opts = sb->s_fs_info;
+	struct bpf_mount_opts *opts = fc->fs_private;
 	struct inode *inode;
 	int ret;
 
-	/* Mounting an instance of BPF FS requires privileges */
-	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
 	if (ret)
 		return ret;
@@ -1025,8 +750,6 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &bpf_super_ops;
 
 	inode = sb->s_root->d_inode;
-	inode->i_uid = opts->uid;
-	inode->i_gid = opts->gid;
 	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
 	populate_bpffs(sb->s_root);
@@ -1041,7 +764,7 @@ static int bpf_get_tree(struct fs_context *fc)
 
 static void bpf_free_fc(struct fs_context *fc)
 {
-	kfree(fc->s_fs_info);
+	kfree(fc->fs_private);
 }
 
 static const struct fs_context_operations bpf_context_ops = {
@@ -1062,35 +785,18 @@ static int bpf_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	opts->mode = S_IRWXUGO;
-	opts->uid = current_fsuid();
-	opts->gid = current_fsgid();
-
-	/* start out with no BPF token delegation enabled */
-	opts->delegate_cmds = 0;
-	opts->delegate_maps = 0;
-	opts->delegate_progs = 0;
-	opts->delegate_attachs = 0;
 
-	fc->s_fs_info = opts;
+	fc->fs_private = opts;
 	fc->ops = &bpf_context_ops;
 	return 0;
 }
 
-static void bpf_kill_super(struct super_block *sb)
-{
-	struct bpf_mount_opts *opts = sb->s_fs_info;
-
-	kill_litter_super(sb);
-	kfree(opts);
-}
-
 static struct file_system_type bpf_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "bpf",
 	.init_fs_context = bpf_init_fs_context,
 	.parameters	= bpf_fs_parameters,
-	.kill_sb	= bpf_kill_super,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.kill_sb	= kill_litter_super,
 };
 
 static int __init bpf_init(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8faa1a20edf8..1bf9805ee185 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1011,8 +1011,8 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
-			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
+			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
 	u32 key_size, value_size;
@@ -1040,7 +1040,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
 
-		if (!bpf_token_capable(token, CAP_BPF)) {
+		if (!bpf_capable()) {
 			ret = -EPERM;
 			goto free_map_tab;
 		}
@@ -1123,17 +1123,11 @@ free_map_tab:
 	return ret;
 }
 
-static bool bpf_net_capable(void)
-{
-	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
-}
-
-#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
+#define BPF_MAP_CREATE_LAST_FIELD map_extra
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
-	struct bpf_token *token = NULL;
 	int numa_node = bpf_map_attr_numa_node(attr);
 	u32 map_type = attr->map_type;
 	struct bpf_map *map;
@@ -1184,32 +1178,14 @@ static int map_create(union bpf_attr *attr)
 	if (!ops->map_mem_usage)
 		return -EINVAL;
 
-	if (attr->map_token_fd) {
-		token = bpf_token_get_from_fd(attr->map_token_fd);
-		if (IS_ERR(token))
-			return PTR_ERR(token);
-
-		/* if current token doesn't grant map creation permissions,
-		 * then we can't use this token, so ignore it and rely on
-		 * system-wide capabilities checks
-		 */
-		if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) ||
-		    !bpf_token_allow_map_type(token, attr->map_type)) {
-			bpf_token_put(token);
-			token = NULL;
-		}
-	}
-
-	err = -EPERM;
-
 	/* Intent here is for unprivileged_bpf_disabled to block BPF map
 	 * creation for unprivileged users; other actions depend
 	 * on fd availability and access to bpffs, so are dependent on
 	 * object creation success. Even with unprivileged BPF disabled,
 	 * capability checks are still carried out.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
-		goto put_token;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+		return -EPERM;
 
 	/* check privileged map type permissions */
 	switch (map_type) {
@@ -1242,27 +1218,25 @@ static int map_create(union bpf_attr *attr)
 	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
 	case BPF_MAP_TYPE_STRUCT_OPS:
 	case BPF_MAP_TYPE_CPUMAP:
-		if (!bpf_token_capable(token, CAP_BPF))
-			goto put_token;
+		if (!bpf_capable())
+			return -EPERM;
 		break;
 	case BPF_MAP_TYPE_SOCKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
 	case BPF_MAP_TYPE_XSKMAP:
-		if (!bpf_token_capable(token, CAP_NET_ADMIN))
-			goto put_token;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 		break;
 	default:
 		WARN(1, "unsupported map type %d", map_type);
-		goto put_token;
+		return -EPERM;
 	}
 
 	map = ops->map_alloc(attr);
-	if (IS_ERR(map)) {
-		err = PTR_ERR(map);
-		goto put_token;
-	}
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	map->ops = ops;
 	map->map_type = map_type;
 
@@ -1299,7 +1273,7 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 
 		if (attr->btf_value_type_id) {
-			err = map_check_btf(map, token, btf, attr->btf_key_type_id,
+			err = map_check_btf(map, btf, attr->btf_key_type_id,
 					    attr->btf_value_type_id);
 			if (err)
 				goto free_map;
@@ -1311,16 +1285,15 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-	err = security_bpf_map_create(map, attr, token);
+	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_sec;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
 		goto free_map_sec;
 
 	bpf_map_save_memcg(map);
-	bpf_token_put(token);
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -1341,8 +1314,6 @@ free_map_sec:
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
-put_token:
-	bpf_token_put(token);
 	return err;
 }
 
@@ -2173,7 +2144,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
 	free_uid(aux->user);
-	security_bpf_prog_free(aux->prog);
+	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
 }
 
@@ -2619,15 +2590,13 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
+#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
 
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
 	struct btf *attach_btf = NULL;
-	struct bpf_token *token = NULL;
-	bool bpf_cap;
 	int err;
 	char license[128];
 
@@ -2644,31 +2613,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 				 BPF_F_TEST_REG_INVARIANTS))
 		return -EINVAL;
 
-	bpf_prog_load_fixup_attach_type(attr);
-
-	if (attr->prog_token_fd) {
-		token = bpf_token_get_from_fd(attr->prog_token_fd);
-		if (IS_ERR(token))
-			return PTR_ERR(token);
-		/* if current token doesn't grant prog loading permissions,
-		 * then we can't use this token, so ignore it and rely on
-		 * system-wide capabilities checks
-		 */
-		if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) ||
-		    !bpf_token_allow_prog_type(token, attr->prog_type,
-					       attr->expected_attach_type)) {
-			bpf_token_put(token);
-			token = NULL;
-		}
-	}
-
-	bpf_cap = bpf_token_capable(token, CAP_BPF);
-	err = -EPERM;
-
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !bpf_cap)
-		goto put_token;
+	    !bpf_capable())
+		return -EPERM;
 
 	/* Intent here is for unprivileged_bpf_disabled to block BPF program
 	 * creation for unprivileged users; other actions depend
@@ -2677,23 +2625,21 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	 * capability checks are still carried out for these
 	 * and other operations.
 	 */
-	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
-		goto put_token;
+	if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
+		return -EPERM;
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
-		err = -E2BIG;
-		goto put_token;
-	}
+	    attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !bpf_cap)
-		goto put_token;
+	    !bpf_capable())
+		return -EPERM;
 
-	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
-		goto put_token;
-	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
-		goto put_token;
+	if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (is_perfmon_prog_type(type) && !perfmon_capable())
+		return -EPERM;
 
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
@@ -2703,33 +2649,27 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		if (IS_ERR(dst_prog)) {
 			dst_prog = NULL;
 			attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd);
-			if (IS_ERR(attach_btf)) {
-				err = -EINVAL;
-				goto put_token;
-			}
+			if (IS_ERR(attach_btf))
+				return -EINVAL;
 			if (!btf_is_kernel(attach_btf)) {
 				/* attaching through specifying bpf_prog's BTF
 				 * objects directly might be supported eventually
 				 */
 				btf_put(attach_btf);
-				err = -ENOTSUPP;
-				goto put_token;
+				return -ENOTSUPP;
 			}
 		}
 	} else if (attr->attach_btf_id) {
 		/* fall back to vmlinux BTF, if BTF type ID is specified */
 		attach_btf = bpf_get_btf_vmlinux();
-		if (IS_ERR(attach_btf)) {
-			err = PTR_ERR(attach_btf);
-			goto put_token;
-		}
-		if (!attach_btf) {
-			err = -EINVAL;
-			goto put_token;
-		}
+		if (IS_ERR(attach_btf))
+			return PTR_ERR(attach_btf);
+		if (!attach_btf)
+			return -EINVAL;
 		btf_get(attach_btf);
 	}
 
+	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
 				       attach_btf, attr->attach_btf_id,
 				       dst_prog)) {
@@ -2737,8 +2677,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		err = -EINVAL;
-		goto put_token;
+		return -EINVAL;
 	}
 
 	/* plain bpf_prog allocation */
@@ -2748,8 +2687,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			bpf_prog_put(dst_prog);
 		if (attach_btf)
 			btf_put(attach_btf);
-		err = -EINVAL;
-		goto put_token;
+		return -ENOMEM;
 	}
 
 	prog->expected_attach_type = attr->expected_attach_type;
@@ -2760,9 +2698,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
-	/* move token into prog->aux, reuse taken refcnt */
-	prog->aux->token = token;
-	token = NULL;
+	err = security_bpf_prog_alloc(prog->aux);
+	if (err)
+		goto free_prog;
 
 	prog->aux->user = get_current_user();
 	prog->len = attr->insn_cnt;
@@ -2771,12 +2709,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (copy_from_bpfptr(prog->insns,
 			     make_bpfptr(attr->insns, uattr.is_kernel),
 			     bpf_prog_insn_size(prog)) != 0)
-		goto free_prog;
+		goto free_prog_sec;
 	/* copy eBPF program license from user space */
 	if (strncpy_from_bpfptr(license,
 				make_bpfptr(attr->license, uattr.is_kernel),
 				sizeof(license) - 1) < 0)
-		goto free_prog;
+		goto free_prog_sec;
 	license[sizeof(license) - 1] = 0;
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
@@ -2790,29 +2728,25 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (bpf_prog_is_dev_bound(prog->aux)) {
 		err = bpf_prog_dev_bound_init(prog, attr);
 		if (err)
-			goto free_prog;
+			goto free_prog_sec;
 	}
 
 	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
 	    bpf_prog_is_dev_bound(dst_prog->aux)) {
 		err = bpf_prog_dev_bound_inherit(prog, dst_prog);
 		if (err)
-			goto free_prog;
+			goto free_prog_sec;
 	}
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
-		goto free_prog;
+		goto free_prog_sec;
 
 	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name,
 			       sizeof(attr->prog_name));
 	if (err < 0)
-		goto free_prog;
-
-	err = security_bpf_prog_load(prog, attr, token);
-	if (err)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
@@ -2858,16 +2792,13 @@ free_used_maps:
 	 */
 	__bpf_prog_put_noref(prog, prog->aux->real_func_cnt);
 	return err;
-
 free_prog_sec:
-	security_bpf_prog_free(prog);
-free_prog:
 	free_uid(prog->aux->user);
+	security_bpf_prog_free(prog->aux);
+free_prog:
 	if (prog->aux->attach_btf)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
-put_token:
-	bpf_token_put(token);
 	return err;
 }
 
@@ -3857,7 +3788,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
 	case BPF_PROG_TYPE_CGROUP_SKB:
-		if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN))
+		if (!capable(CAP_NET_ADMIN))
 			/* cg-skb progs can be loaded by unpriv user.
 			 * check permissions at attach time.
 			 */
@@ -4060,7 +3991,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
 			  union bpf_attr __user *uattr)
 {
-	if (!bpf_net_capable())
+	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_QUERY))
 		return -EINVAL;
@@ -4828,31 +4759,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
 
 static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
-	struct bpf_token *token = NULL;
-
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (attr->btf_token_fd) {
-		token = bpf_token_get_from_fd(attr->btf_token_fd);
-		if (IS_ERR(token))
-			return PTR_ERR(token);
-		if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) {
-			bpf_token_put(token);
-			token = NULL;
-		}
-	}
-
-	if (!bpf_token_capable(token, CAP_BPF)) {
-		bpf_token_put(token);
+	if (!bpf_capable())
 		return -EPERM;
-	}
-
-	bpf_token_put(token);
 
 	return btf_new_fd(attr, uattr, uattr_size);
 }
@@ -5470,20 +5385,6 @@ out_prog_put:
 	return ret;
 }
 
-#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
-
-static int token_create(union bpf_attr *attr)
-{
-	if (CHECK_ATTR(BPF_TOKEN_CREATE))
-		return -EINVAL;
-
-	/* no flags are supported yet */
-	if (attr->token_create.flags)
-		return -EINVAL;
-
-	return bpf_token_create(attr);
-}
-
 static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -5617,9 +5518,6 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
-	case BPF_TOKEN_CREATE:
-		err = token_create(&attr);
-		break;
 	default:
 		err = -EINVAL;
 		break;
@@ -5726,7 +5624,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
 const struct bpf_func_proto * __weak
 tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id, prog);
+	return bpf_base_func_proto(func_id);
 }
 
 BPF_CALL_1(bpf_sys_close, u32, fd)
@@ -5776,8 +5674,7 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_sys_bpf:
-		return !bpf_token_capable(prog->aux->token, CAP_PERFMON)
-		       ? NULL : &bpf_sys_bpf_proto;
+		return !perfmon_capable() ? NULL : &bpf_sys_bpf_proto;
 	case BPF_FUNC_btf_find_by_name_kind:
 		return &bpf_btf_find_by_name_kind_proto;
 	case BPF_FUNC_sys_close:
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
deleted file mode 100644
index a86fccd57e2d..000000000000
--- a/kernel/bpf/token.c
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <linux/bpf.h>
-#include <linux/vmalloc.h>
-#include <linux/fdtable.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/idr.h>
-#include <linux/namei.h>
-#include <linux/user_namespace.h>
-#include <linux/security.h>
-
-bool bpf_token_capable(const struct bpf_token *token, int cap)
-{
-	/* BPF token allows ns_capable() level of capabilities, but only if
-	 * token's userns is *exactly* the same as current user's userns
-	 */
-	if (token && current_user_ns() == token->userns) {
-		if (ns_capable(token->userns, cap) ||
-		    (cap != CAP_SYS_ADMIN && ns_capable(token->userns, CAP_SYS_ADMIN)))
-			return security_bpf_token_capable(token, cap) == 0;
-	}
-	/* otherwise fallback to capable() checks */
-	return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN));
-}
-
-void bpf_token_inc(struct bpf_token *token)
-{
-	atomic64_inc(&token->refcnt);
-}
-
-static void bpf_token_free(struct bpf_token *token)
-{
-	security_bpf_token_free(token);
-	put_user_ns(token->userns);
-	kvfree(token);
-}
-
-static void bpf_token_put_deferred(struct work_struct *work)
-{
-	struct bpf_token *token = container_of(work, struct bpf_token, work);
-
-	bpf_token_free(token);
-}
-
-void bpf_token_put(struct bpf_token *token)
-{
-	if (!token)
-		return;
-
-	if (!atomic64_dec_and_test(&token->refcnt))
-		return;
-
-	INIT_WORK(&token->work, bpf_token_put_deferred);
-	schedule_work(&token->work);
-}
-
-static int bpf_token_release(struct inode *inode, struct file *filp)
-{
-	struct bpf_token *token = filp->private_data;
-
-	bpf_token_put(token);
-	return 0;
-}
-
-static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
-{
-	struct bpf_token *token = filp->private_data;
-	u64 mask;
-
-	BUILD_BUG_ON(__MAX_BPF_CMD >= 64);
-	mask = (1ULL << __MAX_BPF_CMD) - 1;
-	if ((token->allowed_cmds & mask) == mask)
-		seq_printf(m, "allowed_cmds:\tany\n");
-	else
-		seq_printf(m, "allowed_cmds:\t0x%llx\n", token->allowed_cmds);
-
-	BUILD_BUG_ON(__MAX_BPF_MAP_TYPE >= 64);
-	mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
-	if ((token->allowed_maps & mask) == mask)
-		seq_printf(m, "allowed_maps:\tany\n");
-	else
-		seq_printf(m, "allowed_maps:\t0x%llx\n", token->allowed_maps);
-
-	BUILD_BUG_ON(__MAX_BPF_PROG_TYPE >= 64);
-	mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
-	if ((token->allowed_progs & mask) == mask)
-		seq_printf(m, "allowed_progs:\tany\n");
-	else
-		seq_printf(m, "allowed_progs:\t0x%llx\n", token->allowed_progs);
-
-	BUILD_BUG_ON(__MAX_BPF_ATTACH_TYPE >= 64);
-	mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
-	if ((token->allowed_attachs & mask) == mask)
-		seq_printf(m, "allowed_attachs:\tany\n");
-	else
-		seq_printf(m, "allowed_attachs:\t0x%llx\n", token->allowed_attachs);
-}
-
-#define BPF_TOKEN_INODE_NAME "bpf-token"
-
-static const struct inode_operations bpf_token_iops = { };
-
-static const struct file_operations bpf_token_fops = {
-	.release	= bpf_token_release,
-	.show_fdinfo	= bpf_token_show_fdinfo,
-};
-
-int bpf_token_create(union bpf_attr *attr)
-{
-	struct bpf_mount_opts *mnt_opts;
-	struct bpf_token *token = NULL;
-	struct user_namespace *userns;
-	struct inode *inode;
-	struct file *file;
-	struct path path;
-	struct fd f;
-	umode_t mode;
-	int err, fd;
-
-	f = fdget(attr->token_create.bpffs_fd);
-	if (!f.file)
-		return -EBADF;
-
-	path = f.file->f_path;
-	path_get(&path);
-	fdput(f);
-
-	if (path.dentry != path.mnt->mnt_sb->s_root) {
-		err = -EINVAL;
-		goto out_path;
-	}
-	if (path.mnt->mnt_sb->s_op != &bpf_super_ops) {
-		err = -EINVAL;
-		goto out_path;
-	}
-	err = path_permission(&path, MAY_ACCESS);
-	if (err)
-		goto out_path;
-
-	userns = path.dentry->d_sb->s_user_ns;
-	/*
-	 * Enforce that creators of BPF tokens are in the same user
-	 * namespace as the BPF FS instance. This makes reasoning about
-	 * permissions a lot easier and we can always relax this later.
-	 */
-	if (current_user_ns() != userns) {
-		err = -EPERM;
-		goto out_path;
-	}
-	if (!ns_capable(userns, CAP_BPF)) {
-		err = -EPERM;
-		goto out_path;
-	}
-
-	mnt_opts = path.dentry->d_sb->s_fs_info;
-	if (mnt_opts->delegate_cmds == 0 &&
-	    mnt_opts->delegate_maps == 0 &&
-	    mnt_opts->delegate_progs == 0 &&
-	    mnt_opts->delegate_attachs == 0) {
-		err = -ENOENT; /* no BPF token delegation is set up */
-		goto out_path;
-	}
-
-	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
-	inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out_path;
-	}
-
-	inode->i_op = &bpf_token_iops;
-	inode->i_fop = &bpf_token_fops;
-	clear_nlink(inode); /* make sure it is unlinked */
-
-	file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
-	if (IS_ERR(file)) {
-		iput(inode);
-		err = PTR_ERR(file);
-		goto out_path;
-	}
-
-	token = kvzalloc(sizeof(*token), GFP_USER);
-	if (!token) {
-		err = -ENOMEM;
-		goto out_file;
-	}
-
-	atomic64_set(&token->refcnt, 1);
-
-	/* remember bpffs owning userns for future ns_capable() checks */
-	token->userns = get_user_ns(userns);
-
-	token->allowed_cmds = mnt_opts->delegate_cmds;
-	token->allowed_maps = mnt_opts->delegate_maps;
-	token->allowed_progs = mnt_opts->delegate_progs;
-	token->allowed_attachs = mnt_opts->delegate_attachs;
-
-	err = security_bpf_token_create(token, attr, &path);
-	if (err)
-		goto out_token;
-
-	fd = get_unused_fd_flags(O_CLOEXEC);
-	if (fd < 0) {
-		err = fd;
-		goto out_token;
-	}
-
-	file->private_data = token;
-	fd_install(fd, file);
-
-	path_put(&path);
-	return fd;
-
-out_token:
-	bpf_token_free(token);
-out_file:
-	fput(file);
-out_path:
-	path_put(&path);
-	return err;
-}
-
-struct bpf_token *bpf_token_get_from_fd(u32 ufd)
-{
-	struct fd f = fdget(ufd);
-	struct bpf_token *token;
-
-	if (!f.file)
-		return ERR_PTR(-EBADF);
-	if (f.file->f_op != &bpf_token_fops) {
-		fdput(f);
-		return ERR_PTR(-EINVAL);
-	}
-
-	token = f.file->private_data;
-	bpf_token_inc(token);
-	fdput(f);
-
-	return token;
-}
-
-bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
-{
-	/* BPF token can be used only within exactly the same userns in which
-	 * it was created
-	 */
-	if (!token || current_user_ns() != token->userns)
-		return false;
-	if (!(token->allowed_cmds & (1ULL << cmd)))
-		return false;
-	return security_bpf_token_cmd(token, cmd) == 0;
-}
-
-bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type)
-{
-	if (!token || type >= __MAX_BPF_MAP_TYPE)
-		return false;
-
-	return token->allowed_maps & (1ULL << type);
-}
-
-bool bpf_token_allow_prog_type(const struct bpf_token *token,
-			       enum bpf_prog_type prog_type,
-			       enum bpf_attach_type attach_type)
-{
-	if (!token || prog_type >= __MAX_BPF_PROG_TYPE || attach_type >= __MAX_BPF_ATTACH_TYPE)
-		return false;
-
-	return (token->allowed_progs & (1ULL << prog_type)) &&
-	       (token->allowed_attachs & (1ULL << attach_type));
-}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9456ee0ad129..4ceec8c2a484 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20594,12 +20594,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
 	env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
-
-	env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
-	env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
-	env->bypass_spec_v1 = bpf_bypass_spec_v1(env->prog->aux->token);
-	env->bypass_spec_v4 = bpf_bypass_spec_v4(env->prog->aux->token);
-	env->bpf_capable = is_priv = bpf_token_capable(env->prog->aux->token, CAP_BPF);
+	is_priv = bpf_capable();
 
 	bpf_get_btf_vmlinux();
 
@@ -20631,6 +20626,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
 		env->strict_alignment = false;
 
+	env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+	env->allow_uninit_stack = bpf_allow_uninit_stack();
+	env->bypass_spec_v1 = bpf_bypass_spec_v1();
+	env->bypass_spec_v4 = bpf_bypass_spec_v4();
+	env->bpf_capable = bpf_capable();
+
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 	env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 492d60e9c480..7ac6c52b25eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1629,7 +1629,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_trace_vprintk:
 		return bpf_get_trace_vprintk_proto();
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 3cc52b82bab8..24061f29c9dd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -87,7 +87,7 @@
 #include "dev.h"
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
+bpf_sk_base_func_proto(enum bpf_func_id func_id);
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
 {
@@ -7862,7 +7862,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -7955,7 +7955,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 			return NULL;
 		}
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -7974,7 +7974,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8161,7 +8161,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8220,7 +8220,7 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 
 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
@@ -8281,7 +8281,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_sock_proto;
 #endif /* CONFIG_INET */
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8323,7 +8323,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_cgroup_classid_curr_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8367,7 +8367,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skc_lookup_tcp_proto;
 #endif
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8378,7 +8378,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_flow_dissector_load_bytes_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8405,7 +8405,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -8580,7 +8580,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+		if (!bpf_capable())
 			return false;
 		break;
 	}
@@ -8592,7 +8592,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+			if (!bpf_capable())
 				return false;
 			break;
 		default:
@@ -11236,7 +11236,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
@@ -11418,7 +11418,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_sk_release:
 		return &bpf_sk_release_proto;
 	default:
-		return bpf_sk_base_func_proto(func_id, prog);
+		return bpf_sk_base_func_proto(func_id);
 	}
 }
 
@@ -11752,7 +11752,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = {
 };
 
 static const struct bpf_func_proto *
-bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+bpf_sk_base_func_proto(enum bpf_func_id func_id)
 {
 	const struct bpf_func_proto *func;
 
@@ -11781,10 +11781,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 
-	if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+	if (!perfmon_capable())
 		return NULL;
 
 	return func;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 634cfafa583d..ae8b15e6896f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -191,7 +191,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_ktime_get_coarse_ns:
 		return &bpf_ktime_get_coarse_ns_proto;
 	default:
-		return bpf_base_func_proto(func_id, prog);
+		return bpf_base_func_proto(func_id);
 	}
 }
 
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index 5257d5e7eb09..0e4beae421f8 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
 static const struct bpf_func_proto *
 bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id, prog);
+	return bpf_base_func_proto(func_id);
 }
 
 const struct bpf_verifier_ops netfilter_verifier_ops = {
diff --git a/security/security.c b/security/security.c
index 088a79c35c26..dcb3e7014f9b 100644
--- a/security/security.c
+++ b/security/security.c
@@ -5167,87 +5167,29 @@ int security_bpf_prog(struct bpf_prog *prog)
 }
 
 /**
- * security_bpf_map_create() - Check if BPF map creation is allowed
- * @map: BPF map object
- * @attr: BPF syscall attributes used to create BPF map
- * @token: BPF token used to grant user access
- *
- * Do a check when the kernel creates a new BPF map. This is also the
- * point where LSM blob is allocated for LSMs that need them.
- *
- * Return: Returns 0 on success, error on failure.
- */
-int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-			    struct bpf_token *token)
-{
-	return call_int_hook(bpf_map_create, 0, map, attr, token);
-}
-
-/**
- * security_bpf_prog_load() - Check if loading of BPF program is allowed
- * @prog: BPF program object
- * @attr: BPF syscall attributes used to create BPF program
- * @token: BPF token used to grant user access to BPF subsystem
- *
- * Perform an access control check when the kernel loads a BPF program and
- * allocates associated BPF program object. This hook is also responsible for
- * allocating any required LSM state for the BPF program.
- *
- * Return: Returns 0 on success, error on failure.
- */
-int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-			   struct bpf_token *token)
-{
-	return call_int_hook(bpf_prog_load, 0, prog, attr, token);
-}
-
-/**
- * security_bpf_token_create() - Check if creating of BPF token is allowed
- * @token: BPF token object
- * @attr: BPF syscall attributes used to create BPF token
- * @path: path pointing to BPF FS mount point from which BPF token is created
- *
- * Do a check when the kernel instantiates a new BPF token object from BPF FS
- * instance. This is also the point where LSM blob can be allocated for LSMs.
- *
- * Return: Returns 0 on success, error on failure.
- */
-int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-			      struct path *path)
-{
-	return call_int_hook(bpf_token_create, 0, token, attr, path);
-}
-
-/**
- * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
- * requested BPF syscall command
- * @token: BPF token object
- * @cmd: BPF syscall command requested to be delegated by BPF token
+ * security_bpf_map_alloc() - Allocate a bpf map LSM blob
+ * @map: bpf map
  *
- * Do a check when the kernel decides whether provided BPF token should allow
- * delegation of requested BPF syscall command.
+ * Initialize the security field inside bpf map.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
+int security_bpf_map_alloc(struct bpf_map *map)
 {
-	return call_int_hook(bpf_token_cmd, 0, token, cmd);
+	return call_int_hook(bpf_map_alloc_security, 0, map);
 }
 
 /**
- * security_bpf_token_capable() - Check if BPF token is allowed to delegate
- * requested BPF-related capability
- * @token: BPF token object
- * @cap: capabilities requested to be delegated by BPF token
+ * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
+ * @aux: bpf program aux info struct
  *
- * Do a check when the kernel decides whether provided BPF token should allow
- * delegation of requested BPF-related capabilities.
+ * Initialize the security field inside bpf program.
  *
  * Return: Returns 0 on success, error on failure.
  */
-int security_bpf_token_capable(const struct bpf_token *token, int cap)
+int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
-	return call_int_hook(bpf_token_capable, 0, token, cap);
+	return call_int_hook(bpf_prog_alloc_security, 0, aux);
 }
 
 /**
@@ -5258,29 +5200,18 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap)
  */
 void security_bpf_map_free(struct bpf_map *map)
 {
-	call_void_hook(bpf_map_free, map);
-}
-
-/**
- * security_bpf_prog_free() - Free a BPF program's LSM blob
- * @prog: BPF program struct
- *
- * Clean up the security information stored inside BPF program.
- */
-void security_bpf_prog_free(struct bpf_prog *prog)
-{
-	call_void_hook(bpf_prog_free, prog);
+	call_void_hook(bpf_map_free_security, map);
 }
 
 /**
- * security_bpf_token_free() - Free a BPF token's LSM blob
- * @token: BPF token struct
+ * security_bpf_prog_free() - Free a bpf program's LSM blob
+ * @aux: bpf program aux info struct
  *
- * Clean up the security information stored inside BPF token.
+ * Clean up the security information stored inside bpf prog.
  */
-void security_bpf_token_free(struct bpf_token *token)
+void security_bpf_prog_free(struct bpf_prog_aux *aux)
 {
-	call_void_hook(bpf_token_free, token);
+	call_void_hook(bpf_prog_free_security, aux);
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1501e95366a1..feda711c6b7b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6783,8 +6783,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog)
 			    BPF__PROG_RUN, NULL);
 }
 
-static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
-				  struct bpf_token *token)
+static int selinux_bpf_map_alloc(struct bpf_map *map)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6806,8 +6805,7 @@ static void selinux_bpf_map_free(struct bpf_map *map)
 	kfree(bpfsec);
 }
 
-static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
-				 struct bpf_token *token)
+static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
 {
 	struct bpf_security_struct *bpfsec;
 
@@ -6816,39 +6814,16 @@ static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
 		return -ENOMEM;
 
 	bpfsec->sid = current_sid();
-	prog->aux->security = bpfsec;
+	aux->security = bpfsec;
 
 	return 0;
 }
 
-static void selinux_bpf_prog_free(struct bpf_prog *prog)
+static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
 {
-	struct bpf_security_struct *bpfsec = prog->aux->security;
+	struct bpf_security_struct *bpfsec = aux->security;
 
-	prog->aux->security = NULL;
-	kfree(bpfsec);
-}
-
-static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
-				    struct path *path)
-{
-	struct bpf_security_struct *bpfsec;
-
-	bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
-	if (!bpfsec)
-		return -ENOMEM;
-
-	bpfsec->sid = current_sid();
-	token->security = bpfsec;
-
-	return 0;
-}
-
-static void selinux_bpf_token_free(struct bpf_token *token)
-{
-	struct bpf_security_struct *bpfsec = token->security;
-
-	token->security = NULL;
+	aux->security = NULL;
 	kfree(bpfsec);
 }
 #endif
@@ -7204,9 +7179,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(bpf, selinux_bpf),
 	LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
 	LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
-	LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
-	LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
-	LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free),
+	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
@@ -7263,9 +7237,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
 #endif
 #ifdef CONFIG_BPF_SYSCALL
-	LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
-	LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
-	LSM_HOOK_INIT(bpf_token_create, selinux_bpf_token_create),
+	LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+	LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e0545201b55f..7f24d898efbb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -847,36 +847,6 @@ union bpf_iter_link_info {
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
  *
- * BPF_TOKEN_CREATE
- *	Description
- *		Create BPF token with embedded information about what
- *		BPF-related functionality it allows:
- *		- a set of allowed bpf() syscall commands;
- *		- a set of allowed BPF map types to be created with
- *		BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed;
- *		- a set of allowed BPF program types and BPF program attach
- *		types to be loaded with BPF_PROG_LOAD command, if
- *		BPF_PROG_LOAD itself is allowed.
- *
- *		BPF token is created (derived) from an instance of BPF FS,
- *		assuming it has necessary delegation mount options specified.
- *		This BPF token can be passed as an extra parameter to various
- *		bpf() syscall commands to grant BPF subsystem functionality to
- *		unprivileged processes.
- *
- *		When created, BPF token is "associated" with the owning
- *		user namespace of BPF FS instance (super block) that it was
- *		derived from, and subsequent BPF operations performed with
- *		BPF token would be performing capabilities checks (i.e.,
- *		CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within
- *		that user namespace. Without BPF token, such capabilities
- *		have to be granted in init user namespace, making bpf()
- *		syscall incompatible with user namespace, for the most part.
- *
- *	Return
- *		A new file descriptor (a nonnegative integer), or -1 if an
- *		error occurred (in which case, *errno* is set appropriately).
- *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -931,8 +901,6 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
-	BPF_TOKEN_CREATE,
-	__MAX_BPF_CMD,
 };
 
 enum bpf_map_type {
@@ -983,7 +951,6 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_BLOOM_FILTER,
 	BPF_MAP_TYPE_USER_RINGBUF,
 	BPF_MAP_TYPE_CGRP_STORAGE,
-	__MAX_BPF_MAP_TYPE
 };
 
 /* Note that tracing related programs such as
@@ -1028,7 +995,6 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
-	__MAX_BPF_PROG_TYPE
 };
 
 enum bpf_attach_type {
@@ -1437,7 +1403,6 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
-		__u32	map_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -1507,7 +1472,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		log_true_size;
-		__u32		prog_token_fd;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1620,7 +1584,6 @@ union bpf_attr {
 		 * truncated), or smaller (if log buffer wasn't filled completely).
 		 */
 		__u32		btf_log_true_size;
-		__u32		btf_token_fd;
 	};
 
 	struct {
@@ -1751,11 +1714,6 @@ union bpf_attr {
 		__u32		flags;		/* extra flags */
 	} prog_bind_map;
 
-	struct { /* struct used by BPF_TOKEN_CREATE command */
-		__u32		flags;
-		__u32		bpffs_fd;
-	} token_create;
-
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index b6619199a706..2d0c282c8588 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1,4 +1,4 @@
 libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
 	    netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \
 	    btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \
-	    usdt.o zip.o elf.o features.o
+	    usdt.o zip.o elf.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 0ad8e532b3cf..9dc9625651dc 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -103,7 +103,7 @@ int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts)
  *   [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/
  *   [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper")
  */
-int probe_memcg_account(int token_fd)
+int probe_memcg_account(void)
 {
 	const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd);
 	struct bpf_insn insns[] = {
@@ -120,7 +120,6 @@ int probe_memcg_account(int token_fd)
 	attr.insns = ptr_to_u64(insns);
 	attr.insn_cnt = insn_cnt;
 	attr.license = ptr_to_u64("GPL");
-	attr.prog_token_fd = token_fd;
 
 	prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz);
 	if (prog_fd >= 0) {
@@ -147,7 +146,7 @@ int bump_rlimit_memlock(void)
 	struct rlimit rlim;
 
 	/* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */
-	if (memlock_bumped || feat_supported(NULL, FEAT_MEMCG_ACCOUNT))
+	if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT))
 		return 0;
 
 	memlock_bumped = true;
@@ -170,7 +169,7 @@ int bpf_map_create(enum bpf_map_type map_type,
 		   __u32 max_entries,
 		   const struct bpf_map_create_opts *opts)
 {
-	const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd);
+	const size_t attr_sz = offsetofend(union bpf_attr, map_extra);
 	union bpf_attr attr;
 	int fd;
 
@@ -182,7 +181,7 @@ int bpf_map_create(enum bpf_map_type map_type,
 		return libbpf_err(-EINVAL);
 
 	attr.map_type = map_type;
-	if (map_name && feat_supported(NULL, FEAT_PROG_NAME))
+	if (map_name && kernel_supports(NULL, FEAT_PROG_NAME))
 		libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name));
 	attr.key_size = key_size;
 	attr.value_size = value_size;
@@ -199,8 +198,6 @@ int bpf_map_create(enum bpf_map_type map_type,
 	attr.numa_node = OPTS_GET(opts, numa_node, 0);
 	attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0);
 
-	attr.map_token_fd = OPTS_GET(opts, token_fd, 0);
-
 	fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz);
 	return libbpf_err_errno(fd);
 }
@@ -235,7 +232,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 		  const struct bpf_insn *insns, size_t insn_cnt,
 		  struct bpf_prog_load_opts *opts)
 {
-	const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd);
+	const size_t attr_sz = offsetofend(union bpf_attr, log_true_size);
 	void *finfo = NULL, *linfo = NULL;
 	const char *func_info, *line_info;
 	__u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd;
@@ -264,9 +261,8 @@ int bpf_prog_load(enum bpf_prog_type prog_type,
 	attr.prog_flags = OPTS_GET(opts, prog_flags, 0);
 	attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0);
 	attr.kern_version = OPTS_GET(opts, kern_version, 0);
-	attr.prog_token_fd = OPTS_GET(opts, token_fd, 0);
 
-	if (prog_name && feat_supported(NULL, FEAT_PROG_NAME))
+	if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME))
 		libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name));
 	attr.license = ptr_to_u64(license);
 
@@ -1186,7 +1182,7 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 
 int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts)
 {
-	const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd);
+	const size_t attr_sz = offsetofend(union bpf_attr, btf_log_true_size);
 	union bpf_attr attr;
 	char *log_buf;
 	size_t log_size;
@@ -1211,8 +1207,6 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts
 
 	attr.btf = ptr_to_u64(btf_data);
 	attr.btf_size = btf_size;
-	attr.btf_token_fd = OPTS_GET(opts, token_fd, 0);
-
 	/* log_level == 0 and log_buf != NULL means "try loading without
 	 * log_buf, but retry with log_buf and log_level=1 on error", which is
 	 * consistent across low-level and high-level BTF and program loading
@@ -1293,20 +1287,3 @@ int bpf_prog_bind_map(int prog_fd, int map_fd,
 	ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, attr_sz);
 	return libbpf_err_errno(ret);
 }
-
-int bpf_token_create(int bpffs_fd, struct bpf_token_create_opts *opts)
-{
-	const size_t attr_sz = offsetofend(union bpf_attr, token_create);
-	union bpf_attr attr;
-	int fd;
-
-	if (!OPTS_VALID(opts, bpf_token_create_opts))
-		return libbpf_err(-EINVAL);
-
-	memset(&attr, 0, attr_sz);
-	attr.token_create.bpffs_fd = bpffs_fd;
-	attr.token_create.flags = OPTS_GET(opts, flags, 0);
-
-	fd = sys_bpf_fd(BPF_TOKEN_CREATE, &attr, attr_sz);
-	return libbpf_err_errno(fd);
-}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 991b86bfe7e4..d0f53772bdc0 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -51,11 +51,8 @@ struct bpf_map_create_opts {
 
 	__u32 numa_node;
 	__u32 map_ifindex;
-
-	__u32 token_fd;
-	size_t :0;
 };
-#define bpf_map_create_opts__last_field token_fd
+#define bpf_map_create_opts__last_field map_ifindex
 
 LIBBPF_API int bpf_map_create(enum bpf_map_type map_type,
 			      const char *map_name,
@@ -105,10 +102,9 @@ struct bpf_prog_load_opts {
 	 * If kernel doesn't support this feature, log_size is left unchanged.
 	 */
 	__u32 log_true_size;
-	__u32 token_fd;
 	size_t :0;
 };
-#define bpf_prog_load_opts__last_field token_fd
+#define bpf_prog_load_opts__last_field log_true_size
 
 LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type,
 			     const char *prog_name, const char *license,
@@ -134,10 +130,9 @@ struct bpf_btf_load_opts {
 	 * If kernel doesn't support this feature, log_size is left unchanged.
 	 */
 	__u32 log_true_size;
-	__u32 token_fd;
 	size_t :0;
 };
-#define bpf_btf_load_opts__last_field token_fd
+#define bpf_btf_load_opts__last_field log_true_size
 
 LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size,
 			    struct bpf_btf_load_opts *opts);
@@ -645,30 +640,6 @@ struct bpf_test_run_opts {
 LIBBPF_API int bpf_prog_test_run_opts(int prog_fd,
 				      struct bpf_test_run_opts *opts);
 
-struct bpf_token_create_opts {
-	size_t sz; /* size of this struct for forward/backward compatibility */
-	__u32 flags;
-	size_t :0;
-};
-#define bpf_token_create_opts__last_field flags
-
-/**
- * @brief **bpf_token_create()** creates a new instance of BPF token derived
- * from specified BPF FS mount point.
- *
- * BPF token created with this API can be passed to bpf() syscall for
- * commands like BPF_PROG_LOAD, BPF_MAP_CREATE, etc.
- *
- * @param bpffs_fd FD for BPF FS instance from which to derive a BPF token
- * instance.
- * @param opts optional BPF token creation options, can be NULL
- *
- * @return BPF token FD > 0, on success; negative error code, otherwise (errno
- * is also set to the error code)
- */
-LIBBPF_API int bpf_token_create(int bpffs_fd,
-				struct bpf_token_create_opts *opts);
-
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 63033c334320..ee95fd379d4d 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -1317,9 +1317,7 @@ struct btf *btf__parse_split(const char *path, struct btf *base_btf)
 
 static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian);
 
-int btf_load_into_kernel(struct btf *btf,
-			 char *log_buf, size_t log_sz, __u32 log_level,
-			 int token_fd)
+int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level)
 {
 	LIBBPF_OPTS(bpf_btf_load_opts, opts);
 	__u32 buf_sz = 0, raw_size;
@@ -1369,7 +1367,6 @@ retry_load:
 		opts.log_level = log_level;
 	}
 
-	opts.token_fd = token_fd;
 	btf->fd = bpf_btf_load(raw_data, raw_size, &opts);
 	if (btf->fd < 0) {
 		/* time to turn on verbose mode and try again */
@@ -1397,7 +1394,7 @@ done:
 
 int btf__load_into_kernel(struct btf *btf)
 {
-	return btf_load_into_kernel(btf, NULL, 0, 0, 0);
+	return btf_load_into_kernel(btf, NULL, 0, 0);
 }
 
 int btf__fd(const struct btf *btf)
diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c
index c92e02394159..b02faec748a5 100644
--- a/tools/lib/bpf/elf.c
+++ b/tools/lib/bpf/elf.c
@@ -11,6 +11,8 @@
 #include "libbpf_internal.h"
 #include "str_error.h"
 
+#define STRERR_BUFSIZE  128
+
 /* A SHT_GNU_versym section holds 16-bit words. This bit is set if
  * the symbol is hidden and can only be seen when referenced using an
  * explicit version number. This is a GNU extension.
diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
deleted file mode 100644
index ce98a334be21..000000000000
--- a/tools/lib/bpf/features.c
+++ /dev/null
@@ -1,478 +0,0 @@
-// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-#include <linux/kernel.h>
-#include <linux/filter.h>
-#include "bpf.h"
-#include "libbpf.h"
-#include "libbpf_common.h"
-#include "libbpf_internal.h"
-#include "str_error.h"
-
-static inline __u64 ptr_to_u64(const void *ptr)
-{
-	return (__u64)(unsigned long)ptr;
-}
-
-static int probe_fd(int fd)
-{
-	if (fd >= 0)
-		close(fd);
-	return fd >= 0;
-}
-
-static int probe_kern_prog_name(int token_fd)
-{
-	const size_t attr_sz = offsetofend(union bpf_attr, prog_name);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	union bpf_attr attr;
-	int ret;
-
-	memset(&attr, 0, attr_sz);
-	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
-	attr.license = ptr_to_u64("GPL");
-	attr.insns = ptr_to_u64(insns);
-	attr.insn_cnt = (__u32)ARRAY_SIZE(insns);
-	attr.prog_token_fd = token_fd;
-	libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name));
-
-	/* make sure loading with name works */
-	ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS);
-	return probe_fd(ret);
-}
-
-static int probe_kern_global_data(int token_fd)
-{
-	char *cp, errmsg[STRERR_BUFSIZE];
-	struct bpf_insn insns[] = {
-		BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
-		BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts, .token_fd = token_fd);
-	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, .token_fd = token_fd);
-	int ret, map, insn_cnt = ARRAY_SIZE(insns);
-
-	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, &map_opts);
-	if (map < 0) {
-		ret = -errno;
-		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
-		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
-			__func__, cp, -ret);
-		return ret;
-	}
-
-	insns[0].imm = map;
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts);
-	close(map);
-	return probe_fd(ret);
-}
-
-static int probe_kern_btf(int token_fd)
-{
-	static const char strs[] = "\0int";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_func(int token_fd)
-{
-	static const char strs[] = "\0int\0x\0a";
-	/* void x(int a) {} */
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* FUNC_PROTO */                                /* [2] */
-		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
-		BTF_PARAM_ENC(7, 1),
-		/* FUNC x */                                    /* [3] */
-		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_func_global(int token_fd)
-{
-	static const char strs[] = "\0int\0x\0a";
-	/* static void x(int a) {} */
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* FUNC_PROTO */                                /* [2] */
-		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
-		BTF_PARAM_ENC(7, 1),
-		/* FUNC x BTF_FUNC_GLOBAL */                    /* [3] */
-		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_datasec(int token_fd)
-{
-	static const char strs[] = "\0x\0.data";
-	/* static int a; */
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* VAR x */                                     /* [2] */
-		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
-		BTF_VAR_STATIC,
-		/* DATASEC val */                               /* [3] */
-		BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
-		BTF_VAR_SECINFO_ENC(2, 0, 4),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_float(int token_fd)
-{
-	static const char strs[] = "\0float";
-	__u32 types[] = {
-		/* float */
-		BTF_TYPE_FLOAT_ENC(1, 4),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_decl_tag(int token_fd)
-{
-	static const char strs[] = "\0tag";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
-		/* VAR x */                                     /* [2] */
-		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
-		BTF_VAR_STATIC,
-		/* attr */
-		BTF_TYPE_DECL_TAG_ENC(1, 2, -1),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_btf_type_tag(int token_fd)
-{
-	static const char strs[] = "\0tag";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),		/* [1] */
-		/* attr */
-		BTF_TYPE_TYPE_TAG_ENC(1, 1),				/* [2] */
-		/* ptr */
-		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),	/* [3] */
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-static int probe_kern_array_mmap(int token_fd)
-{
-	LIBBPF_OPTS(bpf_map_create_opts, opts,
-		.map_flags = BPF_F_MMAPABLE,
-		.token_fd = token_fd,
-	);
-	int fd;
-
-	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts);
-	return probe_fd(fd);
-}
-
-static int probe_kern_exp_attach_type(int token_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, opts,
-		.expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE,
-		.token_fd = token_fd,
-	);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	int fd, insn_cnt = ARRAY_SIZE(insns);
-
-	/* use any valid combination of program type and (optional)
-	 * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS)
-	 * to see if kernel supports expected_attach_type field for
-	 * BPF_PROG_LOAD command
-	 */
-	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts);
-	return probe_fd(fd);
-}
-
-static int probe_kern_probe_read_kernel(int token_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),	/* r1 = r10 (fp) */
-		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),	/* r1 += -8 */
-		BPF_MOV64_IMM(BPF_REG_2, 8),		/* r2 = 8 */
-		BPF_MOV64_IMM(BPF_REG_3, 0),		/* r3 = 0 */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel),
-		BPF_EXIT_INSN(),
-	};
-	int fd, insn_cnt = ARRAY_SIZE(insns);
-
-	fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
-	return probe_fd(fd);
-}
-
-static int probe_prog_bind_map(int token_fd)
-{
-	char *cp, errmsg[STRERR_BUFSIZE];
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts, .token_fd = token_fd);
-	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, .token_fd = token_fd);
-	int ret, map, prog, insn_cnt = ARRAY_SIZE(insns);
-
-	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, &map_opts);
-	if (map < 0) {
-		ret = -errno;
-		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
-		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
-			__func__, cp, -ret);
-		return ret;
-	}
-
-	prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts);
-	if (prog < 0) {
-		close(map);
-		return 0;
-	}
-
-	ret = bpf_prog_bind_map(prog, map, NULL);
-
-	close(map);
-	close(prog);
-
-	return ret >= 0;
-}
-
-static int probe_module_btf(int token_fd)
-{
-	static const char strs[] = "\0int";
-	__u32 types[] = {
-		/* int */
-		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
-	};
-	struct bpf_btf_info info;
-	__u32 len = sizeof(info);
-	char name[16];
-	int fd, err;
-
-	fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), token_fd);
-	if (fd < 0)
-		return 0; /* BTF not supported at all */
-
-	memset(&info, 0, sizeof(info));
-	info.name = ptr_to_u64(name);
-	info.name_len = sizeof(name);
-
-	/* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer;
-	 * kernel's module BTF support coincides with support for
-	 * name/name_len fields in struct bpf_btf_info.
-	 */
-	err = bpf_btf_get_info_by_fd(fd, &info, &len);
-	close(fd);
-	return !err;
-}
-
-static int probe_perf_link(int token_fd)
-{
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
-	int prog_fd, link_fd, err;
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL",
-				insns, ARRAY_SIZE(insns), &opts);
-	if (prog_fd < 0)
-		return -errno;
-
-	/* use invalid perf_event FD to get EBADF, if link is supported;
-	 * otherwise EINVAL should be returned
-	 */
-	link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL);
-	err = -errno; /* close() can clobber errno */
-
-	if (link_fd >= 0)
-		close(link_fd);
-	close(prog_fd);
-
-	return link_fd < 0 && err == -EBADF;
-}
-
-static int probe_uprobe_multi_link(int token_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, load_opts,
-		.expected_attach_type = BPF_TRACE_UPROBE_MULTI,
-		.token_fd = token_fd,
-	);
-	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
-	struct bpf_insn insns[] = {
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	int prog_fd, link_fd, err;
-	unsigned long offset = 0;
-
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL",
-				insns, ARRAY_SIZE(insns), &load_opts);
-	if (prog_fd < 0)
-		return -errno;
-
-	/* Creating uprobe in '/' binary should fail with -EBADF. */
-	link_opts.uprobe_multi.path = "/";
-	link_opts.uprobe_multi.offsets = &offset;
-	link_opts.uprobe_multi.cnt = 1;
-
-	link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts);
-	err = -errno; /* close() can clobber errno */
-
-	if (link_fd >= 0)
-		close(link_fd);
-	close(prog_fd);
-
-	return link_fd < 0 && err == -EBADF;
-}
-
-static int probe_kern_bpf_cookie(int token_fd)
-{
-	struct bpf_insn insns[] = {
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie),
-		BPF_EXIT_INSN(),
-	};
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = token_fd);
-	int ret, insn_cnt = ARRAY_SIZE(insns);
-
-	ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
-	return probe_fd(ret);
-}
-
-static int probe_kern_btf_enum64(int token_fd)
-{
-	static const char strs[] = "\0enum64";
-	__u32 types[] = {
-		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8),
-	};
-
-	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
-					     strs, sizeof(strs), token_fd));
-}
-
-typedef int (*feature_probe_fn)(int /* token_fd */);
-
-static struct kern_feature_cache feature_cache;
-
-static struct kern_feature_desc {
-	const char *desc;
-	feature_probe_fn probe;
-} feature_probes[__FEAT_CNT] = {
-	[FEAT_PROG_NAME] = {
-		"BPF program name", probe_kern_prog_name,
-	},
-	[FEAT_GLOBAL_DATA] = {
-		"global variables", probe_kern_global_data,
-	},
-	[FEAT_BTF] = {
-		"minimal BTF", probe_kern_btf,
-	},
-	[FEAT_BTF_FUNC] = {
-		"BTF functions", probe_kern_btf_func,
-	},
-	[FEAT_BTF_GLOBAL_FUNC] = {
-		"BTF global function", probe_kern_btf_func_global,
-	},
-	[FEAT_BTF_DATASEC] = {
-		"BTF data section and variable", probe_kern_btf_datasec,
-	},
-	[FEAT_ARRAY_MMAP] = {
-		"ARRAY map mmap()", probe_kern_array_mmap,
-	},
-	[FEAT_EXP_ATTACH_TYPE] = {
-		"BPF_PROG_LOAD expected_attach_type attribute",
-		probe_kern_exp_attach_type,
-	},
-	[FEAT_PROBE_READ_KERN] = {
-		"bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel,
-	},
-	[FEAT_PROG_BIND_MAP] = {
-		"BPF_PROG_BIND_MAP support", probe_prog_bind_map,
-	},
-	[FEAT_MODULE_BTF] = {
-		"module BTF support", probe_module_btf,
-	},
-	[FEAT_BTF_FLOAT] = {
-		"BTF_KIND_FLOAT support", probe_kern_btf_float,
-	},
-	[FEAT_PERF_LINK] = {
-		"BPF perf link support", probe_perf_link,
-	},
-	[FEAT_BTF_DECL_TAG] = {
-		"BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag,
-	},
-	[FEAT_BTF_TYPE_TAG] = {
-		"BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag,
-	},
-	[FEAT_MEMCG_ACCOUNT] = {
-		"memcg-based memory accounting", probe_memcg_account,
-	},
-	[FEAT_BPF_COOKIE] = {
-		"BPF cookie support", probe_kern_bpf_cookie,
-	},
-	[FEAT_BTF_ENUM64] = {
-		"BTF_KIND_ENUM64 support", probe_kern_btf_enum64,
-	},
-	[FEAT_SYSCALL_WRAPPER] = {
-		"Kernel using syscall wrapper", probe_kern_syscall_wrapper,
-	},
-	[FEAT_UPROBE_MULTI_LINK] = {
-		"BPF multi-uprobe link support", probe_uprobe_multi_link,
-	},
-};
-
-bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
-{
-	struct kern_feature_desc *feat = &feature_probes[feat_id];
-	int ret;
-
-	/* assume global feature cache, unless custom one is provided */
-	if (!cache)
-		cache = &feature_cache;
-
-	if (READ_ONCE(cache->res[feat_id]) == FEAT_UNKNOWN) {
-		ret = feat->probe(cache->token_fd);
-		if (ret > 0) {
-			WRITE_ONCE(cache->res[feat_id], FEAT_SUPPORTED);
-		} else if (ret == 0) {
-			WRITE_ONCE(cache->res[feat_id], FEAT_MISSING);
-		} else {
-			pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret);
-			WRITE_ONCE(cache->res[feat_id], FEAT_MISSING);
-		}
-	}
-
-	return READ_ONCE(cache->res[feat_id]) == FEAT_SUPPORTED;
-}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4b5ff9508e18..ac54ebc0629f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -59,8 +59,6 @@
 #define BPF_FS_MAGIC		0xcafe4a11
 #endif
 
-#define BPF_FS_DEFAULT_PATH "/sys/fs/bpf"
-
 #define BPF_INSN_SZ (sizeof(struct bpf_insn))
 
 /* vsprintf() in __base_pr() uses nonliteral format string. It may break
@@ -695,10 +693,6 @@ struct bpf_object {
 
 	struct usdt_manager *usdt_man;
 
-	struct kern_feature_cache *feat_cache;
-	char *token_path;
-	int token_fd;
-
 	char path[];
 };
 
@@ -2198,7 +2192,7 @@ static int build_map_pin_path(struct bpf_map *map, const char *path)
 	int err;
 
 	if (!path)
-		path = BPF_FS_DEFAULT_PATH;
+		path = "/sys/fs/bpf";
 
 	err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map));
 	if (err)
@@ -3285,7 +3279,7 @@ skip_exception_cb:
 	} else {
 		/* currently BPF_BTF_LOAD only supports log_level 1 */
 		err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size,
-					   obj->log_level ? 1 : 0, obj->token_fd);
+					   obj->log_level ? 1 : 0);
 	}
 	if (sanitize) {
 		if (!err) {
@@ -4608,63 +4602,6 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries)
 	return 0;
 }
 
-static int bpf_object_prepare_token(struct bpf_object *obj)
-{
-	const char *bpffs_path;
-	int bpffs_fd = -1, token_fd, err;
-	bool mandatory;
-	enum libbpf_print_level level;
-
-	/* token is already set up */
-	if (obj->token_fd > 0)
-		return 0;
-	/* token is explicitly prevented */
-	if (obj->token_fd < 0) {
-		pr_debug("object '%s': token is prevented, skipping...\n", obj->name);
-		/* reset to zero to avoid extra checks during map_create and prog_load steps */
-		obj->token_fd = 0;
-		return 0;
-	}
-
-	mandatory = obj->token_path != NULL;
-	level = mandatory ? LIBBPF_WARN : LIBBPF_DEBUG;
-
-	bpffs_path = obj->token_path ?: BPF_FS_DEFAULT_PATH;
-	bpffs_fd = open(bpffs_path, O_DIRECTORY, O_RDWR);
-	if (bpffs_fd < 0) {
-		err = -errno;
-		__pr(level, "object '%s': failed (%d) to open BPF FS mount at '%s'%s\n",
-		     obj->name, err, bpffs_path,
-		     mandatory ? "" : ", skipping optional step...");
-		return mandatory ? err : 0;
-	}
-
-	token_fd = bpf_token_create(bpffs_fd, 0);
-	close(bpffs_fd);
-	if (token_fd < 0) {
-		if (!mandatory && token_fd == -ENOENT) {
-			pr_debug("object '%s': BPF FS at '%s' doesn't have BPF token delegation set up, skipping...\n",
-				 obj->name, bpffs_path);
-			return 0;
-		}
-		__pr(level, "object '%s': failed (%d) to create BPF token from '%s'%s\n",
-		     obj->name, token_fd, bpffs_path,
-		     mandatory ? "" : ", skipping optional step...");
-		return mandatory ? token_fd : 0;
-	}
-
-	obj->feat_cache = calloc(1, sizeof(*obj->feat_cache));
-	if (!obj->feat_cache) {
-		close(token_fd);
-		return -ENOMEM;
-	}
-
-	obj->token_fd = token_fd;
-	obj->feat_cache->token_fd = token_fd;
-
-	return 0;
-}
-
 static int
 bpf_object__probe_loading(struct bpf_object *obj)
 {
@@ -4674,7 +4611,6 @@ bpf_object__probe_loading(struct bpf_object *obj)
 		BPF_EXIT_INSN(),
 	};
 	int ret, insn_cnt = ARRAY_SIZE(insns);
-	LIBBPF_OPTS(bpf_prog_load_opts, opts, .token_fd = obj->token_fd);
 
 	if (obj->gen_loader)
 		return 0;
@@ -4684,9 +4620,9 @@ bpf_object__probe_loading(struct bpf_object *obj)
 		pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret);
 
 	/* make sure basic loading works */
-	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &opts);
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
 	if (ret < 0)
-		ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts);
+		ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);
 	if (ret < 0) {
 		ret = errno;
 		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
@@ -4701,18 +4637,462 @@ bpf_object__probe_loading(struct bpf_object *obj)
 	return 0;
 }
 
+static int probe_fd(int fd)
+{
+	if (fd >= 0)
+		close(fd);
+	return fd >= 0;
+}
+
+static int probe_kern_prog_name(void)
+{
+	const size_t attr_sz = offsetofend(union bpf_attr, prog_name);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	union bpf_attr attr;
+	int ret;
+
+	memset(&attr, 0, attr_sz);
+	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	attr.license = ptr_to_u64("GPL");
+	attr.insns = ptr_to_u64(insns);
+	attr.insn_cnt = (__u32)ARRAY_SIZE(insns);
+	libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name));
+
+	/* make sure loading with name works */
+	ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS);
+	return probe_fd(ret);
+}
+
+static int probe_kern_global_data(void)
+{
+	char *cp, errmsg[STRERR_BUFSIZE];
+	struct bpf_insn insns[] = {
+		BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16),
+		BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int ret, map, insn_cnt = ARRAY_SIZE(insns);
+
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL);
+	if (map < 0) {
+		ret = -errno;
+		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
+		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
+			__func__, cp, -ret);
+		return ret;
+	}
+
+	insns[0].imm = map;
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
+	close(map);
+	return probe_fd(ret);
+}
+
+static int probe_kern_btf(void)
+{
+	static const char strs[] = "\0int";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_func(void)
+{
+	static const char strs[] = "\0int\0x\0a";
+	/* void x(int a) {} */
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* FUNC_PROTO */                                /* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
+		BTF_PARAM_ENC(7, 1),
+		/* FUNC x */                                    /* [3] */
+		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_func_global(void)
+{
+	static const char strs[] = "\0int\0x\0a";
+	/* static void x(int a) {} */
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* FUNC_PROTO */                                /* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0),
+		BTF_PARAM_ENC(7, 1),
+		/* FUNC x BTF_FUNC_GLOBAL */                    /* [3] */
+		BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_datasec(void)
+{
+	static const char strs[] = "\0x\0.data";
+	/* static int a; */
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* VAR x */                                     /* [2] */
+		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
+		BTF_VAR_STATIC,
+		/* DATASEC val */                               /* [3] */
+		BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+		BTF_VAR_SECINFO_ENC(2, 0, 4),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_float(void)
+{
+	static const char strs[] = "\0float";
+	__u32 types[] = {
+		/* float */
+		BTF_TYPE_FLOAT_ENC(1, 4),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_decl_tag(void)
+{
+	static const char strs[] = "\0tag";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),  /* [1] */
+		/* VAR x */                                     /* [2] */
+		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1),
+		BTF_VAR_STATIC,
+		/* attr */
+		BTF_TYPE_DECL_TAG_ENC(1, 2, -1),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_btf_type_tag(void)
+{
+	static const char strs[] = "\0tag";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),		/* [1] */
+		/* attr */
+		BTF_TYPE_TYPE_TAG_ENC(1, 1),				/* [2] */
+		/* ptr */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),	/* [3] */
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_array_mmap(void)
+{
+	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
+	int fd;
+
+	fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts);
+	return probe_fd(fd);
+}
+
+static int probe_kern_exp_attach_type(void)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int fd, insn_cnt = ARRAY_SIZE(insns);
+
+	/* use any valid combination of program type and (optional)
+	 * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS)
+	 * to see if kernel supports expected_attach_type field for
+	 * BPF_PROG_LOAD command
+	 */
+	fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts);
+	return probe_fd(fd);
+}
+
+static int probe_kern_probe_read_kernel(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),	/* r1 = r10 (fp) */
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),	/* r1 += -8 */
+		BPF_MOV64_IMM(BPF_REG_2, 8),		/* r2 = 8 */
+		BPF_MOV64_IMM(BPF_REG_3, 0),		/* r3 = 0 */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel),
+		BPF_EXIT_INSN(),
+	};
+	int fd, insn_cnt = ARRAY_SIZE(insns);
+
+	fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);
+	return probe_fd(fd);
+}
+
+static int probe_prog_bind_map(void)
+{
+	char *cp, errmsg[STRERR_BUFSIZE];
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int ret, map, prog, insn_cnt = ARRAY_SIZE(insns);
+
+	map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL);
+	if (map < 0) {
+		ret = -errno;
+		cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg));
+		pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n",
+			__func__, cp, -ret);
+		return ret;
+	}
+
+	prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL);
+	if (prog < 0) {
+		close(map);
+		return 0;
+	}
+
+	ret = bpf_prog_bind_map(prog, map, NULL);
+
+	close(map);
+	close(prog);
+
+	return ret >= 0;
+}
+
+static int probe_module_btf(void)
+{
+	static const char strs[] = "\0int";
+	__u32 types[] = {
+		/* int */
+		BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4),
+	};
+	struct bpf_btf_info info;
+	__u32 len = sizeof(info);
+	char name[16];
+	int fd, err;
+
+	fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs));
+	if (fd < 0)
+		return 0; /* BTF not supported at all */
+
+	memset(&info, 0, sizeof(info));
+	info.name = ptr_to_u64(name);
+	info.name_len = sizeof(name);
+
+	/* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer;
+	 * kernel's module BTF support coincides with support for
+	 * name/name_len fields in struct bpf_btf_info.
+	 */
+	err = bpf_btf_get_info_by_fd(fd, &info, &len);
+	close(fd);
+	return !err;
+}
+
+static int probe_perf_link(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd, link_fd, err;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL",
+				insns, ARRAY_SIZE(insns), NULL);
+	if (prog_fd < 0)
+		return -errno;
+
+	/* use invalid perf_event FD to get EBADF, if link is supported;
+	 * otherwise EINVAL should be returned
+	 */
+	link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL);
+	err = -errno; /* close() can clobber errno */
+
+	if (link_fd >= 0)
+		close(link_fd);
+	close(prog_fd);
+
+	return link_fd < 0 && err == -EBADF;
+}
+
+static int probe_uprobe_multi_link(void)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, load_opts,
+		.expected_attach_type = BPF_TRACE_UPROBE_MULTI,
+	);
+	LIBBPF_OPTS(bpf_link_create_opts, link_opts);
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd, link_fd, err;
+	unsigned long offset = 0;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL",
+				insns, ARRAY_SIZE(insns), &load_opts);
+	if (prog_fd < 0)
+		return -errno;
+
+	/* Creating uprobe in '/' binary should fail with -EBADF. */
+	link_opts.uprobe_multi.path = "/";
+	link_opts.uprobe_multi.offsets = &offset;
+	link_opts.uprobe_multi.cnt = 1;
+
+	link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts);
+	err = -errno; /* close() can clobber errno */
+
+	if (link_fd >= 0)
+		close(link_fd);
+	close(prog_fd);
+
+	return link_fd < 0 && err == -EBADF;
+}
+
+static int probe_kern_bpf_cookie(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie),
+		BPF_EXIT_INSN(),
+	};
+	int ret, insn_cnt = ARRAY_SIZE(insns);
+
+	ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL);
+	return probe_fd(ret);
+}
+
+static int probe_kern_btf_enum64(void)
+{
+	static const char strs[] = "\0enum64";
+	__u32 types[] = {
+		BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8),
+	};
+
+	return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
+					     strs, sizeof(strs)));
+}
+
+static int probe_kern_syscall_wrapper(void);
+
+enum kern_feature_result {
+	FEAT_UNKNOWN = 0,
+	FEAT_SUPPORTED = 1,
+	FEAT_MISSING = 2,
+};
+
+typedef int (*feature_probe_fn)(void);
+
+static struct kern_feature_desc {
+	const char *desc;
+	feature_probe_fn probe;
+	enum kern_feature_result res;
+} feature_probes[__FEAT_CNT] = {
+	[FEAT_PROG_NAME] = {
+		"BPF program name", probe_kern_prog_name,
+	},
+	[FEAT_GLOBAL_DATA] = {
+		"global variables", probe_kern_global_data,
+	},
+	[FEAT_BTF] = {
+		"minimal BTF", probe_kern_btf,
+	},
+	[FEAT_BTF_FUNC] = {
+		"BTF functions", probe_kern_btf_func,
+	},
+	[FEAT_BTF_GLOBAL_FUNC] = {
+		"BTF global function", probe_kern_btf_func_global,
+	},
+	[FEAT_BTF_DATASEC] = {
+		"BTF data section and variable", probe_kern_btf_datasec,
+	},
+	[FEAT_ARRAY_MMAP] = {
+		"ARRAY map mmap()", probe_kern_array_mmap,
+	},
+	[FEAT_EXP_ATTACH_TYPE] = {
+		"BPF_PROG_LOAD expected_attach_type attribute",
+		probe_kern_exp_attach_type,
+	},
+	[FEAT_PROBE_READ_KERN] = {
+		"bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel,
+	},
+	[FEAT_PROG_BIND_MAP] = {
+		"BPF_PROG_BIND_MAP support", probe_prog_bind_map,
+	},
+	[FEAT_MODULE_BTF] = {
+		"module BTF support", probe_module_btf,
+	},
+	[FEAT_BTF_FLOAT] = {
+		"BTF_KIND_FLOAT support", probe_kern_btf_float,
+	},
+	[FEAT_PERF_LINK] = {
+		"BPF perf link support", probe_perf_link,
+	},
+	[FEAT_BTF_DECL_TAG] = {
+		"BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag,
+	},
+	[FEAT_BTF_TYPE_TAG] = {
+		"BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag,
+	},
+	[FEAT_MEMCG_ACCOUNT] = {
+		"memcg-based memory accounting", probe_memcg_account,
+	},
+	[FEAT_BPF_COOKIE] = {
+		"BPF cookie support", probe_kern_bpf_cookie,
+	},
+	[FEAT_BTF_ENUM64] = {
+		"BTF_KIND_ENUM64 support", probe_kern_btf_enum64,
+	},
+	[FEAT_SYSCALL_WRAPPER] = {
+		"Kernel using syscall wrapper", probe_kern_syscall_wrapper,
+	},
+	[FEAT_UPROBE_MULTI_LINK] = {
+		"BPF multi-uprobe link support", probe_uprobe_multi_link,
+	},
+};
+
 bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
 {
+	struct kern_feature_desc *feat = &feature_probes[feat_id];
+	int ret;
+
 	if (obj && obj->gen_loader)
 		/* To generate loader program assume the latest kernel
 		 * to avoid doing extra prog_load, map_create syscalls.
 		 */
 		return true;
 
-	if (obj->token_fd)
-		return feat_supported(obj->feat_cache, feat_id);
+	if (READ_ONCE(feat->res) == FEAT_UNKNOWN) {
+		ret = feat->probe();
+		if (ret > 0) {
+			WRITE_ONCE(feat->res, FEAT_SUPPORTED);
+		} else if (ret == 0) {
+			WRITE_ONCE(feat->res, FEAT_MISSING);
+		} else {
+			pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret);
+			WRITE_ONCE(feat->res, FEAT_MISSING);
+		}
+	}
 
-	return feat_supported(NULL, feat_id);
+	return READ_ONCE(feat->res) == FEAT_SUPPORTED;
 }
 
 static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd)
@@ -4831,7 +5211,6 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 	create_attr.map_flags = def->map_flags;
 	create_attr.numa_node = map->numa_node;
 	create_attr.map_extra = map->map_extra;
-	create_attr.token_fd = obj->token_fd;
 
 	if (bpf_map__is_struct_ops(map))
 		create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
@@ -6667,7 +7046,6 @@ static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog
 	load_attr.attach_btf_id = prog->attach_btf_id;
 	load_attr.kern_version = kern_version;
 	load_attr.prog_ifindex = prog->prog_ifindex;
-	load_attr.token_fd = obj->token_fd;
 
 	/* specify func_info/line_info only if kernel supports them */
 	btf_fd = bpf_object__btf_fd(obj);
@@ -7129,10 +7507,10 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object
 static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz,
 					  const struct bpf_object_open_opts *opts)
 {
-	const char *obj_name, *kconfig, *btf_tmp_path, *token_path;
+	const char *obj_name, *kconfig, *btf_tmp_path;
 	struct bpf_object *obj;
 	char tmp_name[64];
-	int err, token_fd;
+	int err;
 	char *log_buf;
 	size_t log_size;
 	__u32 log_level;
@@ -7166,28 +7544,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf,
 	if (log_size && !log_buf)
 		return ERR_PTR(-EINVAL);
 
-	token_path = OPTS_GET(opts, bpf_token_path, NULL);
-	token_fd = OPTS_GET(opts, bpf_token_fd, -1);
-	/* non-empty token path can't be combined with invalid token FD */
-	if (token_path && token_path[0] != '\0' && token_fd < 0)
-		return ERR_PTR(-EINVAL);
-	/* empty token path can't be combined with valid token FD */
-	if (token_path && token_path[0] == '\0' && token_fd > 0)
-		return ERR_PTR(-EINVAL);
-	/* if user didn't specify bpf_token_path/bpf_token_fd explicitly,
-	 * check if LIBBPF_BPF_TOKEN_PATH envvar was set and treat it as
-	 * bpf_token_path option
-	 */
-	if (token_fd == 0 && !token_path)
-		token_path = getenv("LIBBPF_BPF_TOKEN_PATH");
-	/* empty token_path is equivalent to invalid token_fd */
-	if (token_path && token_path[0] == '\0') {
-		token_path = NULL;
-		token_fd = -1;
-	}
-	if (token_path && strlen(token_path) >= PATH_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
-
 	obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name);
 	if (IS_ERR(obj))
 		return obj;
@@ -7196,19 +7552,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf,
 	obj->log_size = log_size;
 	obj->log_level = log_level;
 
-	obj->token_fd = token_fd <= 0 ? token_fd : dup_good_fd(token_fd);
-	if (token_fd > 0 && obj->token_fd < 0) {
-		err = -errno;
-		goto out;
-	}
-	if (token_path) {
-		obj->token_path = strdup(token_path);
-		if (!obj->token_path) {
-			err = -ENOMEM;
-			goto out;
-		}
-	}
-
 	btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL);
 	if (btf_tmp_path) {
 		if (strlen(btf_tmp_path) >= PATH_MAX) {
@@ -7719,8 +8062,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
 	if (obj->gen_loader)
 		bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps);
 
-	err = bpf_object_prepare_token(obj);
-	err = err ? : bpf_object__probe_loading(obj);
+	err = bpf_object__probe_loading(obj);
 	err = err ? : bpf_object__load_vmlinux_btf(obj, false);
 	err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
 	err = err ? : bpf_object__sanitize_and_load_btf(obj);
@@ -8257,11 +8599,6 @@ void bpf_object__close(struct bpf_object *obj)
 	}
 	zfree(&obj->programs);
 
-	zfree(&obj->feat_cache);
-	zfree(&obj->token_path);
-	if (obj->token_fd > 0)
-		close(obj->token_fd);
-
 	free(obj);
 }
 
@@ -10275,7 +10612,7 @@ static const char *arch_specific_syscall_pfx(void)
 #endif
 }
 
-int probe_kern_syscall_wrapper(int token_fd)
+static int probe_kern_syscall_wrapper(void)
 {
 	char syscall_name[64];
 	const char *ksys_pfx;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 916904bd2a7a..6cd9c501624f 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -177,45 +177,10 @@ struct bpf_object_open_opts {
 	 * logs through its print callback.
 	 */
 	__u32 kernel_log_level;
-	/* FD of a BPF token instantiated by user through bpf_token_create()
-	 * API. BPF object will keep dup()'ed FD internally, so passed token
-	 * FD can be closed after BPF object/skeleton open step.
-	 *
-	 * Setting bpf_token_fd to negative value disables libbpf's automatic
-	 * attempt to create BPF token from default BPF FS mount point
-	 * (/sys/fs/bpf), in case this default behavior is undesirable.
-	 *
-	 * If bpf_token_path and bpf_token_fd are not specified, libbpf will
-	 * consult LIBBPF_BPF_TOKEN_PATH environment variable. If set, it will
-	 * be taken as a value of bpf_token_path option and will force libbpf
-	 * to either create BPF token from provided custom BPF FS path, or
-	 * will disable implicit BPF token creation, if envvar value is an
-	 * empty string.
-	 *
-	 * bpf_token_path and bpf_token_fd are mutually exclusive and only one
-	 * of those options should be set. Either of them overrides
-	 * LIBBPF_BPF_TOKEN_PATH envvar.
-	 */
-	int bpf_token_fd;
-	/* Path to BPF FS mount point to derive BPF token from.
-	 *
-	 * Created BPF token will be used for all bpf() syscall operations
-	 * that accept BPF token (e.g., map creation, BTF and program loads,
-	 * etc) automatically within instantiated BPF object.
-	 *
-	 * Setting bpf_token_path option to empty string disables libbpf's
-	 * automatic attempt to create BPF token from default BPF FS mount
-	 * point (/sys/fs/bpf), in case this default behavior is undesirable.
-	 *
-	 * bpf_token_path and bpf_token_fd are mutually exclusive and only one
-	 * of those options should be set. Either of them overrides
-	 * LIBBPF_BPF_TOKEN_PATH envvar.
-	 */
-	const char *bpf_token_path;
 
 	size_t :0;
 };
-#define bpf_object_open_opts__last_field bpf_token_path
+#define bpf_object_open_opts__last_field kernel_log_level
 
 /**
  * @brief **bpf_object__open()** creates a bpf_object by opening
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index df7657b65c47..91c5aef7dae7 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -401,7 +401,6 @@ LIBBPF_1.3.0 {
 		bpf_program__attach_netkit;
 		bpf_program__attach_tcx;
 		bpf_program__attach_uprobe_multi;
-		bpf_token_create;
 		ring__avail_data_size;
 		ring__consume;
 		ring__consumer_pos;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 4cda32298c49..b5d334754e5d 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -360,32 +360,15 @@ enum kern_feature_id {
 	__FEAT_CNT,
 };
 
-enum kern_feature_result {
-	FEAT_UNKNOWN = 0,
-	FEAT_SUPPORTED = 1,
-	FEAT_MISSING = 2,
-};
-
-struct kern_feature_cache {
-	enum kern_feature_result res[__FEAT_CNT];
-	int token_fd;
-};
-
-bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id);
+int probe_memcg_account(void);
 bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id);
-
-int probe_kern_syscall_wrapper(int token_fd);
-int probe_memcg_account(int token_fd);
 int bump_rlimit_memlock(void);
 
 int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz);
 int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz);
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
-			 const char *str_sec, size_t str_len,
-			 int token_fd);
-int btf_load_into_kernel(struct btf *btf,
-			 char *log_buf, size_t log_sz, __u32 log_level,
-			 int token_fd);
+			 const char *str_sec, size_t str_len);
+int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level);
 
 struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf);
 void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
@@ -549,17 +532,6 @@ static inline bool is_ldimm64_insn(struct bpf_insn *insn)
 	return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 }
 
-/* Unconditionally dup FD, ensuring it doesn't use [0, 2] range.
- * Original FD is not closed or altered in any other way.
- * Preserves original FD value, if it's invalid (negative).
- */
-static inline int dup_good_fd(int fd)
-{
-	if (fd < 0)
-		return fd;
-	return fcntl(fd, F_DUPFD_CLOEXEC, 3);
-}
-
 /* if fd is stdin, stdout, or stderr, dup to a fd greater than 2
  * Takes ownership of the fd passed in, and closes it if calling
  * fcntl(fd, F_DUPFD_CLOEXEC, 3).
@@ -571,7 +543,7 @@ static inline int ensure_good_fd(int fd)
 	if (fd < 0)
 		return fd;
 	if (fd < 3) {
-		fd = dup_good_fd(fd);
+		fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 		saved_errno = errno;
 		close(old_fd);
 		errno = saved_errno;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 8e7437006639..9c4db90b92b6 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -219,8 +219,7 @@ int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts)
 }
 
 int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
-			 const char *str_sec, size_t str_len,
-			 int token_fd)
+			 const char *str_sec, size_t str_len)
 {
 	struct btf_header hdr = {
 		.magic = BTF_MAGIC,
@@ -230,7 +229,6 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 		.str_off = types_len,
 		.str_len = str_len,
 	};
-	LIBBPF_OPTS(bpf_btf_load_opts, opts, .token_fd = token_fd);
 	int btf_fd, btf_len;
 	__u8 *raw_btf;
 
@@ -243,7 +241,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len,
 	memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len);
 	memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len);
 
-	btf_fd = bpf_btf_load(raw_btf, btf_len, &opts);
+	btf_fd = bpf_btf_load(raw_btf, btf_len, NULL);
 
 	free(raw_btf);
 	return btf_fd;
@@ -273,7 +271,7 @@ static int load_local_storage_btf(void)
 	};
 
 	return libbpf__load_raw_btf((char *)types, sizeof(types),
-				     strs, sizeof(strs), 0);
+				     strs, sizeof(strs));
 }
 
 static int probe_map_create(enum bpf_map_type map_type)
diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h
index 626d7ffb03d6..a139334d57b6 100644
--- a/tools/lib/bpf/str_error.h
+++ b/tools/lib/bpf/str_error.h
@@ -2,8 +2,5 @@
 #ifndef __LIBBPF_STR_ERROR_H
 #define __LIBBPF_STR_ERROR_H
 
-#define STRERR_BUFSIZE  128
-
 char *libbpf_strerror_r(int err, char *dst, int len);
-
 #endif /* __LIBBPF_STR_ERROR_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
index 4ed46ed58a7b..9f766ddd946a 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_probes.c
@@ -30,8 +30,6 @@ void test_libbpf_probe_prog_types(void)
 
 		if (prog_type == BPF_PROG_TYPE_UNSPEC)
 			continue;
-		if (strcmp(prog_type_name, "__MAX_BPF_PROG_TYPE") == 0)
-			continue;
 
 		if (!test__start_subtest(prog_type_name))
 			continue;
@@ -70,8 +68,6 @@ void test_libbpf_probe_map_types(void)
 
 		if (map_type == BPF_MAP_TYPE_UNSPEC)
 			continue;
-		if (strcmp(map_type_name, "__MAX_BPF_MAP_TYPE") == 0)
-			continue;
 
 		if (!test__start_subtest(map_type_name))
 			continue;
diff --git a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
index 62ea855ec4d0..eb34d612d6f8 100644
--- a/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
+++ b/tools/testing/selftests/bpf/prog_tests/libbpf_str.c
@@ -132,9 +132,6 @@ static void test_libbpf_bpf_map_type_str(void)
 		const char *map_type_str;
 		char buf[256];
 
-		if (map_type == __MAX_BPF_MAP_TYPE)
-			continue;
-
 		map_type_name = btf__str_by_offset(btf, e->name_off);
 		map_type_str = libbpf_bpf_map_type_str(map_type);
 		ASSERT_OK_PTR(map_type_str, map_type_name);
@@ -189,9 +186,6 @@ static void test_libbpf_bpf_prog_type_str(void)
 		const char *prog_type_str;
 		char buf[256];
 
-		if (prog_type == __MAX_BPF_PROG_TYPE)
-			continue;
-
 		prog_type_name = btf__str_by_offset(btf, e->name_off);
 		prog_type_str = libbpf_bpf_prog_type_str(prog_type);
 		ASSERT_OK_PTR(prog_type_str, prog_type_name);
diff --git a/tools/testing/selftests/bpf/prog_tests/token.c b/tools/testing/selftests/bpf/prog_tests/token.c
deleted file mode 100644
index b5dce630e0e1..000000000000
--- a/tools/testing/selftests/bpf/prog_tests/token.c
+++ /dev/null
@@ -1,1031 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-#define _GNU_SOURCE
-#include <test_progs.h>
-#include <bpf/btf.h>
-#include "cap_helpers.h"
-#include <fcntl.h>
-#include <sched.h>
-#include <signal.h>
-#include <unistd.h>
-#include <linux/filter.h>
-#include <linux/unistd.h>
-#include <linux/mount.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/syscall.h>
-#include <sys/un.h>
-#include "priv_map.skel.h"
-#include "priv_prog.skel.h"
-#include "dummy_st_ops_success.skel.h"
-
-static inline int sys_mount(const char *dev_name, const char *dir_name,
-			    const char *type, unsigned long flags,
-			    const void *data)
-{
-	return syscall(__NR_mount, dev_name, dir_name, type, flags, data);
-}
-
-static inline int sys_fsopen(const char *fsname, unsigned flags)
-{
-	return syscall(__NR_fsopen, fsname, flags);
-}
-
-static inline int sys_fspick(int dfd, const char *path, unsigned flags)
-{
-	return syscall(__NR_fspick, dfd, path, flags);
-}
-
-static inline int sys_fsconfig(int fs_fd, unsigned cmd, const char *key, const void *val, int aux)
-{
-	return syscall(__NR_fsconfig, fs_fd, cmd, key, val, aux);
-}
-
-static inline int sys_fsmount(int fs_fd, unsigned flags, unsigned ms_flags)
-{
-	return syscall(__NR_fsmount, fs_fd, flags, ms_flags);
-}
-
-static inline int sys_move_mount(int from_dfd, const char *from_path,
-				 int to_dfd, const char *to_path,
-				 unsigned flags)
-{
-	return syscall(__NR_move_mount, from_dfd, from_path, to_dfd, to_path, flags);
-}
-
-static int drop_priv_caps(__u64 *old_caps)
-{
-	return cap_disable_effective((1ULL << CAP_BPF) |
-				     (1ULL << CAP_PERFMON) |
-				     (1ULL << CAP_NET_ADMIN) |
-				     (1ULL << CAP_SYS_ADMIN), old_caps);
-}
-
-static int restore_priv_caps(__u64 old_caps)
-{
-	return cap_enable_effective(old_caps, NULL);
-}
-
-static int set_delegate_mask(int fs_fd, const char *key, __u64 mask, const char *mask_str)
-{
-	char buf[32];
-	int err;
-
-	if (!mask_str) {
-		if (mask == ~0ULL) {
-			mask_str = "any";
-		} else {
-			snprintf(buf, sizeof(buf), "0x%llx", (unsigned long long)mask);
-			mask_str = buf;
-		}
-	}
-
-	err = sys_fsconfig(fs_fd, FSCONFIG_SET_STRING, key,
-			   mask_str, 0);
-	if (err < 0)
-		err = -errno;
-	return err;
-}
-
-#define zclose(fd) do { if (fd >= 0) close(fd); fd = -1; } while (0)
-
-struct bpffs_opts {
-	__u64 cmds;
-	__u64 maps;
-	__u64 progs;
-	__u64 attachs;
-	const char *cmds_str;
-	const char *maps_str;
-	const char *progs_str;
-	const char *attachs_str;
-};
-
-static int create_bpffs_fd(void)
-{
-	int fs_fd;
-
-	/* create VFS context */
-	fs_fd = sys_fsopen("bpf", 0);
-	ASSERT_GE(fs_fd, 0, "fs_fd");
-
-	return fs_fd;
-}
-
-static int materialize_bpffs_fd(int fs_fd, struct bpffs_opts *opts)
-{
-	int mnt_fd, err;
-
-	/* set up token delegation mount options */
-	err = set_delegate_mask(fs_fd, "delegate_cmds", opts->cmds, opts->cmds_str);
-	if (!ASSERT_OK(err, "fs_cfg_cmds"))
-		return err;
-	err = set_delegate_mask(fs_fd, "delegate_maps", opts->maps, opts->maps_str);
-	if (!ASSERT_OK(err, "fs_cfg_maps"))
-		return err;
-	err = set_delegate_mask(fs_fd, "delegate_progs", opts->progs, opts->progs_str);
-	if (!ASSERT_OK(err, "fs_cfg_progs"))
-		return err;
-	err = set_delegate_mask(fs_fd, "delegate_attachs", opts->attachs, opts->attachs_str);
-	if (!ASSERT_OK(err, "fs_cfg_attachs"))
-		return err;
-
-	/* instantiate FS object */
-	err = sys_fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
-	if (err < 0)
-		return -errno;
-
-	/* create O_PATH fd for detached mount */
-	mnt_fd = sys_fsmount(fs_fd, 0, 0);
-	if (err < 0)
-		return -errno;
-
-	return mnt_fd;
-}
-
-/* send FD over Unix domain (AF_UNIX) socket */
-static int sendfd(int sockfd, int fd)
-{
-	struct msghdr msg = {};
-	struct cmsghdr *cmsg;
-	int fds[1] = { fd }, err;
-	char iobuf[1];
-	struct iovec io = {
-		.iov_base = iobuf,
-		.iov_len = sizeof(iobuf),
-	};
-	union {
-		char buf[CMSG_SPACE(sizeof(fds))];
-		struct cmsghdr align;
-	} u;
-
-	msg.msg_iov = &io;
-	msg.msg_iovlen = 1;
-	msg.msg_control = u.buf;
-	msg.msg_controllen = sizeof(u.buf);
-	cmsg = CMSG_FIRSTHDR(&msg);
-	cmsg->cmsg_level = SOL_SOCKET;
-	cmsg->cmsg_type = SCM_RIGHTS;
-	cmsg->cmsg_len = CMSG_LEN(sizeof(fds));
-	memcpy(CMSG_DATA(cmsg), fds, sizeof(fds));
-
-	err = sendmsg(sockfd, &msg, 0);
-	if (err < 0)
-		err = -errno;
-	if (!ASSERT_EQ(err, 1, "sendmsg"))
-		return -EINVAL;
-
-	return 0;
-}
-
-/* receive FD over Unix domain (AF_UNIX) socket */
-static int recvfd(int sockfd, int *fd)
-{
-	struct msghdr msg = {};
-	struct cmsghdr *cmsg;
-	int fds[1], err;
-	char iobuf[1];
-	struct iovec io = {
-		.iov_base = iobuf,
-		.iov_len = sizeof(iobuf),
-	};
-	union {
-		char buf[CMSG_SPACE(sizeof(fds))];
-		struct cmsghdr align;
-	} u;
-
-	msg.msg_iov = &io;
-	msg.msg_iovlen = 1;
-	msg.msg_control = u.buf;
-	msg.msg_controllen = sizeof(u.buf);
-
-	err = recvmsg(sockfd, &msg, 0);
-	if (err < 0)
-		err = -errno;
-	if (!ASSERT_EQ(err, 1, "recvmsg"))
-		return -EINVAL;
-
-	cmsg = CMSG_FIRSTHDR(&msg);
-	if (!ASSERT_OK_PTR(cmsg, "cmsg_null") ||
-	    !ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(fds)), "cmsg_len") ||
-	    !ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET, "cmsg_level") ||
-	    !ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS, "cmsg_type"))
-		return -EINVAL;
-
-	memcpy(fds, CMSG_DATA(cmsg), sizeof(fds));
-	*fd = fds[0];
-
-	return 0;
-}
-
-static ssize_t write_nointr(int fd, const void *buf, size_t count)
-{
-	ssize_t ret;
-
-	do {
-		ret = write(fd, buf, count);
-	} while (ret < 0 && errno == EINTR);
-
-	return ret;
-}
-
-static int write_file(const char *path, const void *buf, size_t count)
-{
-	int fd;
-	ssize_t ret;
-
-	fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
-	if (fd < 0)
-		return -1;
-
-	ret = write_nointr(fd, buf, count);
-	close(fd);
-	if (ret < 0 || (size_t)ret != count)
-		return -1;
-
-	return 0;
-}
-
-static int create_and_enter_userns(void)
-{
-	uid_t uid;
-	gid_t gid;
-	char map[100];
-
-	uid = getuid();
-	gid = getgid();
-
-	if (unshare(CLONE_NEWUSER))
-		return -1;
-
-	if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
-	    errno != ENOENT)
-		return -1;
-
-	snprintf(map, sizeof(map), "0 %d 1", uid);
-	if (write_file("/proc/self/uid_map", map, strlen(map)))
-		return -1;
-
-
-	snprintf(map, sizeof(map), "0 %d 1", gid);
-	if (write_file("/proc/self/gid_map", map, strlen(map)))
-		return -1;
-
-	if (setgid(0))
-		return -1;
-
-	if (setuid(0))
-		return -1;
-
-	return 0;
-}
-
-typedef int (*child_callback_fn)(int);
-
-static void child(int sock_fd, struct bpffs_opts *opts, child_callback_fn callback)
-{
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts);
-	int mnt_fd = -1, fs_fd = -1, err = 0, bpffs_fd = -1;
-
-	/* setup userns with root mappings */
-	err = create_and_enter_userns();
-	if (!ASSERT_OK(err, "create_and_enter_userns"))
-		goto cleanup;
-
-	/* setup mountns to allow creating BPF FS (fsopen("bpf")) from unpriv process */
-	err = unshare(CLONE_NEWNS);
-	if (!ASSERT_OK(err, "create_mountns"))
-		goto cleanup;
-
-	err = sys_mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
-	if (!ASSERT_OK(err, "remount_root"))
-		goto cleanup;
-
-	fs_fd = create_bpffs_fd();
-	if (!ASSERT_GE(fs_fd, 0, "create_bpffs_fd")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* ensure unprivileged child cannot set delegation options */
-	err = set_delegate_mask(fs_fd, "delegate_cmds", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_cmd_eperm");
-	err = set_delegate_mask(fs_fd, "delegate_maps", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_maps_eperm");
-	err = set_delegate_mask(fs_fd, "delegate_progs", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_progs_eperm");
-	err = set_delegate_mask(fs_fd, "delegate_attachs", 0x1, NULL);
-	ASSERT_EQ(err, -EPERM, "delegate_attachs_eperm");
-
-	/* pass BPF FS context object to parent */
-	err = sendfd(sock_fd, fs_fd);
-	if (!ASSERT_OK(err, "send_fs_fd"))
-		goto cleanup;
-	zclose(fs_fd);
-
-	/* avoid mucking around with mount namespaces and mounting at
-	 * well-known path, just get detach-mounted BPF FS fd back from parent
-	 */
-	err = recvfd(sock_fd, &mnt_fd);
-	if (!ASSERT_OK(err, "recv_mnt_fd"))
-		goto cleanup;
-
-	/* try to fspick() BPF FS and try to add some delegation options */
-	fs_fd = sys_fspick(mnt_fd, "", FSPICK_EMPTY_PATH);
-	if (!ASSERT_GE(fs_fd, 0, "bpffs_fspick")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* ensure unprivileged child cannot reconfigure to set delegation options */
-	err = set_delegate_mask(fs_fd, "delegate_cmds", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_cmd_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	err = set_delegate_mask(fs_fd, "delegate_maps", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_maps_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	err = set_delegate_mask(fs_fd, "delegate_progs", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_progs_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	err = set_delegate_mask(fs_fd, "delegate_attachs", 0, "any");
-	if (!ASSERT_EQ(err, -EPERM, "delegate_attachs_eperm_reconfig")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	zclose(fs_fd);
-
-	bpffs_fd = openat(mnt_fd, ".", 0, O_RDWR);
-	if (!ASSERT_GE(bpffs_fd, 0, "bpffs_open")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* do custom test logic with customly set up BPF FS instance */
-	err = callback(bpffs_fd);
-	if (!ASSERT_OK(err, "test_callback"))
-		goto cleanup;
-
-	err = 0;
-cleanup:
-	zclose(sock_fd);
-	zclose(mnt_fd);
-	zclose(fs_fd);
-	zclose(bpffs_fd);
-
-	exit(-err);
-}
-
-static int wait_for_pid(pid_t pid)
-{
-	int status, ret;
-
-again:
-	ret = waitpid(pid, &status, 0);
-	if (ret == -1) {
-		if (errno == EINTR)
-			goto again;
-
-		return -1;
-	}
-
-	if (!WIFEXITED(status))
-		return -1;
-
-	return WEXITSTATUS(status);
-}
-
-static void parent(int child_pid, struct bpffs_opts *bpffs_opts, int sock_fd)
-{
-	int fs_fd = -1, mnt_fd = -1, err;
-
-	err = recvfd(sock_fd, &fs_fd);
-	if (!ASSERT_OK(err, "recv_bpffs_fd"))
-		goto cleanup;
-
-	mnt_fd = materialize_bpffs_fd(fs_fd, bpffs_opts);
-	if (!ASSERT_GE(mnt_fd, 0, "materialize_bpffs_fd")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-	zclose(fs_fd);
-
-	/* pass BPF FS context object to parent */
-	err = sendfd(sock_fd, mnt_fd);
-	if (!ASSERT_OK(err, "send_mnt_fd"))
-		goto cleanup;
-	zclose(mnt_fd);
-
-	err = wait_for_pid(child_pid);
-	ASSERT_OK(err, "waitpid_child");
-
-cleanup:
-	zclose(sock_fd);
-	zclose(fs_fd);
-	zclose(mnt_fd);
-
-	if (child_pid > 0)
-		(void)kill(child_pid, SIGKILL);
-}
-
-static void subtest_userns(struct bpffs_opts *bpffs_opts, child_callback_fn cb)
-{
-	int sock_fds[2] = { -1, -1 };
-	int child_pid = 0, err;
-
-	err = socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds);
-	if (!ASSERT_OK(err, "socketpair"))
-		goto cleanup;
-
-	child_pid = fork();
-	if (!ASSERT_GE(child_pid, 0, "fork"))
-		goto cleanup;
-
-	if (child_pid == 0) {
-		zclose(sock_fds[0]);
-		return child(sock_fds[1], bpffs_opts, cb);
-
-	} else {
-		zclose(sock_fds[1]);
-		return parent(child_pid, bpffs_opts, sock_fds[0]);
-	}
-
-cleanup:
-	zclose(sock_fds[0]);
-	zclose(sock_fds[1]);
-	if (child_pid > 0)
-		(void)kill(child_pid, SIGKILL);
-}
-
-static int userns_map_create(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_map_create_opts, map_opts);
-	int err, token_fd = -1, map_fd = -1;
-	__u64 old_caps = 0;
-
-	/* create BPF token from BPF FS mount */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "token_create")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* while inside non-init userns, we need both a BPF token *and*
-	 * CAP_BPF inside current userns to create privileged map; let's test
-	 * that neither BPF token alone nor namespaced CAP_BPF is sufficient
-	 */
-	err = drop_priv_caps(&old_caps);
-	if (!ASSERT_OK(err, "drop_caps"))
-		goto cleanup;
-
-	/* no token, no CAP_BPF -> fail */
-	map_opts.token_fd = 0;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "wo_token_wo_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_LT(map_fd, 0, "stack_map_wo_token_wo_cap_bpf_should_fail")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* token without CAP_BPF -> fail */
-	map_opts.token_fd = token_fd;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "w_token_wo_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_LT(map_fd, 0, "stack_map_w_token_wo_cap_bpf_should_fail")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* get back effective local CAP_BPF (and CAP_SYS_ADMIN) */
-	err = restore_priv_caps(old_caps);
-	if (!ASSERT_OK(err, "restore_caps"))
-		goto cleanup;
-
-	/* CAP_BPF without token -> fail */
-	map_opts.token_fd = 0;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "wo_token_w_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_LT(map_fd, 0, "stack_map_wo_token_w_cap_bpf_should_fail")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* finally, namespaced CAP_BPF + token -> success */
-	map_opts.token_fd = token_fd;
-	map_fd = bpf_map_create(BPF_MAP_TYPE_STACK, "w_token_w_bpf", 0, 8, 1, &map_opts);
-	if (!ASSERT_GT(map_fd, 0, "stack_map_w_token_w_cap_bpf")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-cleanup:
-	zclose(token_fd);
-	zclose(map_fd);
-	return err;
-}
-
-static int userns_btf_load(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_btf_load_opts, btf_opts);
-	int err, token_fd = -1, btf_fd = -1;
-	const void *raw_btf_data;
-	struct btf *btf = NULL;
-	__u32 raw_btf_size;
-	__u64 old_caps = 0;
-
-	/* create BPF token from BPF FS mount */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "token_create")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* while inside non-init userns, we need both a BPF token *and*
-	 * CAP_BPF inside current userns to create privileged map; let's test
-	 * that neither BPF token alone nor namespaced CAP_BPF is sufficient
-	 */
-	err = drop_priv_caps(&old_caps);
-	if (!ASSERT_OK(err, "drop_caps"))
-		goto cleanup;
-
-	/* setup a trivial BTF data to load to the kernel */
-	btf = btf__new_empty();
-	if (!ASSERT_OK_PTR(btf, "empty_btf"))
-		goto cleanup;
-
-	ASSERT_GT(btf__add_int(btf, "int", 4, 0), 0, "int_type");
-
-	raw_btf_data = btf__raw_data(btf, &raw_btf_size);
-	if (!ASSERT_OK_PTR(raw_btf_data, "raw_btf_data"))
-		goto cleanup;
-
-	/* no token + no CAP_BPF -> failure */
-	btf_opts.token_fd = 0;
-	btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts);
-	if (!ASSERT_LT(btf_fd, 0, "no_token_no_cap_should_fail"))
-		goto cleanup;
-
-	/* token + no CAP_BPF -> failure */
-	btf_opts.token_fd = token_fd;
-	btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts);
-	if (!ASSERT_LT(btf_fd, 0, "token_no_cap_should_fail"))
-		goto cleanup;
-
-	/* get back effective local CAP_BPF (and CAP_SYS_ADMIN) */
-	err = restore_priv_caps(old_caps);
-	if (!ASSERT_OK(err, "restore_caps"))
-		goto cleanup;
-
-	/* token + CAP_BPF -> success */
-	btf_opts.token_fd = token_fd;
-	btf_fd = bpf_btf_load(raw_btf_data, raw_btf_size, &btf_opts);
-	if (!ASSERT_GT(btf_fd, 0, "token_and_cap_success"))
-		goto cleanup;
-
-	err = 0;
-cleanup:
-	btf__free(btf);
-	zclose(btf_fd);
-	zclose(token_fd);
-	return err;
-}
-
-static int userns_prog_load(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts);
-	int err, token_fd = -1, prog_fd = -1;
-	struct bpf_insn insns[] = {
-		/* bpf_jiffies64() requires CAP_BPF */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
-		/* bpf_get_current_task() requires CAP_PERFMON */
-		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_current_task),
-		/* r0 = 0; exit; */
-		BPF_MOV64_IMM(BPF_REG_0, 0),
-		BPF_EXIT_INSN(),
-	};
-	size_t insn_cnt = ARRAY_SIZE(insns);
-	__u64 old_caps = 0;
-
-	/* create BPF token from BPF FS mount */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "token_create")) {
-		err = -EINVAL;
-		goto cleanup;
-	}
-
-	/* validate we can successfully load BPF program with token; this
-	 * being XDP program (CAP_NET_ADMIN) using bpf_jiffies64() (CAP_BPF)
-	 * and bpf_get_current_task() (CAP_PERFMON) helpers validates we have
-	 * BPF token wired properly in a bunch of places in the kernel
-	 */
-	prog_opts.token_fd = token_fd;
-	prog_opts.expected_attach_type = BPF_XDP;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_GT(prog_fd, 0, "prog_fd")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	/* no token + caps -> failure */
-	prog_opts.token_fd = 0;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	err = drop_priv_caps(&old_caps);
-	if (!ASSERT_OK(err, "drop_caps"))
-		goto cleanup;
-
-	/* no caps + token -> failure */
-	prog_opts.token_fd = token_fd;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	/* no caps + no token -> definitely a failure */
-	prog_opts.token_fd = 0;
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, "token_prog", "GPL",
-				insns, insn_cnt, &prog_opts);
-	if (!ASSERT_EQ(prog_fd, -EPERM, "prog_fd_eperm")) {
-		err = -EPERM;
-		goto cleanup;
-	}
-
-	err = 0;
-cleanup:
-	zclose(prog_fd);
-	zclose(token_fd);
-	return err;
-}
-
-static int userns_obj_priv_map(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	char buf[256];
-	struct priv_map *skel;
-	int err, token_fd;
-
-	skel = priv_map__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		priv_map__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* use bpf_token_path to provide BPF FS path */
-	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd);
-	opts.bpf_token_path = buf;
-	skel = priv_map__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_path_open"))
-		return -EINVAL;
-
-	err = priv_map__load(skel);
-	priv_map__destroy(skel);
-	if (!ASSERT_OK(err, "obj_token_path_load"))
-		return -EINVAL;
-
-	/* create token and pass it through bpf_token_fd */
-	token_fd = bpf_token_create(mnt_fd, NULL);
-	if (!ASSERT_GT(token_fd, 0, "create_token"))
-		return -EINVAL;
-
-	opts.bpf_token_path = NULL;
-	opts.bpf_token_fd = token_fd;
-	skel = priv_map__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_fd_open"))
-		return -EINVAL;
-
-	/* we can close our token FD, bpf_object owns dup()'ed FD now */
-	close(token_fd);
-
-	err = priv_map__load(skel);
-	priv_map__destroy(skel);
-	if (!ASSERT_OK(err, "obj_token_fd_load"))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int userns_obj_priv_prog(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	char buf[256];
-	struct priv_prog *skel;
-	int err;
-
-	skel = priv_prog__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		priv_prog__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* use bpf_token_path to provide BPF FS path */
-	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd);
-	opts.bpf_token_path = buf;
-	skel = priv_prog__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_path_open"))
-		return -EINVAL;
-
-	err = priv_prog__load(skel);
-	priv_prog__destroy(skel);
-	if (!ASSERT_OK(err, "obj_token_path_load"))
-		return -EINVAL;
-
-	return 0;
-}
-
-/* this test is called with BPF FS that doesn't delegate BPF_BTF_LOAD command,
- * which should cause struct_ops application to fail, as BTF won't be uploaded
- * into the kernel, even if STRUCT_OPS programs themselves are allowed
- */
-static int validate_struct_ops_load(int mnt_fd, bool expect_success)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	char buf[256];
-	struct dummy_st_ops_success *skel;
-	int err;
-
-	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", mnt_fd);
-	opts.bpf_token_path = buf;
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_token_path_open"))
-		return -EINVAL;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (expect_success) {
-		if (!ASSERT_OK(err, "obj_token_path_load"))
-			return -EINVAL;
-	} else /* expect failure */ {
-		if (!ASSERT_ERR(err, "obj_token_path_load"))
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int userns_obj_priv_btf_fail(int mnt_fd)
-{
-	return validate_struct_ops_load(mnt_fd, false /* should fail */);
-}
-
-static int userns_obj_priv_btf_success(int mnt_fd)
-{
-	return validate_struct_ops_load(mnt_fd, true /* should succeed */);
-}
-
-#define TOKEN_ENVVAR "LIBBPF_BPF_TOKEN_PATH"
-#define TOKEN_BPFFS_CUSTOM "/bpf-token-fs"
-
-static int userns_obj_priv_implicit_token(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct dummy_st_ops_success *skel;
-	int err;
-
-	/* before we mount BPF FS with token delegation, struct_ops skeleton
-	 * should fail to load
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		dummy_st_ops_success__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* mount custom BPF FS over /sys/fs/bpf so that libbpf can create BPF
-	 * token automatically and implicitly
-	 */
-	err = sys_move_mount(mnt_fd, "", AT_FDCWD, "/sys/fs/bpf", MOVE_MOUNT_F_EMPTY_PATH);
-	if (!ASSERT_OK(err, "move_mount_bpffs"))
-		return -EINVAL;
-
-	/* disable implicit BPF token creation by setting
-	 * LIBBPF_BPF_TOKEN_PATH envvar to empty value, load should fail
-	 */
-	err = setenv(TOKEN_ENVVAR, "", 1 /*overwrite*/);
-	if (!ASSERT_OK(err, "setenv_token_path"))
-		return -EINVAL;
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_token_envvar_disabled_load")) {
-		unsetenv(TOKEN_ENVVAR);
-		dummy_st_ops_success__destroy(skel);
-		return -EINVAL;
-	}
-	unsetenv(TOKEN_ENVVAR);
-
-	/* now the same struct_ops skeleton should succeed thanks to libppf
-	 * creating BPF token from /sys/fs/bpf mount point
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "obj_implicit_token_load"))
-		return -EINVAL;
-
-	dummy_st_ops_success__destroy(skel);
-
-	/* now disable implicit token through empty bpf_token_path, should fail */
-	opts.bpf_token_path = "";
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_empty_token_path_open"))
-		return -EINVAL;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_empty_token_path_load"))
-		return -EINVAL;
-
-	/* now disable implicit token through negative bpf_token_fd, should fail */
-	opts.bpf_token_path = NULL;
-	opts.bpf_token_fd = -1;
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_neg_token_fd_open"))
-		return -EINVAL;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_neg_token_fd_load"))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int userns_obj_priv_implicit_token_envvar(int mnt_fd)
-{
-	LIBBPF_OPTS(bpf_object_open_opts, opts);
-	struct dummy_st_ops_success *skel;
-	int err;
-
-	/* before we mount BPF FS with token delegation, struct_ops skeleton
-	 * should fail to load
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load")) {
-		dummy_st_ops_success__destroy(skel);
-		return -EINVAL;
-	}
-
-	/* mount custom BPF FS over custom location, so libbpf can't create
-	 * BPF token implicitly, unless pointed to it through
-	 * LIBBPF_BPF_TOKEN_PATH envvar
-	 */
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	if (!ASSERT_OK(mkdir(TOKEN_BPFFS_CUSTOM, 0777), "mkdir_bpffs_custom"))
-		goto err_out;
-	err = sys_move_mount(mnt_fd, "", AT_FDCWD, TOKEN_BPFFS_CUSTOM, MOVE_MOUNT_F_EMPTY_PATH);
-	if (!ASSERT_OK(err, "move_mount_bpffs"))
-		goto err_out;
-
-	/* even though we have BPF FS with delegation, it's not at default
-	 * /sys/fs/bpf location, so we still fail to load until envvar is set up
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_ERR_PTR(skel, "obj_tokenless_load2")) {
-		dummy_st_ops_success__destroy(skel);
-		goto err_out;
-	}
-
-	err = setenv(TOKEN_ENVVAR, TOKEN_BPFFS_CUSTOM, 1 /*overwrite*/);
-	if (!ASSERT_OK(err, "setenv_token_path"))
-		goto err_out;
-
-	/* now the same struct_ops skeleton should succeed thanks to libppf
-	 * creating BPF token from custom mount point
-	 */
-	skel = dummy_st_ops_success__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "obj_implicit_token_load"))
-		goto err_out;
-
-	dummy_st_ops_success__destroy(skel);
-
-	/* now disable implicit token through empty bpf_token_path, envvar
-	 * will be ignored, should fail
-	 */
-	opts.bpf_token_path = "";
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_empty_token_path_open"))
-		goto err_out;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_empty_token_path_load"))
-		goto err_out;
-
-	/* now disable implicit token through negative bpf_token_fd, envvar
-	 * will be ignored, should fail
-	 */
-	opts.bpf_token_path = NULL;
-	opts.bpf_token_fd = -1;
-	skel = dummy_st_ops_success__open_opts(&opts);
-	if (!ASSERT_OK_PTR(skel, "obj_neg_token_fd_open"))
-		goto err_out;
-
-	err = dummy_st_ops_success__load(skel);
-	dummy_st_ops_success__destroy(skel);
-	if (!ASSERT_ERR(err, "obj_neg_token_fd_load"))
-		goto err_out;
-
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	unsetenv(TOKEN_ENVVAR);
-	return 0;
-err_out:
-	rmdir(TOKEN_BPFFS_CUSTOM);
-	unsetenv(TOKEN_ENVVAR);
-	return -EINVAL;
-}
-
-#define bit(n) (1ULL << (n))
-
-void test_token(void)
-{
-	if (test__start_subtest("map_token")) {
-		struct bpffs_opts opts = {
-			.cmds_str = "map_create",
-			.maps_str = "stack",
-		};
-
-		subtest_userns(&opts, userns_map_create);
-	}
-	if (test__start_subtest("btf_token")) {
-		struct bpffs_opts opts = {
-			.cmds = 1ULL << BPF_BTF_LOAD,
-		};
-
-		subtest_userns(&opts, userns_btf_load);
-	}
-	if (test__start_subtest("prog_token")) {
-		struct bpffs_opts opts = {
-			.cmds_str = "PROG_LOAD",
-			.progs_str = "XDP",
-			.attachs_str = "xdp",
-		};
-
-		subtest_userns(&opts, userns_prog_load);
-	}
-	if (test__start_subtest("obj_priv_map")) {
-		struct bpffs_opts opts = {
-			.cmds = bit(BPF_MAP_CREATE),
-			.maps = bit(BPF_MAP_TYPE_QUEUE),
-		};
-
-		subtest_userns(&opts, userns_obj_priv_map);
-	}
-	if (test__start_subtest("obj_priv_prog")) {
-		struct bpffs_opts opts = {
-			.cmds = bit(BPF_PROG_LOAD),
-			.progs = bit(BPF_PROG_TYPE_KPROBE),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_prog);
-	}
-	if (test__start_subtest("obj_priv_btf_fail")) {
-		struct bpffs_opts opts = {
-			/* disallow BTF loading */
-			.cmds = bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_btf_fail);
-	}
-	if (test__start_subtest("obj_priv_btf_success")) {
-		struct bpffs_opts opts = {
-			/* allow BTF loading */
-			.cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_btf_success);
-	}
-	if (test__start_subtest("obj_priv_implicit_token")) {
-		struct bpffs_opts opts = {
-			/* allow BTF loading */
-			.cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_implicit_token);
-	}
-	if (test__start_subtest("obj_priv_implicit_token_envvar")) {
-		struct bpffs_opts opts = {
-			/* allow BTF loading */
-			.cmds = bit(BPF_BTF_LOAD) | bit(BPF_MAP_CREATE) | bit(BPF_PROG_LOAD),
-			.maps = bit(BPF_MAP_TYPE_STRUCT_OPS),
-			.progs = bit(BPF_PROG_TYPE_STRUCT_OPS),
-			.attachs = ~0ULL,
-		};
-
-		subtest_userns(&opts, userns_obj_priv_implicit_token_envvar);
-	}
-}
diff --git a/tools/testing/selftests/bpf/progs/priv_map.c b/tools/testing/selftests/bpf/progs/priv_map.c
deleted file mode 100644
index 9085be50f03b..000000000000
--- a/tools/testing/selftests/bpf/progs/priv_map.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-
-char _license[] SEC("license") = "GPL";
-
-struct {
-	__uint(type, BPF_MAP_TYPE_QUEUE);
-	__uint(max_entries, 1);
-	__type(value, __u32);
-} priv_map SEC(".maps");
diff --git a/tools/testing/selftests/bpf/progs/priv_prog.c b/tools/testing/selftests/bpf/progs/priv_prog.c
deleted file mode 100644
index 3c7b2b618c8a..000000000000
--- a/tools/testing/selftests/bpf/progs/priv_prog.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */
-
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-
-char _license[] SEC("license") = "GPL";
-
-SEC("kprobe")
-int kprobe_prog(void *ctx)
-{
-	return 1;
-}
-- 
cgit v1.2.3


From 489c693bd04a2308865dc50f37bd0b5f6ad52deb Mon Sep 17 00:00:00 2001
From: Chen Haonan <chen.haonan2@zte.com.cn>
Date: Tue, 19 Dec 2023 21:06:25 +0800
Subject: PM: hibernate: Use kmap_local_page() in copy_data_page()

kmap_atomic() has been deprecated in favor of kmap_local_page().

kmap_atomic() disables page-faults and preemption (the latter
only for !PREEMPT_RT kernels).The code between the mapping and
un-mapping in this patch does not depend on the above-mentioned
side effects.So simply replaced kmap_atomic() with kmap_local_page().

Signed-off-by: Chen Haonan <chen.haonan2@zte.com.cn>
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index e3e8f1c6e75f..5c96ff067c64 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 	s_page = pfn_to_page(src_pfn);
 	d_page = pfn_to_page(dst_pfn);
 	if (PageHighMem(s_page)) {
-		src = kmap_atomic(s_page);
-		dst = kmap_atomic(d_page);
+		src = kmap_local_page(s_page);
+		dst = kmap_local_page(d_page);
 		zeros_only = do_copy_page(dst, src);
-		kunmap_atomic(dst);
-		kunmap_atomic(src);
+		kunmap_local(dst);
+		kunmap_local(src);
 	} else {
 		if (PageHighMem(d_page)) {
 			/*
@@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
 			 * data modified by kmap_atomic()
 			 */
 			zeros_only = safe_copy_page(buffer, s_page);
-			dst = kmap_atomic(d_page);
+			dst = kmap_local_page(d_page);
 			copy_page(dst, buffer);
-			kunmap_atomic(dst);
+			kunmap_local(dst);
 		} else {
 			zeros_only = safe_copy_page(page_address(d_page), s_page);
 		}
-- 
cgit v1.2.3


From f17f2c13d613cbeef529b03ca17ae2581b2e6cb8 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Fri, 8 Dec 2023 16:29:34 +0800
Subject: module: Remove redundant TASK_UNINTERRUPTIBLE

TASK_KILLABLE already includes TASK_UNINTERRUPTIBLE, so there is no
need to add a separate TASK_UNINTERRUPTIBLE.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 kernel/module/dups.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module/dups.c b/kernel/module/dups.c
index f3d7ea1e96d8..9a92f2f8c9d3 100644
--- a/kernel/module/dups.c
+++ b/kernel/module/dups.c
@@ -207,7 +207,7 @@ bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
 	 * optimization enabled ...
 	 */
 	ret = wait_for_completion_state(&kmod_req->first_req_done,
-					TASK_UNINTERRUPTIBLE | TASK_KILLABLE);
+					TASK_KILLABLE);
 	if (ret) {
 		*dup_ret = ret;
 		return true;
-- 
cgit v1.2.3


From d028f87517d6775dccff4ddbca2740826f9e53f1 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Tue, 19 Dec 2023 21:47:57 +0800
Subject: bpf: make the verifier tracks the "not equal" for regs

We can derive some new information for BPF_JNE in regs_refine_cond_op().
Take following code for example:

  /* The type of "a" is u32 */
  if (a > 0 && a < 100) {
    /* the range of the register for a is [0, 99], not [1, 99],
     * and will cause the following error:
     *
     *   invalid zero-sized read
     *
     * as a can be 0.
     */
    bpf_skb_store_bytes(skb, xx, xx, a, 0);
  }

In the code above, "a > 0" will be compiled to "jmp xxx if a == 0". In the
TRUE branch, the dst_reg will be marked as known to 0. However, in the
fallthrough(FALSE) branch, the dst_reg will not be handled, which makes
the [min, max] for a is [0, 99], not [1, 99].

For BPF_JNE, we can reduce the range of the dst reg if the src reg is a
const and is exactly the edge of the dst reg.

Signed-off-by: Menglong Dong <menglong8.dong@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Link: https://lore.kernel.org/r/20231219134800.1550388-2-menglong8.dong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4ceec8c2a484..df1cae459c77 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14336,7 +14336,43 @@ again:
 		}
 		break;
 	case BPF_JNE:
-		/* we don't derive any new information for inequality yet */
+		if (!is_reg_const(reg2, is_jmp32))
+			swap(reg1, reg2);
+		if (!is_reg_const(reg2, is_jmp32))
+			break;
+
+		/* try to recompute the bound of reg1 if reg2 is a const and
+		 * is exactly the edge of reg1.
+		 */
+		val = reg_const_value(reg2, is_jmp32);
+		if (is_jmp32) {
+			/* u32_min_value is not equal to 0xffffffff at this point,
+			 * because otherwise u32_max_value is 0xffffffff as well,
+			 * in such a case both reg1 and reg2 would be constants,
+			 * jump would be predicted and reg_set_min_max() won't
+			 * be called.
+			 *
+			 * Same reasoning works for all {u,s}{min,max}{32,64} cases
+			 * below.
+			 */
+			if (reg1->u32_min_value == (u32)val)
+				reg1->u32_min_value++;
+			if (reg1->u32_max_value == (u32)val)
+				reg1->u32_max_value--;
+			if (reg1->s32_min_value == (s32)val)
+				reg1->s32_min_value++;
+			if (reg1->s32_max_value == (s32)val)
+				reg1->s32_max_value--;
+		} else {
+			if (reg1->umin_value == (u64)val)
+				reg1->umin_value++;
+			if (reg1->umax_value == (u64)val)
+				reg1->umax_value--;
+			if (reg1->smin_value == (s64)val)
+				reg1->smin_value++;
+			if (reg1->smax_value == (s64)val)
+				reg1->smax_value--;
+		}
 		break;
 	case BPF_JSET:
 		if (!is_reg_const(reg2, is_jmp32))
-- 
cgit v1.2.3


From 4ba1d0f23414135e4f426dae4cb5cdc2ce246f89 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:25 -0800
Subject: bpf: abstract away global subprog arg preparation logic from reg
 state setup

btf_prepare_func_args() is used to understand expectations and
restrictions on global subprog arguments. But current implementation is
hard to extend, as it intermixes BTF-based func prototype parsing and
interpretation logic with setting up register state at subprog entry.

Worse still, those registers are not completely set up inside
btf_prepare_func_args(), requiring some more logic later in
do_check_common(). Like calling mark_reg_unknown() and similar
initialization operations.

This intermixing of BTF interpretation and register state setup is
problematic. First, it causes duplication of BTF parsing logic for global
subprog verification (to set up initial state of global subprog) and
global subprog call sites analysis (when we need to check that whatever
is being passed into global subprog matches expectations), performed in
btf_check_subprog_call().

Given we want to extend global func argument with tags later, this
duplication is problematic. So refactor btf_prepare_func_args() to do
only BTF-based func proto and args parsing, returning high-level
argument "expectations" only, with no regard to specifics of register
state. I.e., if it's a context argument, instead of setting register
state to PTR_TO_CTX, we return ARG_PTR_TO_CTX enum for that argument as
"an argument specification" for further processing inside
do_check_common(). Similarly for SCALAR arguments, PTR_TO_MEM, etc.

This allows to reuse btf_prepare_func_args() in following patches at
global subprog call site analysis time. It also keeps register setup
code consistently in one place, do_check_common().

Besides all this, we cache this argument specs information inside
env->subprog_info, eliminating the need to redo these potentially
expensive BTF traversals, especially if BPF program's BTF is big and/or
there are lots of global subprog calls.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  3 +--
 include/linux/bpf_verifier.h | 16 ++++++++++++++++
 kernel/bpf/btf.c             | 38 ++++++++++++++++++++------------------
 kernel/bpf/verifier.c        | 43 +++++++++++++++++++++++++++----------------
 4 files changed, 64 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7a8d4c81a39a..c050c82cc9a5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2470,8 +2470,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *reg, u32 *nargs);
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
 const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c2819a6579a5..5742e9c0a7b8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -606,6 +606,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 #define BPF_MAX_SUBPROGS 256
 
+struct bpf_subprog_arg_info {
+	enum bpf_arg_type arg_type;
+	union {
+		u32 mem_size;
+	};
+};
+
 struct bpf_subprog_info {
 	/* 'start' has to be the first field otherwise find_subprog() won't work */
 	u32 start; /* insn idx of function entry point */
@@ -617,6 +624,10 @@ struct bpf_subprog_info {
 	bool is_cb: 1;
 	bool is_async_cb: 1;
 	bool is_exception_cb: 1;
+	bool args_cached: 1;
+
+	u8 arg_cnt;
+	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
 };
 
 struct bpf_verifier_env;
@@ -727,6 +738,11 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
+static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->subprog_info[subprog];
+}
+
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d56433bf8aba..be2104e5f2f5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6948,16 +6948,17 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 	return err;
 }
 
-/* Convert BTF of a function into bpf_reg_state if possible
+/* Process BTF of a function to produce high-level expectation of function
+ * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
+ * is cached in subprog info for reuse.
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
  * EINVAL - cannot convert BTF.
- * 0 - Successfully converted BTF into bpf_reg_state
- * (either PTR_TO_CTX or SCALAR_VALUE).
+ * 0 - Successfully processed BTF and constructed argument expectations.
  */
-int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
-			  struct bpf_reg_state *regs, u32 *arg_cnt)
+int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 {
+	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
 	enum bpf_prog_type prog_type = prog->type;
@@ -6967,6 +6968,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	u32 i, nargs, btf_id;
 	const char *tname;
 
+	if (sub->args_cached)
+		return 0;
+
 	if (!prog->aux->func_info ||
 	    prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
 		bpf_log(log, "Verifier bug\n");
@@ -6990,10 +6994,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	}
 	tname = btf_name_by_offset(btf, t->name_off);
 
-	if (log->level & BPF_LOG_LEVEL)
-		bpf_log(log, "Validating %s() func#%d...\n",
-			tname, subprog);
-
 	if (prog->aux->func_info_aux[subprog].unreliable) {
 		bpf_log(log, "Verifier bug in function %s()\n", tname);
 		return -EFAULT;
@@ -7013,7 +7013,6 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			tname, nargs, MAX_BPF_FUNC_REG_ARGS);
 		return -EINVAL;
 	}
-	*arg_cnt = nargs;
 	/* check that function returns int, exception cb also requires this */
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
@@ -7028,24 +7027,24 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 	 * Only PTR_TO_CTX and SCALAR are supported atm.
 	 */
 	for (i = 0; i < nargs; i++) {
-		struct bpf_reg_state *reg = &regs[i + 1];
-
 		t = btf_type_by_id(btf, args[i].type);
 		while (btf_type_is_modifier(t))
 			t = btf_type_by_id(btf, t->type);
 		if (btf_type_is_int(t) || btf_is_any_enum(t)) {
-			reg->type = SCALAR_VALUE;
+			sub->args[i].arg_type = ARG_ANYTHING;
 			continue;
 		}
 		if (btf_type_is_ptr(t)) {
+			u32 mem_size;
+
 			if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-				reg->type = PTR_TO_CTX;
+				sub->args[i].arg_type = ARG_PTR_TO_CTX;
 				continue;
 			}
 
 			t = btf_type_skip_modifiers(btf, t->type, NULL);
 
-			ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+			ref_t = btf_resolve_size(btf, t, &mem_size);
 			if (IS_ERR(ref_t)) {
 				bpf_log(log,
 				    "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
@@ -7054,15 +7053,18 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 				return -EINVAL;
 			}
 
-			reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
-			reg->id = ++env->id_gen;
-
+			sub->args[i].arg_type = ARG_PTR_TO_MEM_OR_NULL;
+			sub->args[i].mem_size = mem_size;
 			continue;
 		}
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
 			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
+
+	sub->arg_cnt = nargs;
+	sub->args_cached = true;
+
 	return 0;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df1cae459c77..6555785b9f63 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -442,11 +442,6 @@ static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env,
 	return &env->prog->aux->func_info_aux[subprog];
 }
 
-static struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
-{
-	return &env->subprog_info[subprog];
-}
-
 static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_subprog_info *info = subprog_info(env, subprog);
@@ -19937,34 +19932,50 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		u32 nargs;
+		struct bpf_subprog_info *sub = subprog_info(env, subprog);
+		const char *sub_name = subprog_name(env, subprog);
+		struct bpf_subprog_arg_info *arg;
+		struct bpf_reg_state *reg;
 
-		ret = btf_prepare_func_args(env, subprog, regs, &nargs);
+		verbose(env, "Validating %s() func#%d...\n", sub_name, subprog);
+		ret = btf_prepare_func_args(env, subprog);
 		if (ret)
 			goto out;
+
 		if (subprog_is_exc_cb(env, subprog)) {
 			state->frame[0]->in_exception_callback_fn = true;
 			/* We have already ensured that the callback returns an integer, just
 			 * like all global subprogs. We need to determine it only has a single
 			 * scalar argument.
 			 */
-			if (nargs != 1 || regs[BPF_REG_1].type != SCALAR_VALUE) {
+			if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) {
 				verbose(env, "exception cb only supports single integer argument\n");
 				ret = -EINVAL;
 				goto out;
 			}
 		}
-		for (i = BPF_REG_1; i <= BPF_REG_5; i++) {
-			if (regs[i].type == PTR_TO_CTX)
+		for (i = BPF_REG_1; i <= sub->arg_cnt; i++) {
+			arg = &sub->args[i - BPF_REG_1];
+			reg = &regs[i];
+
+			if (arg->arg_type == ARG_PTR_TO_CTX) {
+				reg->type = PTR_TO_CTX;
 				mark_reg_known_zero(env, regs, i);
-			else if (regs[i].type == SCALAR_VALUE)
+			} else if (arg->arg_type == ARG_ANYTHING) {
+				reg->type = SCALAR_VALUE;
 				mark_reg_unknown(env, regs, i);
-			else if (base_type(regs[i].type) == PTR_TO_MEM) {
-				const u32 mem_size = regs[i].mem_size;
-
+			} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+				reg->type = PTR_TO_MEM;
+				if (arg->arg_type & PTR_MAYBE_NULL)
+					reg->type |= PTR_MAYBE_NULL;
 				mark_reg_known_zero(env, regs, i);
-				regs[i].mem_size = mem_size;
-				regs[i].id = ++env->id_gen;
+				reg->mem_size = arg->mem_size;
+				reg->id = ++env->id_gen;
+			} else {
+				WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
+					  i - BPF_REG_1, arg->arg_type);
+				ret = -EFAULT;
+				goto out;
 			}
 		}
 	} else {
-- 
cgit v1.2.3


From 5eccd2db42d77e3570619c32d39e39bf486607cf Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:26 -0800
Subject: bpf: reuse btf_prepare_func_args() check for main program BTF
 validation

Instead of btf_check_subprog_arg_match(), use btf_prepare_func_args()
logic to validate "trustworthiness" of main BPF program's BTF information,
if it is present.

We ignored results of original BTF check anyway, often times producing
confusing and ominously-sounding "reg type unsupported for arg#0
function" message, which has no apparent effect on program correctness
and verification process.

All the -EFAULT returning sanity checks are already performed in
check_btf_info_early(), so there is zero reason to have this duplication
of logic between btf_check_subprog_call() and btf_check_subprog_arg_match().
Dropping btf_check_subprog_arg_match() simplifies
btf_check_func_arg_match() further removing `bool processing_call` flag.

One subtle bit that was done by btf_check_subprog_arg_match() was
potentially marking main program's BTF as unreliable. We do this
explicitly now with a dedicated simple check, preserving the original
behavior, but now based on well factored btf_prepare_func_args() logic.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                |  2 -
 kernel/bpf/btf.c                                   | 50 ++--------------------
 kernel/bpf/verifier.c                              | 25 ++++++-----
 tools/testing/selftests/bpf/prog_tests/log_fixup.c |  4 +-
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       |  2 +-
 .../selftests/bpf/progs/task_kfunc_failure.c       |  2 +-
 6 files changed, 19 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c050c82cc9a5..d0d7eff22b8a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
-				struct bpf_reg_state *regs);
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index be2104e5f2f5..33d9a1c73f6e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6768,8 +6768,7 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
-				    bool ptr_to_mem_ok,
-				    bool processing_call)
+				    bool ptr_to_mem_ok)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	struct bpf_verifier_log *log = &env->log;
@@ -6842,7 +6841,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 					i, btf_type_str(t));
 				return -EINVAL;
 			}
-		} else if (ptr_to_mem_ok && processing_call) {
+		} else if (ptr_to_mem_ok) {
 			const struct btf_type *resolve_ret;
 			u32 type_size;
 
@@ -6867,55 +6866,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	return 0;
 }
 
-/* Compare BTF of a function declaration with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
-				struct bpf_reg_state *regs)
-{
-	struct bpf_prog *prog = env->prog;
-	struct btf *btf = prog->aux->btf;
-	bool is_global;
-	u32 btf_id;
-	int err;
-
-	if (!prog->aux->func_info)
-		return -EINVAL;
-
-	btf_id = prog->aux->func_info[subprog].type_id;
-	if (!btf_id)
-		return -EFAULT;
-
-	if (prog->aux->func_info_aux[subprog].unreliable)
-		return -EINVAL;
-
-	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
-
-	/* Compiler optimizations can remove arguments from static functions
-	 * or mismatched type can be passed into a global function.
-	 * In such cases mark the function as unreliable from BTF point of view.
-	 */
-	if (err)
-		prog->aux->func_info_aux[subprog].unreliable = true;
-	return err;
-}
-
 /* Compare BTF of a function call with given bpf_reg_state.
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
  * EINVAL - there is a type mismatch or BTF is not available.
  * 0 - BTF matches with what bpf_reg_state expects.
  * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- *
- * NOTE: the code is duplicated from btf_check_subprog_arg_match()
- * because btf_check_func_arg_match() is still doing both. Once that
- * function is split in 2, we can call from here btf_check_subprog_arg_match()
- * first, and then treat the calling part in a new code path.
  */
 int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 			   struct bpf_reg_state *regs)
@@ -6937,7 +6893,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 		return -EINVAL;
 
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
 
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6555785b9f63..c26e9ab5226c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19904,6 +19904,7 @@ static void free_states(struct bpf_verifier_env *env)
 static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
+	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_state *state;
 	struct bpf_reg_state *regs;
 	int ret, i;
@@ -19930,9 +19931,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 	state->first_insn_idx = env->subprog_info[subprog].start;
 	state->last_insn_idx = -1;
 
+
 	regs = state->frame[state->curframe]->regs;
 	if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
-		struct bpf_subprog_info *sub = subprog_info(env, subprog);
 		const char *sub_name = subprog_name(env, subprog);
 		struct bpf_subprog_arg_info *arg;
 		struct bpf_reg_state *reg;
@@ -19979,21 +19980,19 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 			}
 		}
 	} else {
+		/* if main BPF program has associated BTF info, validate that
+		 * it's matching expected signature, and otherwise mark BTF
+		 * info for main program as unreliable
+		 */
+		if (env->prog->aux->func_info_aux) {
+			ret = btf_prepare_func_args(env, 0);
+			if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX)
+				env->prog->aux->func_info_aux[0].unreliable = true;
+		}
+
 		/* 1st arg to a function */
 		regs[BPF_REG_1].type = PTR_TO_CTX;
 		mark_reg_known_zero(env, regs, BPF_REG_1);
-		ret = btf_check_subprog_arg_match(env, subprog, regs);
-		if (ret == -EFAULT)
-			/* unlikely verifier bug. abort.
-			 * ret == 0 and ret < 0 are sadly acceptable for
-			 * main() function due to backward compatibility.
-			 * Like socket filter program may be written as:
-			 * int bpf_prog(struct pt_regs *ctx)
-			 * and never dereference that ctx in the program.
-			 * 'struct pt_regs' is a type mismatch for socket
-			 * filter that should be using 'struct __sk_buff'.
-			 */
-			goto out;
 	}
 
 	ret = do_check(env);
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index effd78b2a657..7a3fa2ff567b 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -169,9 +169,9 @@ void test_log_fixup(void)
 	if (test__start_subtest("bad_core_relo_trunc_none"))
 		bad_core_relo(0, TRUNC_NONE /* full buf */);
 	if (test__start_subtest("bad_core_relo_trunc_partial"))
-		bad_core_relo(300, TRUNC_PARTIAL /* truncate original log a bit */);
+		bad_core_relo(280, TRUNC_PARTIAL /* truncate original log a bit */);
 	if (test__start_subtest("bad_core_relo_trunc_full"))
-		bad_core_relo(210, TRUNC_FULL  /* truncate also libbpf's message patch */);
+		bad_core_relo(220, TRUNC_FULL  /* truncate also libbpf's message patch */);
 	if (test__start_subtest("bad_core_relo_subprog"))
 		bad_core_relo_subprog();
 	if (test__start_subtest("missing_map"))
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 0fa564a5cc5b..9fe9c4a4e8f6 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -78,7 +78,7 @@ int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path)
 }
 
 SEC("kretprobe/cgroup_destroy_locked")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("calling kernel function bpf_cgroup_acquire is not allowed")
 int BPF_PROG(cgrp_kfunc_acquire_unsafe_kretprobe, struct cgroup *cgrp)
 {
 	struct cgroup *acquired;
diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
index dcdea3127086..ad88a3796ddf 100644
--- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c
@@ -248,7 +248,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl
 }
 
 SEC("lsm/task_free")
-__failure __msg("reg type unsupported for arg#0 function")
+__failure __msg("R1 must be a rcu pointer")
 int BPF_PROG(task_kfunc_from_lsm_task_free, struct task_struct *task)
 {
 	struct task_struct *acquired;
-- 
cgit v1.2.3


From e26080d0da87f20222ca6712b65f95a856fadee0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:27 -0800
Subject: bpf: prepare btf_prepare_func_args() for handling static subprogs

Generalize btf_prepare_func_args() to support both global and static
subprogs. We are going to utilize this property in the next patch,
reusing btf_prepare_func_args() for subprog call logic instead of
reparsing BTF information in a completely separate implementation.

btf_prepare_func_args() now detects whether subprog is global or static
makes slight logic adjustments for static func cases, like not failing
fatally (-EFAULT) for conditions that are allowable for static subprogs.

Somewhat subtle (but major!) difference is the handling of pointer arguments.
Both global and static functions need to handle special context
arguments (which are pointers to predefined type names), but static
subprogs give up on any other pointers, falling back to marking subprog
as "unreliable", disabling the use of BTF type information altogether.

For global functions, though, we are assuming that such pointers to
unrecognized types are just pointers to fixed-sized memory region (or
error out if size cannot be established, like for `void *` pointers).

This patch accommodates these small differences and sets up a stage for
refactoring in the next patch, eliminating a separate BTF-based parsing
logic in btf_check_func_arg_match().

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  5 +++++
 kernel/bpf/btf.c             | 18 +++++++++---------
 kernel/bpf/verifier.c        |  5 -----
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5742e9c0a7b8..d3ea9ef04767 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -738,6 +738,11 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 };
 
+static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
+{
+	return &env->prog->aux->func_info_aux[subprog];
+}
+
 static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog)
 {
 	return &env->subprog_info[subprog];
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 33d9a1c73f6e..d321340e16f1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6914,6 +6914,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
  */
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 {
+	bool is_global = subprog_aux(env, subprog)->linkage == BTF_FUNC_GLOBAL;
 	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_log *log = &env->log;
 	struct bpf_prog *prog = env->prog;
@@ -6927,14 +6928,15 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	if (sub->args_cached)
 		return 0;
 
-	if (!prog->aux->func_info ||
-	    prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
+	if (!prog->aux->func_info) {
 		bpf_log(log, "Verifier bug\n");
 		return -EFAULT;
 	}
 
 	btf_id = prog->aux->func_info[subprog].type_id;
 	if (!btf_id) {
+		if (!is_global) /* not fatal for static funcs */
+			return -EINVAL;
 		bpf_log(log, "Global functions need valid BTF\n");
 		return -EFAULT;
 	}
@@ -6990,16 +6992,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			sub->args[i].arg_type = ARG_ANYTHING;
 			continue;
 		}
-		if (btf_type_is_ptr(t)) {
+		if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+			sub->args[i].arg_type = ARG_PTR_TO_CTX;
+			continue;
+		}
+		if (is_global && btf_type_is_ptr(t)) {
 			u32 mem_size;
 
-			if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-				sub->args[i].arg_type = ARG_PTR_TO_CTX;
-				continue;
-			}
-
 			t = btf_type_skip_modifiers(btf, t->type, NULL);
-
 			ref_t = btf_resolve_size(btf, t, &mem_size);
 			if (IS_ERR(ref_t)) {
 				bpf_log(log,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c26e9ab5226c..6c9ecb86a8d9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -437,11 +437,6 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog)
 	return btf_type_name(env->prog->aux->btf, info->type_id);
 }
 
-static struct bpf_func_info_aux *subprog_aux(const struct bpf_verifier_env *env, int subprog)
-{
-	return &env->prog->aux->func_info_aux[subprog];
-}
-
 static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog)
 {
 	struct bpf_subprog_info *info = subprog_info(env, subprog);
-- 
cgit v1.2.3


From c5a7244759b1eeacc59d0426fb73859afa942d0d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:28 -0800
Subject: bpf: move subprog call logic back to verifier.c

Subprog call logic in btf_check_subprog_call() currently has both a lot
of BTF parsing logic (which is, presumably, what justified putting it
into btf.c), but also a bunch of register state checks, some of each
utilize deep verifier logic helpers, necessarily exported from
verifier.c: check_ptr_off_reg(), check_func_arg_reg_off(),
and check_mem_reg().

Going forward, btf_check_subprog_call() will have a minimum of
BTF-related logic, but will get more internal verifier logic related to
register state manipulation. So move it into verifier.c to minimize
amount of verifier-specific logic exposed to btf.c.

We do this move before refactoring btf_check_func_arg_match() to
preserve as much history post-refactoring as possible.

No functional changes.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   2 -
 include/linux/bpf_verifier.h |   8 ---
 kernel/bpf/btf.c             | 139 ---------------------------------------
 kernel/bpf/verifier.c        | 153 +++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 146 insertions(+), 156 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0d7eff22b8a..7671530d6e4e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf_func_model *m);
 
 struct bpf_reg_state;
-int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
-			   struct bpf_reg_state *regs);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
 			 struct btf *btf, const struct btf_type *t);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d3ea9ef04767..d07d857ca67f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -785,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
 void
 bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
-int check_ptr_off_reg(struct bpf_verifier_env *env,
-		      const struct bpf_reg_state *reg, int regno);
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
-			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type);
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-		   u32 regno, u32 mem_size);
-
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
 					     struct btf *btf, u32 btf_id)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d321340e16f1..341811bcca53 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6765,145 +6765,6 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
-static int btf_check_func_arg_match(struct bpf_verifier_env *env,
-				    const struct btf *btf, u32 func_id,
-				    struct bpf_reg_state *regs,
-				    bool ptr_to_mem_ok)
-{
-	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
-	struct bpf_verifier_log *log = &env->log;
-	const char *func_name, *ref_tname;
-	const struct btf_type *t, *ref_t;
-	const struct btf_param *args;
-	u32 i, nargs, ref_id;
-	int ret;
-
-	t = btf_type_by_id(btf, func_id);
-	if (!t || !btf_type_is_func(t)) {
-		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info or in add_kfunc_call().
-		 */
-		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
-			func_id);
-		return -EFAULT;
-	}
-	func_name = btf_name_by_offset(btf, t->name_off);
-
-	t = btf_type_by_id(btf, t->type);
-	if (!t || !btf_type_is_func_proto(t)) {
-		bpf_log(log, "Invalid BTF of func %s\n", func_name);
-		return -EFAULT;
-	}
-	args = (const struct btf_param *)(t + 1);
-	nargs = btf_type_vlen(t);
-	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
-		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
-			MAX_BPF_FUNC_REG_ARGS);
-		return -EINVAL;
-	}
-
-	/* check that BTF function arguments match actual types that the
-	 * verifier sees.
-	 */
-	for (i = 0; i < nargs; i++) {
-		enum bpf_arg_type arg_type = ARG_DONTCARE;
-		u32 regno = i + 1;
-		struct bpf_reg_state *reg = &regs[regno];
-
-		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
-		if (btf_type_is_scalar(t)) {
-			if (reg->type == SCALAR_VALUE)
-				continue;
-			bpf_log(log, "R%d is not a scalar\n", regno);
-			return -EINVAL;
-		}
-
-		if (!btf_type_is_ptr(t)) {
-			bpf_log(log, "Unrecognized arg#%d type %s\n",
-				i, btf_type_str(t));
-			return -EINVAL;
-		}
-
-		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
-		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
-		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
-		if (ret < 0)
-			return ret;
-
-		if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
-			/* If function expects ctx type in BTF check that caller
-			 * is passing PTR_TO_CTX.
-			 */
-			if (reg->type != PTR_TO_CTX) {
-				bpf_log(log,
-					"arg#%d expected pointer to ctx, but got %s\n",
-					i, btf_type_str(t));
-				return -EINVAL;
-			}
-		} else if (ptr_to_mem_ok) {
-			const struct btf_type *resolve_ret;
-			u32 type_size;
-
-			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
-			if (IS_ERR(resolve_ret)) {
-				bpf_log(log,
-					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
-					i, btf_type_str(ref_t), ref_tname,
-					PTR_ERR(resolve_ret));
-				return -EINVAL;
-			}
-
-			if (check_mem_reg(env, reg, regno, type_size))
-				return -EINVAL;
-		} else {
-			bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
-				func_name, func_id);
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
-/* Compare BTF of a function call with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
-			   struct bpf_reg_state *regs)
-{
-	struct bpf_prog *prog = env->prog;
-	struct btf *btf = prog->aux->btf;
-	bool is_global;
-	u32 btf_id;
-	int err;
-
-	if (!prog->aux->func_info)
-		return -EINVAL;
-
-	btf_id = prog->aux->func_info[subprog].type_id;
-	if (!btf_id)
-		return -EFAULT;
-
-	if (prog->aux->func_info_aux[subprog].unreliable)
-		return -EINVAL;
-
-	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
-
-	/* Compiler optimizations can remove arguments from static functions
-	 * or mismatched type can be passed into a global function.
-	 * In such cases mark the function as unreliable from BTF point of view.
-	 */
-	if (err)
-		prog->aux->func_info_aux[subprog].unreliable = true;
-	return err;
-}
-
 /* Process BTF of a function to produce high-level expectation of function
  * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
  * is cached in subprog info for reuse.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c9ecb86a8d9..f48e49f2d482 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5127,8 +5127,8 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
 	return 0;
 }
 
-int check_ptr_off_reg(struct bpf_verifier_env *env,
-		      const struct bpf_reg_state *reg, int regno)
+static int check_ptr_off_reg(struct bpf_verifier_env *env,
+		             const struct bpf_reg_state *reg, int regno)
 {
 	return __check_ptr_off_reg(env, reg, regno, false);
 }
@@ -7300,8 +7300,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
 	return err;
 }
 
-int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-		   u32 regno, u32 mem_size)
+static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			 u32 regno, u32 mem_size)
 {
 	bool may_be_null = type_may_be_null(reg->type);
 	struct bpf_reg_state saved_reg;
@@ -8286,9 +8286,9 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields)
 	return field;
 }
 
-int check_func_arg_reg_off(struct bpf_verifier_env *env,
-			   const struct bpf_reg_state *reg, int regno,
-			   enum bpf_arg_type arg_type)
+static int check_func_arg_reg_off(struct bpf_verifier_env *env,
+				  const struct bpf_reg_state *reg, int regno,
+				  enum bpf_arg_type arg_type)
 {
 	u32 type = reg->type;
 
@@ -9249,6 +9249,145 @@ err_out:
 	return err;
 }
 
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+				    const struct btf *btf, u32 func_id,
+				    struct bpf_reg_state *regs,
+				    bool ptr_to_mem_ok)
+{
+	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	struct bpf_verifier_log *log = &env->log;
+	const char *func_name, *ref_tname;
+	const struct btf_type *t, *ref_t;
+	const struct btf_param *args;
+	u32 i, nargs, ref_id;
+	int ret;
+
+	t = btf_type_by_id(btf, func_id);
+	if (!t || !btf_type_is_func(t)) {
+		/* These checks were already done by the verifier while loading
+		 * struct bpf_func_info or in add_kfunc_call().
+		 */
+		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+			func_id);
+		return -EFAULT;
+	}
+	func_name = btf_name_by_offset(btf, t->name_off);
+
+	t = btf_type_by_id(btf, t->type);
+	if (!t || !btf_type_is_func_proto(t)) {
+		bpf_log(log, "Invalid BTF of func %s\n", func_name);
+		return -EFAULT;
+	}
+	args = (const struct btf_param *)(t + 1);
+	nargs = btf_type_vlen(t);
+	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
+			MAX_BPF_FUNC_REG_ARGS);
+		return -EINVAL;
+	}
+
+	/* check that BTF function arguments match actual types that the
+	 * verifier sees.
+	 */
+	for (i = 0; i < nargs; i++) {
+		enum bpf_arg_type arg_type = ARG_DONTCARE;
+		u32 regno = i + 1;
+		struct bpf_reg_state *reg = &regs[regno];
+
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+		if (btf_type_is_scalar(t)) {
+			if (reg->type == SCALAR_VALUE)
+				continue;
+			bpf_log(log, "R%d is not a scalar\n", regno);
+			return -EINVAL;
+		}
+
+		if (!btf_type_is_ptr(t)) {
+			bpf_log(log, "Unrecognized arg#%d type %s\n",
+				i, btf_type_str(t));
+			return -EINVAL;
+		}
+
+		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+		if (ret < 0)
+			return ret;
+
+		if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+			/* If function expects ctx type in BTF check that caller
+			 * is passing PTR_TO_CTX.
+			 */
+			if (reg->type != PTR_TO_CTX) {
+				bpf_log(log,
+					"arg#%d expected pointer to ctx, but got %s\n",
+					i, btf_type_str(t));
+				return -EINVAL;
+			}
+		} else if (ptr_to_mem_ok) {
+			const struct btf_type *resolve_ret;
+			u32 type_size;
+
+			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+			if (IS_ERR(resolve_ret)) {
+				bpf_log(log,
+					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+					i, btf_type_str(ref_t), ref_tname,
+					PTR_ERR(resolve_ret));
+				return -EINVAL;
+			}
+
+			if (check_mem_reg(env, reg, regno, type_size))
+				return -EINVAL;
+		} else {
+			bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
+				func_name, func_id);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+			          struct bpf_reg_state *regs)
+{
+	struct bpf_prog *prog = env->prog;
+	struct btf *btf = prog->aux->btf;
+	bool is_global;
+	u32 btf_id;
+	int err;
+
+	if (!prog->aux->func_info)
+		return -EINVAL;
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id)
+		return -EFAULT;
+
+	if (prog->aux->func_info_aux[subprog].unreliable)
+		return -EINVAL;
+
+	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
+	/* Compiler optimizations can remove arguments from static functions
+	 * or mismatched type can be passed into a global function.
+	 * In such cases mark the function as unreliable from BTF point of view.
+	 */
+	if (err)
+		prog->aux->func_info_aux[subprog].unreliable = true;
+	return err;
+}
+
 static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			      int insn_idx, int subprog,
 			      set_callee_state_fn set_callee_state_cb)
-- 
cgit v1.2.3


From f18c3d88deedf0defc3e4800341cc7bcaaabcdf9 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:29 -0800
Subject: bpf: reuse subprog argument parsing logic for subprog call checks

Remove duplicated BTF parsing logic when it comes to subprog call check.
Instead, use (potentially cached) results of btf_prepare_func_args() to
abstract away expectations of each subprog argument in generic terms
(e.g., "this is pointer to context", or "this is a pointer to memory of
size X"), and then use those simple high-level argument type
expectations to validate actual register states to check if they match
expectations.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 110 ++++++---------------
 .../selftests/bpf/progs/test_global_func5.c        |   2 +-
 2 files changed, 31 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f48e49f2d482..3c9e6d0d77d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9249,101 +9249,54 @@ err_out:
 	return err;
 }
 
-static int btf_check_func_arg_match(struct bpf_verifier_env *env,
-				    const struct btf *btf, u32 func_id,
-				    struct bpf_reg_state *regs,
-				    bool ptr_to_mem_ok)
+static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
+				    const struct btf *btf,
+				    struct bpf_reg_state *regs)
 {
-	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+	struct bpf_subprog_info *sub = subprog_info(env, subprog);
 	struct bpf_verifier_log *log = &env->log;
-	const char *func_name, *ref_tname;
-	const struct btf_type *t, *ref_t;
-	const struct btf_param *args;
-	u32 i, nargs, ref_id;
+	u32 i;
 	int ret;
 
-	t = btf_type_by_id(btf, func_id);
-	if (!t || !btf_type_is_func(t)) {
-		/* These checks were already done by the verifier while loading
-		 * struct bpf_func_info or in add_kfunc_call().
-		 */
-		bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
-			func_id);
-		return -EFAULT;
-	}
-	func_name = btf_name_by_offset(btf, t->name_off);
-
-	t = btf_type_by_id(btf, t->type);
-	if (!t || !btf_type_is_func_proto(t)) {
-		bpf_log(log, "Invalid BTF of func %s\n", func_name);
-		return -EFAULT;
-	}
-	args = (const struct btf_param *)(t + 1);
-	nargs = btf_type_vlen(t);
-	if (nargs > MAX_BPF_FUNC_REG_ARGS) {
-		bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
-			MAX_BPF_FUNC_REG_ARGS);
-		return -EINVAL;
-	}
+	ret = btf_prepare_func_args(env, subprog);
+	if (ret)
+		return ret;
 
 	/* check that BTF function arguments match actual types that the
 	 * verifier sees.
 	 */
-	for (i = 0; i < nargs; i++) {
-		enum bpf_arg_type arg_type = ARG_DONTCARE;
+	for (i = 0; i < sub->arg_cnt; i++) {
 		u32 regno = i + 1;
 		struct bpf_reg_state *reg = &regs[regno];
+		struct bpf_subprog_arg_info *arg = &sub->args[i];
 
-		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
-		if (btf_type_is_scalar(t)) {
-			if (reg->type == SCALAR_VALUE)
-				continue;
-			bpf_log(log, "R%d is not a scalar\n", regno);
-			return -EINVAL;
-		}
-
-		if (!btf_type_is_ptr(t)) {
-			bpf_log(log, "Unrecognized arg#%d type %s\n",
-				i, btf_type_str(t));
-			return -EINVAL;
-		}
-
-		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
-		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
-		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
-		if (ret < 0)
-			return ret;
-
-		if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+		if (arg->arg_type == ARG_ANYTHING) {
+			if (reg->type != SCALAR_VALUE) {
+				bpf_log(log, "R%d is not a scalar\n", regno);
+				return -EINVAL;
+			}
+		} else if (arg->arg_type == ARG_PTR_TO_CTX) {
+			ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+			if (ret < 0)
+				return ret;
 			/* If function expects ctx type in BTF check that caller
 			 * is passing PTR_TO_CTX.
 			 */
 			if (reg->type != PTR_TO_CTX) {
-				bpf_log(log,
-					"arg#%d expected pointer to ctx, but got %s\n",
-					i, btf_type_str(t));
-				return -EINVAL;
-			}
-		} else if (ptr_to_mem_ok) {
-			const struct btf_type *resolve_ret;
-			u32 type_size;
-
-			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
-			if (IS_ERR(resolve_ret)) {
-				bpf_log(log,
-					"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
-					i, btf_type_str(ref_t), ref_tname,
-					PTR_ERR(resolve_ret));
+				bpf_log(log, "arg#%d expects pointer to ctx\n", i);
 				return -EINVAL;
 			}
+		} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
+			ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
+			if (ret < 0)
+				return ret;
 
-			if (check_mem_reg(env, reg, regno, type_size))
+			if (check_mem_reg(env, reg, regno, arg->mem_size))
 				return -EINVAL;
 		} else {
-			bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
-				func_name, func_id);
-			return -EINVAL;
+			bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
+				i, arg->arg_type);
+			return -EFAULT;
 		}
 	}
 
@@ -9358,11 +9311,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
  * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
  */
 static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
-			          struct bpf_reg_state *regs)
+				  struct bpf_reg_state *regs)
 {
 	struct bpf_prog *prog = env->prog;
 	struct btf *btf = prog->aux->btf;
-	bool is_global;
 	u32 btf_id;
 	int err;
 
@@ -9376,9 +9328,7 @@ static int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 	if (prog->aux->func_info_aux[subprog].unreliable)
 		return -EINVAL;
 
-	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
-
+	err = btf_check_func_arg_match(env, subprog, btf, regs);
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
 	 * In such cases mark the function as unreliable from BTF point of view.
diff --git a/tools/testing/selftests/bpf/progs/test_global_func5.c b/tools/testing/selftests/bpf/progs/test_global_func5.c
index cc55aedaf82d..257c0569ff98 100644
--- a/tools/testing/selftests/bpf/progs/test_global_func5.c
+++ b/tools/testing/selftests/bpf/progs/test_global_func5.c
@@ -26,7 +26,7 @@ int f3(int val, struct __sk_buff *skb)
 }
 
 SEC("tc")
-__failure __msg("expected pointer to ctx, but got PTR")
+__failure __msg("expects pointer to ctx")
 int global_func5(struct __sk_buff *skb)
 {
 	return f1(skb) + f2(2, skb) + f3(3, skb);
-- 
cgit v1.2.3


From 94e1c70a34523b5e1529e4ec508316acc6a26a2b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:30 -0800
Subject: bpf: support 'arg:xxx' btf_decl_tag-based hints for global subprog
 args

Add support for annotating global BPF subprog arguments to provide more
information about expected semantics of the argument. Currently,
verifier relies purely on argument's BTF type information, and supports
three general use cases: scalar, pointer-to-context, and
pointer-to-fixed-size-memory.

Scalar and pointer-to-fixed-mem work well in practice and are quite
natural to use. But pointer-to-context is a bit problematic, as typical
BPF users don't realize that they need to use a special type name to
signal to verifier that argument is not just some pointer, but actually
a PTR_TO_CTX. Further, even if users do know which type to use, it is
limiting in situations where the same BPF program logic is used across
few different program types. Common case is kprobes, tracepoints, and
perf_event programs having a helper to send some data over BPF perf
buffer. bpf_perf_event_output() requires `ctx` argument, and so it's
quite cumbersome to share such global subprog across few BPF programs of
different types, necessitating extra static subprog that is context
type-agnostic.

Long story short, there is a need to go beyond types and allow users to
add hints to global subprog arguments to define expectations.

This patch adds such support for two initial special tags:
  - pointer to context;
  - non-null qualifier for generic pointer arguments.

All of the above came up in practice already and seem generally useful
additions. Non-null qualifier is an often requested feature, which
currently has to be worked around by having unnecessary NULL checks
inside subprogs even if we know that arguments are never NULL. Pointer
to context was discussed earlier.

As for implementation, we utilize btf_decl_tag attribute and set up an
"arg:xxx" convention to specify argument hint. As such:
  - btf_decl_tag("arg:ctx") is a PTR_TO_CTX hint;
  - btf_decl_tag("arg:nonnull") marks pointer argument as not allowed to
    be NULL, making NULL check inside global subprog unnecessary.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-7-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      | 44 ++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/verifier.c |  5 ++++-
 2 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 341811bcca53..c0fc8977be97 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6782,7 +6782,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	enum bpf_prog_type prog_type = prog->type;
 	struct btf *btf = prog->aux->btf;
 	const struct btf_param *args;
-	const struct btf_type *t, *ref_t;
+	const struct btf_type *t, *ref_t, *fn_t;
 	u32 i, nargs, btf_id;
 	const char *tname;
 
@@ -6802,8 +6802,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 		return -EFAULT;
 	}
 
-	t = btf_type_by_id(btf, btf_id);
-	if (!t || !btf_type_is_func(t)) {
+	fn_t = btf_type_by_id(btf, btf_id);
+	if (!fn_t || !btf_type_is_func(fn_t)) {
 		/* These checks were already done by the verifier while loading
 		 * struct bpf_func_info
 		 */
@@ -6811,7 +6811,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			subprog);
 		return -EFAULT;
 	}
-	tname = btf_name_by_offset(btf, t->name_off);
+	tname = btf_name_by_offset(btf, fn_t->name_off);
 
 	if (prog->aux->func_info_aux[subprog].unreliable) {
 		bpf_log(log, "Verifier bug in function %s()\n", tname);
@@ -6820,7 +6820,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	if (prog_type == BPF_PROG_TYPE_EXT)
 		prog_type = prog->aux->dst_prog->type;
 
-	t = btf_type_by_id(btf, t->type);
+	t = btf_type_by_id(btf, fn_t->type);
 	if (!t || !btf_type_is_func_proto(t)) {
 		bpf_log(log, "Invalid type of function %s()\n", tname);
 		return -EFAULT;
@@ -6846,7 +6846,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 	 * Only PTR_TO_CTX and SCALAR are supported atm.
 	 */
 	for (i = 0; i < nargs; i++) {
+		bool is_nonnull = false;
+		const char *tag;
+
 		t = btf_type_by_id(btf, args[i].type);
+
+		tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+		if (IS_ERR(tag) && PTR_ERR(tag) == -ENOENT) {
+			tag = NULL;
+		} else if (IS_ERR(tag)) {
+			bpf_log(log, "arg#%d type's tag fetching failure: %ld\n", i, PTR_ERR(tag));
+			return PTR_ERR(tag);
+		}
+		/* 'arg:<tag>' decl_tag takes precedence over derivation of
+		 * register type from BTF type itself
+		 */
+		if (tag) {
+			/* disallow arg tags in static subprogs */
+			if (!is_global) {
+				bpf_log(log, "arg#%d type tag is not supported in static functions\n", i);
+				return -EOPNOTSUPP;
+			}
+			if (strcmp(tag, "ctx") == 0) {
+				sub->args[i].arg_type = ARG_PTR_TO_CTX;
+				continue;
+			}
+			if (strcmp(tag, "nonnull") == 0)
+				is_nonnull = true;
+		}
+
 		while (btf_type_is_modifier(t))
 			t = btf_type_by_id(btf, t->type);
 		if (btf_type_is_int(t) || btf_is_any_enum(t)) {
@@ -6870,10 +6898,14 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 				return -EINVAL;
 			}
 
-			sub->args[i].arg_type = ARG_PTR_TO_MEM_OR_NULL;
+			sub->args[i].arg_type = is_nonnull ? ARG_PTR_TO_MEM : ARG_PTR_TO_MEM_OR_NULL;
 			sub->args[i].mem_size = mem_size;
 			continue;
 		}
+		if (is_nonnull) {
+			bpf_log(log, "arg#%d marked as non-null, but is not a pointer type\n", i);
+			return -EINVAL;
+		}
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
 			i, btf_type_str(t), tname);
 		return -EINVAL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3c9e6d0d77d6..4caabfdc15aa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9290,9 +9290,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 			ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
 			if (ret < 0)
 				return ret;
-
 			if (check_mem_reg(env, reg, regno, arg->mem_size))
 				return -EINVAL;
+			if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) {
+				bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
+				return -EINVAL;
+			}
 		} else {
 			bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
 				i, arg->arg_type);
-- 
cgit v1.2.3


From a64bfe618665ea9c722f922cba8c6e3234eac5ac Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 14 Dec 2023 17:13:31 -0800
Subject: bpf: add support for passing dynptr pointer to global subprog

Add ability to pass a pointer to dynptr into global functions.
This allows to have global subprogs that accept and work with generic
dynptrs that are created by caller. Dynptr argument is detected based on
the name of a struct type, if it's "bpf_dynptr", it's assumed to be
a proper dynptr pointer. Both actual struct and forward struct
declaration types are supported.

This is conceptually exactly the same semantics as
bpf_user_ringbuf_drain()'s use of dynptr to pass a variable-sized
pointer to ringbuf record. So we heavily rely on CONST_PTR_TO_DYNPTR
bits of already existing logic in the verifier.

During global subprog validation, we mark such CONST_PTR_TO_DYNPTR as
having LOCAL type, as that's the most unassuming type of dynptr and it
doesn't have any special helpers that can try to free or acquire extra
references (unlike skb, xdp, or ringbuf dynptr). So that seems like a safe
"choice" to make from correctness standpoint. It's still possible to
pass any type of dynptr to such subprog, though, because generic dynptr
helpers, like getting data/slice pointers, read/write memory copying
routines, dynptr adjustment and getter routines all work correctly with
any type of dynptr.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20231215011334.2307144-8-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      | 23 +++++++++++++++++++++++
 kernel/bpf/verifier.c |  7 +++++++
 2 files changed, 30 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c0fc8977be97..51e8b4bee0c8 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6765,6 +6765,25 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
 	return btf_check_func_type_match(log, btf1, t1, btf2, t2);
 }
 
+static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
+{
+	const char *name;
+
+	t = btf_type_by_id(btf, t->type); /* skip PTR */
+
+	while (btf_type_is_modifier(t))
+		t = btf_type_by_id(btf, t->type);
+
+	/* allow either struct or struct forward declaration */
+	if (btf_type_is_struct(t) ||
+	    (btf_type_is_fwd(t) && btf_type_kflag(t) == 0)) {
+		name = btf_str_by_offset(btf, t->name_off);
+		return name && strcmp(name, "bpf_dynptr") == 0;
+	}
+
+	return false;
+}
+
 /* Process BTF of a function to produce high-level expectation of function
  * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information
  * is cached in subprog info for reuse.
@@ -6885,6 +6904,10 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			sub->args[i].arg_type = ARG_PTR_TO_CTX;
 			continue;
 		}
+		if (btf_type_is_ptr(t) && btf_is_dynptr_ptr(btf, t)) {
+			sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY;
+			continue;
+		}
 		if (is_global && btf_type_is_ptr(t)) {
 			u32 mem_size;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4caabfdc15aa..f13008d27f35 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9296,6 +9296,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
 				bpf_log(log, "arg#%d is expected to be non-NULL\n", i);
 				return -EINVAL;
 			}
+		} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+			ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
+			if (ret)
+				return ret;
 		} else {
 			bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
 				i, arg->arg_type);
@@ -20052,6 +20056,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 			} else if (arg->arg_type == ARG_ANYTHING) {
 				reg->type = SCALAR_VALUE;
 				mark_reg_unknown(env, regs, i);
+			} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+				/* assume unspecial LOCAL dynptr type */
+				__mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
 			} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
 				reg->type = PTR_TO_MEM;
 				if (arg->arg_type & PTR_MAYBE_NULL)
-- 
cgit v1.2.3


From d5cfbdfc96aadc96a2bf6176269b994e2ce31717 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:15 -0500
Subject: ring-buffer: Have ring_buffer_print_page_header() be able to access
 ring_buffer_iter

In order to introduce sub-buffer size per ring buffer, some internal
refactoring is needed. As ring_buffer_print_page_header() will depend on
the trace_buffer structure, it is moved after the structure definition.

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-2-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185627.723857541@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 60 +++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f7dc74e45ebf..2400c8e68fd3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -379,36 +379,6 @@ static inline bool test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
-	struct buffer_data_page field;
-
-	trace_seq_printf(s, "\tfield: u64 timestamp;\t"
-			 "offset:0;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)sizeof(field.time_stamp),
-			 (unsigned int)is_signed_type(u64));
-
-	trace_seq_printf(s, "\tfield: local_t commit;\t"
-			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)offsetof(typeof(field), commit),
-			 (unsigned int)sizeof(field.commit),
-			 (unsigned int)is_signed_type(long));
-
-	trace_seq_printf(s, "\tfield: int overwrite;\t"
-			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)offsetof(typeof(field), commit),
-			 1,
-			 (unsigned int)is_signed_type(long));
-
-	trace_seq_printf(s, "\tfield: char data;\t"
-			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
-			 (unsigned int)offsetof(typeof(field), data),
-			 (unsigned int)BUF_PAGE_SIZE,
-			 (unsigned int)is_signed_type(char));
-
-	return !trace_seq_has_overflowed(s);
-}
-
 struct rb_irq_work {
 	struct irq_work			work;
 	wait_queue_head_t		waiters;
@@ -556,6 +526,36 @@ struct ring_buffer_iter {
 	int				missed_events;
 };
 
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+	struct buffer_data_page field;
+
+	trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+			 "offset:0;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)sizeof(field.time_stamp),
+			 (unsigned int)is_signed_type(u64));
+
+	trace_seq_printf(s, "\tfield: local_t commit;\t"
+			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)offsetof(typeof(field), commit),
+			 (unsigned int)sizeof(field.commit),
+			 (unsigned int)is_signed_type(long));
+
+	trace_seq_printf(s, "\tfield: int overwrite;\t"
+			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)offsetof(typeof(field), commit),
+			 1,
+			 (unsigned int)is_signed_type(long));
+
+	trace_seq_printf(s, "\tfield: char data;\t"
+			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			 (unsigned int)offsetof(typeof(field), data),
+			 (unsigned int)BUF_PAGE_SIZE,
+			 (unsigned int)is_signed_type(char));
+
+	return !trace_seq_has_overflowed(s);
+}
+
 static inline void rb_time_read(rb_time_t *t, u64 *ret)
 {
 	*ret = local64_read(&t->time);
-- 
cgit v1.2.3


From 139f84002145d8624f0195fb090b3a7670744a13 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:16 -0500
Subject: ring-buffer: Page size per ring buffer

Currently the size of one sub buffer page is global for all buffers and
it is hard coded to one system page. In order to introduce configurable
ring buffer sub page size, the internal logic should be refactored to
work with sub page size per ring buffer.

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.009147038@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  2 +-
 kernel/trace/ring_buffer.c  | 68 +++++++++++++++++++++++++--------------------
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace.h        |  1 +
 kernel/trace/trace_events.c | 59 +++++++++++++++++++++++++++++----------
 5 files changed, 86 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b1b03b2c0f08..ce46218ce46d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
 struct trace_seq;
 
 int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2400c8e68fd3..d9f656502400 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -374,11 +374,6 @@ static inline bool test_time_stamp(u64 delta)
 	return !!(delta & TS_DELTA_TEST);
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
 struct rb_irq_work {
 	struct irq_work			work;
 	wait_queue_head_t		waiters;
@@ -510,6 +505,9 @@ struct trace_buffer {
 
 	struct rb_irq_work		irq_work;
 	bool				time_stamp_abs;
+
+	unsigned int			subbuf_size;
+	unsigned int			max_data_size;
 };
 
 struct ring_buffer_iter {
@@ -523,10 +521,11 @@ struct ring_buffer_iter {
 	u64				read_stamp;
 	u64				page_stamp;
 	struct ring_buffer_event	*event;
+	size_t				event_size;
 	int				missed_events;
 };
 
-int ring_buffer_print_page_header(struct trace_seq *s)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
 {
 	struct buffer_data_page field;
 
@@ -550,7 +549,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 	trace_seq_printf(s, "\tfield: char data;\t"
 			 "offset:%u;\tsize:%u;\tsigned:%u;\n",
 			 (unsigned int)offsetof(typeof(field), data),
-			 (unsigned int)BUF_PAGE_SIZE,
+			 (unsigned int)buffer->subbuf_size,
 			 (unsigned int)is_signed_type(char));
 
 	return !trace_seq_has_overflowed(s);
@@ -1625,7 +1624,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 	if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
 		goto fail_free_buffer;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	/* Default buffer page size - one system page */
+	buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+	/* Max payload is buffer page size - header (8bytes) */
+	buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+	nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
 	buffer->reader_lock_key = key;
@@ -1944,7 +1949,7 @@ static void update_pages_handler(struct work_struct *work)
  * @size: the new size.
  * @cpu_id: the cpu buffer to resize
  *
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
  *
  * Returns 0 on success and < 0 on failure.
  */
@@ -1966,7 +1971,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 	    !cpumask_test_cpu(cpu_id, buffer->cpumask))
 		return 0;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
 
 	/* we need a minimum of two pages */
 	if (nr_pages < 2)
@@ -2213,7 +2218,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
 	 */
 	barrier();
 
-	if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+	if ((iter->head + length) > commit || length > iter->event_size)
 		/* Writer corrupted the read? */
 		goto reset;
 
@@ -2446,6 +2451,7 @@ static inline void
 rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	      unsigned long tail, struct rb_event_info *info)
 {
+	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
 	struct buffer_page *tail_page = info->tail_page;
 	struct ring_buffer_event *event;
 	unsigned long length = info->length;
@@ -2454,13 +2460,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * Only the event that crossed the page boundary
 	 * must fill the old tail_page with padding.
 	 */
-	if (tail >= BUF_PAGE_SIZE) {
+	if (tail >= bsize) {
 		/*
 		 * If the page was filled, then we still need
 		 * to update the real_end. Reset it to zero
 		 * and the reader will ignore it.
 		 */
-		if (tail == BUF_PAGE_SIZE)
+		if (tail == bsize)
 			tail_page->real_end = 0;
 
 		local_sub(length, &tail_page->write);
@@ -2488,7 +2494,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If we are less than the minimum size, we don't need to
 	 * worry about it.
 	 */
-	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+	if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
 		/* No room for any events */
 
 		/* Mark the rest of the page with padding */
@@ -2503,19 +2509,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	/* Put in a discarded event */
-	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+	event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
 	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	event->time_delta = 1;
 
 	/* account for padding bytes */
-	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+	local_add(bsize - tail, &cpu_buffer->entries_bytes);
 
 	/* Make sure the padding is visible before the tail_page->write update */
 	smp_wmb();
 
 	/* Set write to end of buffer */
-	length = (tail + length) - BUF_PAGE_SIZE;
+	length = (tail + length) - bsize;
 	local_sub(length, &tail_page->write);
 }
 
@@ -3469,7 +3475,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - info->length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (unlikely(write > BUF_PAGE_SIZE)) {
+	if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
 		check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
 		return rb_move_tail(cpu_buffer, tail, info);
 	}
@@ -3600,7 +3606,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
 	if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
 		add_ts_default = RB_ADD_STAMP_ABSOLUTE;
 		info.length += RB_LEN_TIME_EXTEND;
-		if (info.length > BUF_MAX_DATA_SIZE)
+		if (info.length > cpu_buffer->buffer->max_data_size)
 			goto out_fail;
 	} else {
 		add_ts_default = RB_ADD_STAMP_NONE;
@@ -3675,7 +3681,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
 	if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
 		goto out;
 
-	if (unlikely(length > BUF_MAX_DATA_SIZE))
+	if (unlikely(length > buffer->max_data_size))
 		goto out;
 
 	if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3825,7 +3831,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	if (length > BUF_MAX_DATA_SIZE)
+	if (length > buffer->max_data_size)
 		goto out;
 
 	if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4405,6 +4411,7 @@ static struct buffer_page *
 rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *reader = NULL;
+	unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
 	unsigned long overwrite;
 	unsigned long flags;
 	int nr_loops = 0;
@@ -4540,7 +4547,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 #define USECS_WAIT	1000000
         for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
 		/* If the write is past the end of page, a writer is still updating it */
-		if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+		if (likely(!reader || rb_page_write(reader) <= bsize))
 			break;
 
 		udelay(1);
@@ -4984,7 +4991,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 		return NULL;
 
 	/* Holds the entire event: data and meta data */
-	iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+	iter->event_size = buffer->subbuf_size;
+	iter->event = kmalloc(iter->event_size, flags);
 	if (!iter->event) {
 		kfree(iter);
 		return NULL;
@@ -5102,14 +5110,14 @@ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 {
 	/*
 	 * Earlier, this method returned
-	 *	BUF_PAGE_SIZE * buffer->nr_pages
+	 *	buffer->subbuf_size * buffer->nr_pages
 	 * Since the nr_pages field is now removed, we have converted this to
 	 * return the per cpu buffer value.
 	 */
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
-	return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+	return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
@@ -5123,8 +5131,8 @@ unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
 {
 	/* If abs timestamp is requested, events have a timestamp too */
 	if (ring_buffer_time_stamp_abs(buffer))
-		return BUF_MAX_DATA_SIZE - RB_LEN_TIME_EXTEND;
-	return BUF_MAX_DATA_SIZE;
+		return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+	return buffer->max_data_size;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
 
@@ -5730,7 +5738,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		/* If there is room at the end of the page to save the
 		 * missed events, then record it there.
 		 */
-		if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+		if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
 			memcpy(&bpage->data[commit], &missed_events,
 			       sizeof(missed_events));
 			local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5742,8 +5750,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	/*
 	 * This page may be off to user land. Zero it out here.
 	 */
-	if (commit < BUF_PAGE_SIZE)
-		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+	if (commit < buffer->subbuf_size)
+		memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
 
  out_unlock:
 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55dabee4c78b..76dd0a4c8cb5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5018,7 +5018,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
 {
 	struct trace_array *tr = inode->i_private;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79180aed13ee..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -616,6 +616,7 @@ void tracing_reset_all_online_cpus(void);
 void tracing_reset_all_online_cpus_unlocked(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
 int tracing_open_file_tr(struct inode *inode, struct file *filp);
 int tracing_release_file_tr(struct inode *inode, struct file *filp);
 int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b70d03818038..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 }
 
 static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 {
-	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_array *tr = filp->private_data;
 	struct trace_seq *s;
 	int r;
 
@@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 
 	trace_seq_init(s);
 
-	func(s);
+	ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, trace_seq_used(s));
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	ring_buffer_print_entry_header(s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos,
 				    s->buffer, trace_seq_used(s));
 
@@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
 	.release = subsystem_release,
 };
 
-static const struct file_operations ftrace_show_header_fops = {
-	.open = tracing_open_generic,
-	.read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+	.open = tracing_open_generic_tr,
+	.read = show_header_page_file,
 	.llseek = default_llseek,
+	.release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+	.open = tracing_open_generic_tr,
+	.read = show_header_event_file,
+	.llseek = default_llseek,
+	.release = tracing_release_generic_tr,
 };
 
 static int
@@ -3794,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data,
 		return 1;
 	}
 
-	if (strcmp(name, "header_page") == 0)
-		*data = ring_buffer_print_page_header;
-
-	else if (strcmp(name, "header_event") == 0)
-		*data = ring_buffer_print_entry_header;
+	if (strcmp(name, "header_page") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &ftrace_show_header_page_fops;
 
-	else
+	} else if (strcmp(name, "header_event") == 0) {
+		*mode = TRACE_MODE_READ;
+		*fops = &ftrace_show_header_event_fops;
+	} else
 		return 0;
 
-	*mode = TRACE_MODE_READ;
-	*fops = &ftrace_show_header_fops;
 	return 1;
 }
 
-- 
cgit v1.2.3


From 2808e31ec12e5fbe2ae25acc027fcdc67b1fb7f0 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:17 -0500
Subject: ring-buffer: Add interface for configuring trace sub buffer size

The trace ring buffer sub page size can be configured, per trace
instance. A new ftrace file "buffer_subbuf_order" is added to get and
set the size of the ring buffer sub page for current trace instance.
The size must be an order of system page size, that's why the new
interface works with system page order, instead of absolute page size:
0 means the ring buffer sub page is equal to 1 system page and so
forth:
0 - 1 system page
1 - 2 system pages
2 - 4 system pages
...
The ring buffer sub page size is limited between 1 and 128 system
pages. The default value is 1 system page.
New ring buffer APIs are introduced:
 ring_buffer_subbuf_order_set()
 ring_buffer_subbuf_order_get()
 ring_buffer_subbuf_size_get()

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.298324722@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  4 +++
 kernel/trace/ring_buffer.c  | 73 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c        | 48 +++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ce46218ce46d..12573306b889 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -202,6 +202,10 @@ struct trace_seq;
 int ring_buffer_print_entry_header(struct trace_seq *s);
 int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
 
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer);
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order);
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d9f656502400..20fc0121735d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -507,6 +507,7 @@ struct trace_buffer {
 	bool				time_stamp_abs;
 
 	unsigned int			subbuf_size;
+	unsigned int			subbuf_order;
 	unsigned int			max_data_size;
 };
 
@@ -5761,6 +5762,78 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+	return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+	if (!buffer)
+		return -EINVAL;
+
+	return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer pages equals to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ *  0 - 1 system page
+ *  1 - 2 system pages
+ *  3 - 4 system pages
+ *  ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+	int psize;
+
+	if (!buffer || order < 0)
+		return -EINVAL;
+
+	if (buffer->subbuf_order == order)
+		return 0;
+
+	psize = (1 << order) * PAGE_SIZE;
+	if (psize <= BUF_PAGE_HDR_SIZE)
+		return -EINVAL;
+
+	buffer->subbuf_order = order;
+	buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+	/* Todo: reset the buffer with the new page size */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 76dd0a4c8cb5..a010aba4c4a4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9358,6 +9358,51 @@ static const struct file_operations buffer_percent_fops = {
 	.llseek		= default_llseek,
 };
 
+static ssize_t
+buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer));
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_order_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+	if (ret)
+		return ret;
+
+	/* limit between 1 and 128 system pages */
+	if (val < 0 || val > 7)
+		return -EINVAL;
+
+	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
+	if (ret)
+		return ret;
+
+	(*ppos)++;
+
+	return cnt;
+}
+
+static const struct file_operations buffer_order_fops = {
+	.open		= tracing_open_generic_tr,
+	.read		= buffer_order_read,
+	.write		= buffer_order_write,
+	.release	= tracing_release_generic_tr,
+	.llseek		= default_llseek,
+};
+
 static struct dentry *trace_instance_dir;
 
 static void
@@ -9824,6 +9869,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
 			tr, &buffer_percent_fops);
 
+	trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer,
+			  tr, &buffer_order_fops);
+
 	create_trace_options_dir(tr);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-- 
cgit v1.2.3


From f9b94daa542a8d2532f0930f01cd9aec2d19621b Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:18 -0500
Subject: ring-buffer: Set new size of the ring buffer sub page

There are two approaches when changing the size of the ring buffer
sub page:
 1. Destroying all pages and allocating new pages with the new size.
 2. Allocating new pages, copying the content of the old pages before
    destroying them.
The first approach is easier, it is selected in the proposed
implementation. Changing the ring buffer sub page size is supposed to
not happen frequently. Usually, that size should be set only once,
when the buffer is not in use yet and is supposed to be empty.

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-5-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.588995543@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 80 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 20fc0121735d..b928bd03dd0e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -332,6 +332,7 @@ struct buffer_page {
 	unsigned	 read;		/* index for next read */
 	local_t		 entries;	/* entries on this page */
 	unsigned long	 real_end;	/* real end of data */
+	unsigned	 order;		/* order of the page */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -362,7 +363,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 
 static void free_buffer_page(struct buffer_page *bpage)
 {
-	free_page((unsigned long)bpage->page);
+	free_pages((unsigned long)bpage->page, bpage->order);
 	kfree(bpage);
 }
 
@@ -1460,10 +1461,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 
 		list_add(&bpage->list, pages);
 
-		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+		page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+					cpu_buffer->buffer->subbuf_order);
 		if (!page)
 			goto free_pages;
 		bpage->page = page_address(page);
+		bpage->order = cpu_buffer->buffer->subbuf_order;
 		rb_init_page(bpage->page);
 
 		if (user_thread && fatal_signal_pending(current))
@@ -1542,7 +1545,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	rb_check_bpage(cpu_buffer, bpage);
 
 	cpu_buffer->reader_page = bpage;
-	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
 	if (!page)
 		goto fail_free_reader;
 	bpage->page = page_address(page);
@@ -1626,6 +1630,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 		goto fail_free_buffer;
 
 	/* Default buffer page size - one system page */
+	buffer->subbuf_order = 0;
 	buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
 
 	/* Max payload is buffer page size - header (8bytes) */
@@ -5503,8 +5508,8 @@ void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 	if (bpage)
 		goto out;
 
-	page = alloc_pages_node(cpu_to_node(cpu),
-				GFP_KERNEL | __GFP_NORETRY, 0);
+	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+				cpu_buffer->buffer->subbuf_order);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
@@ -5553,7 +5558,7 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
 	local_irq_restore(flags);
 
  out:
-	free_page((unsigned long)bpage);
+	free_pages((unsigned long)bpage, buffer->subbuf_order);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
@@ -5813,7 +5818,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
  */
 int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 {
+	struct ring_buffer_per_cpu **cpu_buffers;
+	int old_order, old_size;
+	int nr_pages;
 	int psize;
+	int bsize;
+	int err;
+	int cpu;
 
 	if (!buffer || order < 0)
 		return -EINVAL;
@@ -5825,12 +5836,67 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 	if (psize <= BUF_PAGE_HDR_SIZE)
 		return -EINVAL;
 
+	bsize = sizeof(void *) * buffer->cpus;
+	cpu_buffers = kzalloc(bsize, GFP_KERNEL);
+	if (!cpu_buffers)
+		return -ENOMEM;
+
+	old_order = buffer->subbuf_order;
+	old_size = buffer->subbuf_size;
+
+	/* prevent another thread from changing buffer sizes */
+	mutex_lock(&buffer->mutex);
+	atomic_inc(&buffer->record_disabled);
+
+	/* Make sure all commits have finished */
+	synchronize_rcu();
+
 	buffer->subbuf_order = order;
 	buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
 
-	/* Todo: reset the buffer with the new page size */
+	/* Make sure all new buffers are allocated, before deleting the old ones */
+	for_each_buffer_cpu(buffer, cpu) {
+		if (!cpumask_test_cpu(cpu, buffer->cpumask))
+			continue;
+
+		nr_pages = buffer->buffers[cpu]->nr_pages;
+		cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
+		if (!cpu_buffers[cpu]) {
+			err = -ENOMEM;
+			goto error;
+		}
+	}
+
+	for_each_buffer_cpu(buffer, cpu) {
+		if (!cpumask_test_cpu(cpu, buffer->cpumask))
+			continue;
+
+		rb_free_cpu_buffer(buffer->buffers[cpu]);
+		buffer->buffers[cpu] = cpu_buffers[cpu];
+	}
+
+	atomic_dec(&buffer->record_disabled);
+	mutex_unlock(&buffer->mutex);
+
+	kfree(cpu_buffers);
 
 	return 0;
+
+error:
+	buffer->subbuf_order = old_order;
+	buffer->subbuf_size = old_size;
+
+	atomic_dec(&buffer->record_disabled);
+	mutex_unlock(&buffer->mutex);
+
+	for_each_buffer_cpu(buffer, cpu) {
+		if (!cpu_buffers[cpu])
+			continue;
+		kfree(cpu_buffers[cpu]);
+	}
+	kfree(cpu_buffers);
+
+	return err;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
 
-- 
cgit v1.2.3


From bce761d757452ba5eb77e11fecc37a04b67494e7 Mon Sep 17 00:00:00 2001
From: "Tzvetomir Stoyanov (VMware)" <tz.stoyanov@gmail.com>
Date: Tue, 19 Dec 2023 13:54:19 -0500
Subject: ring-buffer: Read and write to ring buffers with custom sub buffer
 size

As the size of the ring sub buffer page can be changed dynamically,
the logic that reads and writes to the buffer should be fixed to take
that into account. Some internal ring buffer APIs are changed:
 ring_buffer_alloc_read_page()
 ring_buffer_free_read_page()
 ring_buffer_read_page()
A new API is introduced:
 ring_buffer_read_page_data()

Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com
Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.875145995@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com>
[ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ]
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h          | 11 ++++--
 kernel/trace/ring_buffer.c           | 75 ++++++++++++++++++++++++++----------
 kernel/trace/ring_buffer_benchmark.c | 10 +++--
 kernel/trace/trace.c                 | 34 +++++++++-------
 4 files changed, 89 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 12573306b889..fa802db216f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer);
 size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu);
 size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu);
 
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data);
-int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
+struct buffer_data_read_page;
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+				struct buffer_data_read_page *page);
+int ring_buffer_read_page(struct trace_buffer *buffer,
+			  struct buffer_data_read_page *data_page,
 			  size_t len, int cpu, int full);
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page);
 
 struct trace_seq;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b928bd03dd0e..c2afcf98ea91 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -318,6 +318,11 @@ struct buffer_data_page {
 	unsigned char	 data[] RB_ALIGN_DATA;	/* data of buffer page */
 };
 
+struct buffer_data_read_page {
+	unsigned		order;	/* order of the page */
+	struct buffer_data_page	*data;	/* actual data, stored in this page */
+};
+
 /*
  * Note, the buffer_page list must be first. The buffer pages
  * are allocated in cache lines, which means that each buffer
@@ -5483,40 +5488,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
  * Returns:
  *  The page allocated, or ERR_PTR
  */
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct buffer_data_page *bpage = NULL;
+	struct buffer_data_read_page *bpage = NULL;
 	unsigned long flags;
 	struct page *page;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return ERR_PTR(-ENODEV);
 
+	bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+	if (!bpage)
+		return ERR_PTR(-ENOMEM);
+
+	bpage->order = buffer->subbuf_order;
 	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
 	arch_spin_lock(&cpu_buffer->lock);
 
 	if (cpu_buffer->free_page) {
-		bpage = cpu_buffer->free_page;
+		bpage->data = cpu_buffer->free_page;
 		cpu_buffer->free_page = NULL;
 	}
 
 	arch_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
-	if (bpage)
+	if (bpage->data)
 		goto out;
 
 	page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
 				cpu_buffer->buffer->subbuf_order);
-	if (!page)
+	if (!page) {
+		kfree(bpage);
 		return ERR_PTR(-ENOMEM);
+	}
 
-	bpage = page_address(page);
+	bpage->data = page_address(page);
 
  out:
-	rb_init_page(bpage);
+	rb_init_page(bpage->data);
 
 	return bpage;
 }
@@ -5526,14 +5539,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
  * ring_buffer_free_read_page - free an allocated read page
  * @buffer: the buffer the page was allocate for
  * @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
  *
  * Free a page allocated from ring_buffer_alloc_read_page.
  */
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+				struct buffer_data_read_page *data_page)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct buffer_data_page *bpage = data;
+	struct buffer_data_page *bpage = data_page->data;
 	struct page *page = virt_to_page(bpage);
 	unsigned long flags;
 
@@ -5542,8 +5556,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	/* If the page is still in use someplace else, we can't reuse it */
-	if (page_ref_count(page) > 1)
+	/*
+	 * If the page is still in use someplace else, or order of the page
+	 * is different from the subbuffer order of the buffer -
+	 * we can't reuse it
+	 */
+	if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
 		goto out;
 
 	local_irq_save(flags);
@@ -5558,7 +5576,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
 	local_irq_restore(flags);
 
  out:
-	free_pages((unsigned long)bpage, buffer->subbuf_order);
+	free_pages((unsigned long)bpage, data_page->order);
+	kfree(data_page);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
@@ -5579,9 +5598,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  *	rpage = ring_buffer_alloc_read_page(buffer, cpu);
  *	if (IS_ERR(rpage))
  *		return PTR_ERR(rpage);
- *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ *	ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
  *	if (ret >= 0)
- *		process_page(rpage, ret);
+ *		process_page(ring_buffer_read_page_data(rpage), ret);
+ *	ring_buffer_free_read_page(buffer, cpu, rpage);
  *
  * When @full is set, the function will not return true unless
  * the writer is off the reader page.
@@ -5596,7 +5616,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct trace_buffer *buffer,
-			  void **data_page, size_t len, int cpu, int full)
+			  struct buffer_data_read_page *data_page,
+			  size_t len, int cpu, int full)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
@@ -5621,10 +5642,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 
 	len -= BUF_PAGE_HDR_SIZE;
 
-	if (!data_page)
+	if (!data_page || !data_page->data)
+		goto out;
+	if (data_page->order != buffer->subbuf_order)
 		goto out;
 
-	bpage = *data_page;
+	bpage = data_page->data;
 	if (!bpage)
 		goto out;
 
@@ -5718,11 +5741,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		/* swap the pages */
 		rb_init_page(bpage);
 		bpage = reader->page;
-		reader->page = *data_page;
+		reader->page = data_page->data;
 		local_set(&reader->write, 0);
 		local_set(&reader->entries, 0);
 		reader->read = 0;
-		*data_page = bpage;
+		data_page->data = bpage;
 
 		/*
 		 * Use the real_end for the data size,
@@ -5767,6 +5790,18 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page:  the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+	return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
 /**
  * ring_buffer_subbuf_size_get - get size of the sub buffer.
  * @buffer: the buffer to get the sub buffer size from
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d79d..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
 
 static enum event_status read_page(int cpu)
 {
+	struct buffer_data_read_page *bpage;
 	struct ring_buffer_event *event;
 	struct rb_page *rpage;
 	unsigned long commit;
-	void *bpage;
+	int page_size;
 	int *entry;
 	int ret;
 	int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
 	if (IS_ERR(bpage))
 		return EVENT_DROPPED;
 
-	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+	page_size = ring_buffer_subbuf_size_get(buffer);
+	ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
 	if (ret >= 0) {
-		rpage = bpage;
+		rpage = ring_buffer_read_page_data(bpage);
 		/* The commit may have missed event flags set, clear them */
 		commit = local_read(&rpage->commit) & 0xfffff;
 		for (i = 0; i < commit && !test_error ; i += inc) {
 
-			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+			if (i >= (page_size - offsetof(struct rb_page, data))) {
 				TEST_ERROR();
 				break;
 			}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a010aba4c4a4..711095aa731d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8286,6 +8286,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 {
 	struct ftrace_buffer_info *info = filp->private_data;
 	struct trace_iterator *iter = &info->iter;
+	void *trace_data;
+	int page_size;
 	ssize_t ret = 0;
 	ssize_t size;
 
@@ -8297,6 +8299,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return -EBUSY;
 #endif
 
+	page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
 	if (!info->spare) {
 		info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
 							  iter->cpu_file);
@@ -8311,13 +8315,13 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return ret;
 
 	/* Do we have previous read data to read? */
-	if (info->read < PAGE_SIZE)
+	if (info->read < page_size)
 		goto read;
 
  again:
 	trace_access_lock(iter->cpu_file);
 	ret = ring_buffer_read_page(iter->array_buffer->buffer,
-				    &info->spare,
+				    info->spare,
 				    count,
 				    iter->cpu_file, 0);
 	trace_access_unlock(iter->cpu_file);
@@ -8338,11 +8342,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	info->read = 0;
  read:
-	size = PAGE_SIZE - info->read;
+	size = page_size - info->read;
 	if (size > count)
 		size = count;
-
-	ret = copy_to_user(ubuf, info->spare + info->read, size);
+	trace_data = ring_buffer_read_page_data(info->spare);
+	ret = copy_to_user(ubuf, trace_data + info->read, size);
 	if (ret == size)
 		return -EFAULT;
 
@@ -8453,6 +8457,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
+	int page_size;
 	int entries, i;
 	ssize_t ret = 0;
 
@@ -8461,13 +8466,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		return -EBUSY;
 #endif
 
-	if (*ppos & (PAGE_SIZE - 1))
+	page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+	if (*ppos & (page_size - 1))
 		return -EINVAL;
 
-	if (len & (PAGE_SIZE - 1)) {
-		if (len < PAGE_SIZE)
+	if (len & (page_size - 1)) {
+		if (len < page_size)
 			return -EINVAL;
-		len &= PAGE_MASK;
+		len &= (~(page_size - 1));
 	}
 
 	if (splice_grow_spd(pipe, &spd))
@@ -8477,7 +8483,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	trace_access_lock(iter->cpu_file);
 	entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
 
-	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
 		struct page *page;
 		int r;
 
@@ -8498,7 +8504,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		}
 		ref->cpu = iter->cpu_file;
 
-		r = ring_buffer_read_page(ref->buffer, &ref->page,
+		r = ring_buffer_read_page(ref->buffer, ref->page,
 					  len, iter->cpu_file, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8507,14 +8513,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 		}
 
-		page = virt_to_page(ref->page);
+		page = virt_to_page(ring_buffer_read_page_data(ref->page));
 
 		spd.pages[i] = page;
-		spd.partial[i].len = PAGE_SIZE;
+		spd.partial[i].len = page_size;
 		spd.partial[i].offset = 0;
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
-		*ppos += PAGE_SIZE;
+		*ppos += page_size;
 
 		entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
 	}
-- 
cgit v1.2.3


From cb665db94fc61512c9c94ed1d42af67e7bf6ce01 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:23 +0100
Subject: tick-sched: Fix function names in comments

When referencing functions in comments, it might be helpful to use full
function names (including the prefix) to be able to find it when grepping.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-2-anna-maria@linutronix.de
---
 kernel/time/tick-sched.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index be77b021e5d6..ff25fdff6b7c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -920,11 +920,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	}
 
 	/*
-	 * nohz_stop_sched_tick() can be called several times before
-	 * nohz_restart_sched_tick() is called. This happens when
-	 * interrupts arrive which do not cause a reschedule. In the
-	 * first call we save the current tick time, so we can restart
-	 * the scheduler tick in nohz_restart_sched_tick().
+	 * tick_nohz_stop_tick() can be called several times before
+	 * tick_nohz_restart_sched_tick() is called. This happens when
+	 * interrupts arrive which do not cause a reschedule. In the first
+	 * call we save the current tick time, so we can restart the
+	 * scheduler tick in tick_nohz_restart_sched_tick().
 	 */
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
-- 
cgit v1.2.3


From 318050671affa92fd166d988d08d4041c7b113c4 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:24 +0100
Subject: tick/sched: Cleanup confusing variables

tick_nohz_stop_tick() contains the expires (u64 variable) and tick
(ktime_t) variable. In the beginning the value of expires is written to
tick. Afterwards none of the variables is changed. They are only used for
checks.

Drop the not required variable tick and use always expires instead.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-3-anna-maria@linutronix.de
---
 kernel/time/tick-sched.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ff25fdff6b7c..fce3c6f0e4a6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -887,7 +887,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	u64 basemono = ts->timer_expires_base;
 	u64 expires = ts->timer_expires;
-	ktime_t tick = expires;
 
 	/* Make sure we won't be trying to stop it twice in a row. */
 	ts->timer_expires_base = 0;
@@ -910,7 +909,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	/* Skip reprogram of event if it's not changed */
 	if (ts->tick_stopped && (expires == ts->next_tick)) {
 		/* Sanity check: make sure clockevent is actually programmed */
-		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
 			return;
 
 		WARN_ON_ONCE(1);
@@ -935,7 +934,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 		trace_tick_stop(1, TICK_DEP_MASK_NONE);
 	}
 
-	ts->next_tick = tick;
+	ts->next_tick = expires;
 
 	/*
 	 * If the expiration time == KTIME_MAX, then we simply stop
@@ -950,11 +949,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
 	}
 
 	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-		hrtimer_start(&ts->sched_timer, tick,
+		hrtimer_start(&ts->sched_timer, expires,
 			      HRTIMER_MODE_ABS_PINNED_HARD);
 	} else {
-		hrtimer_set_expires(&ts->sched_timer, tick);
-		tick_program_event(tick, 1);
+		hrtimer_set_expires(&ts->sched_timer, expires);
+		tick_program_event(expires, 1);
 	}
 }
 
-- 
cgit v1.2.3


From cbf04a22026100dceeceec67fcbf1973383eb32f Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:25 +0100
Subject: tick-sched: Warn when next tick seems to be in the past

When the next tick is in the past, the delta between basemono and the next
tick gets negativ. But the next tick should never be in the past. The
negative effect of a wrong next tick might be a stop of the tick and timers
might expire late.

To prevent expensive debugging when changing underlying code, add a
WARN_ON_ONCE into this code path. To prevent complete misbehaviour, also
reset next_tick to basemono in this case.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-4-anna-maria@linutronix.de
---
 kernel/time/tick-sched.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fce3c6f0e4a6..a17d26002831 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 		ts->next_timer = next_tick;
 	}
 
+	/* Make sure next_tick is never before basemono! */
+	if (WARN_ON_ONCE(basemono > next_tick))
+		next_tick = basemono;
+
 	/*
 	 * If the tick is due in the next period, keep it ticking or
 	 * force prod the timer.
-- 
cgit v1.2.3


From dbcdcb62b59db2cf6a24113873b90da15c6f0b19 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:26 +0100
Subject: tracing/timers: Enhance timer_start tracepoint

For starting a timer, the timer is enqueued into a bucket of the timer
wheel. The bucket expiry is the defacto expiry of the timer but it is not
equal the timer expiry because of increasing granularity when bucket is in
a higher level of the wheel. To be able to figure out in a trace whether a
timer expired in time or not, the bucket expiry time is required as well.

Add bucket expiry time to the timer_start tracepoint and thereby simplify
the arguments.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-5-anna-maria@linutronix.de
---
 include/trace/events/timer.h | 20 ++++++++++----------
 kernel/time/timer.c          |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index b4bc2828fa09..99ada928d445 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -46,22 +46,21 @@ DEFINE_EVENT(timer_class, timer_init,
 
 /**
  * timer_start - called when the timer is started
- * @timer:	pointer to struct timer_list
- * @expires:	the timers expiry time
- * @flags:	the timers flags
+ * @timer:		pointer to struct timer_list
+ * @bucket_expiry:	the bucket expiry time
  */
 TRACE_EVENT(timer_start,
 
 	TP_PROTO(struct timer_list *timer,
-		unsigned long expires,
-		unsigned int flags),
+		unsigned long bucket_expiry),
 
-	TP_ARGS(timer, expires, flags),
+	TP_ARGS(timer, bucket_expiry),
 
 	TP_STRUCT__entry(
 		__field( void *,	timer		)
 		__field( void *,	function	)
 		__field( unsigned long,	expires		)
+		__field( unsigned long,	bucket_expiry	)
 		__field( unsigned long,	now		)
 		__field( unsigned int,	flags		)
 	),
@@ -69,15 +68,16 @@ TRACE_EVENT(timer_start,
 	TP_fast_assign(
 		__entry->timer		= timer;
 		__entry->function	= timer->function;
-		__entry->expires	= expires;
+		__entry->expires	= timer->expires;
+		__entry->bucket_expiry	= bucket_expiry;
 		__entry->now		= jiffies;
-		__entry->flags		= flags;
+		__entry->flags		= timer->flags;
 	),
 
-	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
+	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
 		  __entry->timer, __entry->function, __entry->expires,
 		  (long)__entry->expires - __entry->now,
-		  __entry->flags & TIMER_CPUMASK,
+		  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
 		  __entry->flags >> TIMER_ARRAYSHIFT,
 		  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
 );
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 63a8ce7177dd..a81d793a43d0 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -606,7 +606,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
 	__set_bit(idx, base->pending_map);
 	timer_set_idx(timer, idx);
 
-	trace_timer_start(timer, timer->expires, timer->flags);
+	trace_timer_start(timer, bucket_expiry);
 
 	/*
 	 * Check whether this is the new first expiring timer. The
-- 
cgit v1.2.3


From b573c73101d8786446535b2ab28cbc8907bda9a9 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:27 +0100
Subject: tracing/timers: Add tracepoint for tracking timer base is_idle flag

When debugging timer code the timer tracepoints are very important. There
is no tracepoint when the is_idle flag of the timer base changes. Instead
of always adding manually trace_printk(), add tracepoints which can be
easily enabled whenever required.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-6-anna-maria@linutronix.de
---
 include/trace/events/timer.h | 20 ++++++++++++++++++++
 kernel/time/timer.c          | 14 +++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 99ada928d445..1ef58a04fc57 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel,
 	TP_ARGS(timer)
 );
 
+TRACE_EVENT(timer_base_idle,
+
+	TP_PROTO(bool is_idle, unsigned int cpu),
+
+	TP_ARGS(is_idle, cpu),
+
+	TP_STRUCT__entry(
+		__field( bool,		is_idle	)
+		__field( unsigned int,	cpu	)
+	),
+
+	TP_fast_assign(
+		__entry->is_idle	= is_idle;
+		__entry->cpu		= cpu;
+	),
+
+	TP_printk("is_idle=%d cpu=%d",
+		  __entry->is_idle, __entry->cpu)
+);
+
 #define decode_clockid(type)						\
 	__print_symbolic(type,						\
 		{ CLOCK_REALTIME,	"CLOCK_REALTIME"	},	\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a81d793a43d0..ed8d6063d9ef 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1950,7 +1950,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-		base->is_idle = false;
+		if (base->is_idle) {
+			base->is_idle = false;
+			trace_timer_base_idle(false, base->cpu);
+		}
 	} else {
 		if (base->timers_pending)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
@@ -1961,8 +1964,10 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		 * logic is only maintained for the BASE_STD base, deferrable
 		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC && !base->is_idle) {
 			base->is_idle = true;
+			trace_timer_base_idle(true, base->cpu);
+		}
 	}
 	raw_spin_unlock(&base->lock);
 
@@ -1984,7 +1989,10 @@ void timer_clear_idle(void)
 	 * sending the IPI a few instructions smaller for the cost of taking
 	 * the lock in the exit from idle path.
 	 */
-	base->is_idle = false;
+	if (base->is_idle) {
+		base->is_idle = false;
+		trace_timer_base_idle(false, smp_processor_id());
+	}
 }
 #endif
 
-- 
cgit v1.2.3


From d124c3393e798b1fb142ee728d5c8976d11e722d Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:28 +0100
Subject: timers: Do not IPI for deferrable timers

Deferrable timers do not prevent CPU from going idle and are not taken into
account on idle path. Sending an IPI to a remote CPU when a new first
deferrable timer was enqueued will wake up the remote CPU but nothing will
be done regarding the deferrable timers.

Drop IPI completely when a new first deferrable timer was enqueued.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-7-anna-maria@linutronix.de
---
 kernel/time/timer.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ed8d6063d9ef..91882059bf3d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
 static void
 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 {
-	if (!is_timers_nohz_active())
-		return;
-
 	/*
-	 * TODO: This wants some optimizing similar to the code below, but we
-	 * will do that when we switch from push to pull for deferrable timers.
+	 * Deferrable timers do not prevent the CPU from entering dynticks and
+	 * are not taken into account on the idle/nohz_full path. An IPI when a
+	 * new deferrable timer is enqueued will wake up the remote CPU but
+	 * nothing will be done with the deferrable timer base. Therefore skip
+	 * the remote IPI for deferrable timers completely.
 	 */
-	if (timer->flags & TIMER_DEFERRABLE) {
-		if (tick_nohz_full_cpu(base->cpu))
-			wake_up_nohz_cpu(base->cpu);
+	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
 		return;
-	}
 
 	/*
 	 * We might have to IPI the remote CPU if the base is idle and the
-- 
cgit v1.2.3


From b5e6f59888c7bde3c05f61b3ce06b78a86713fc0 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:29 +0100
Subject: timers: Move store of next event into __next_timer_interrupt()

Both call sites of __next_timer_interrupt() store the return value directly
in base->next_expiry. Move the store into __next_timer_interrupt() and to
make its purpose more clear, rename the function to next_expiry_recalc().

No functional change.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-8-anna-maria@linutronix.de
---
 kernel/time/timer.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 91882059bf3d..490ff8e66fc2 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1800,8 +1800,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
 /*
  * Search the first expiring timer in the various clock levels. Caller must
  * hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
  */
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void next_expiry_recalc(struct timer_base *base)
 {
 	unsigned long clk, next, adj;
 	unsigned lvl, offset = 0;
@@ -1867,10 +1869,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
 		clk += adj;
 	}
 
+	base->next_expiry = next;
 	base->next_expiry_recalc = false;
 	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
-
-	return next;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -1930,7 +1931,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	raw_spin_lock(&base->lock);
 	if (base->next_expiry_recalc)
-		base->next_expiry = __next_timer_interrupt(base);
+		next_expiry_recalc(base);
 	nextevt = base->next_expiry;
 
 	/*
@@ -2021,7 +2022,7 @@ static inline void __run_timers(struct timer_base *base)
 		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
 			     && base->timers_pending);
 		base->clk++;
-		base->next_expiry = __next_timer_interrupt(base);
+		next_expiry_recalc(base);
 
 		while (levels--)
 			expire_timers(base, heads + levels);
-- 
cgit v1.2.3


From 8a2c9c7e7848d7f63d38b698209148b5bb4ba7f3 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:30 +0100
Subject: timers: Clarify check in forward_timer_base()

The current check whether a forward of the timer base is required can be
simplified by using an already existing comparison function which is easier
to read. The related comment is outdated and was not updated when the check
changed in commit 36cd28a4cdd0 ("timers: Lower base clock forwarding
threshold").

Use time_before_eq() for the check and replace the comment by copying the
comment from the same check inside get_next_timer_interrupt(). Move the
precious information of the outdated comment to the proper place in
__run_timers().

No functional change.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-9-anna-maria@linutronix.de
---
 kernel/time/timer.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 490ff8e66fc2..f75f932b128e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -944,11 +944,10 @@ static inline void forward_timer_base(struct timer_base *base)
 	unsigned long jnow = READ_ONCE(jiffies);
 
 	/*
-	 * No need to forward if we are close enough below jiffies.
-	 * Also while executing timers, base->clk is 1 offset ahead
-	 * of jiffies to avoid endless requeuing to current jiffies.
+	 * Check whether we can forward the base. We can only do that when
+	 * @basej is past base->clk otherwise we might rewind base->clk.
 	 */
-	if ((long)(jnow - base->clk) < 1)
+	if (time_before_eq(jnow, base->clk))
 		return;
 
 	/*
@@ -2021,6 +2020,10 @@ static inline void __run_timers(struct timer_base *base)
 		 */
 		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
 			     && base->timers_pending);
+		/*
+		 * While executing timers, base->clk is set 1 offset ahead of
+		 * jiffies to avoid endless requeuing to current jiffies.
+		 */
 		base->clk++;
 		next_expiry_recalc(base);
 
-- 
cgit v1.2.3


From 1e490484aa3af42d4eeffabf96d6a02be69d586b Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:31 +0100
Subject: timers: Split out forward timer base functionality

Forwarding timer base is done when the next expiry value is calculated and
when a new timer is enqueued. When the next expiry value is calculated the
jiffies value is already available and does not need to be reread a second
time.

Splitting out the forward timer base functionality to make it executable
via both contextes - those where jiffies are already known and those, where
jiffies need to be read.

No functional change.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-10-anna-maria@linutronix.de
---
 kernel/time/timer.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f75f932b128e..5b02e169ab23 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -939,30 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
 	return get_timer_this_cpu_base(tflags);
 }
 
-static inline void forward_timer_base(struct timer_base *base)
+static inline void __forward_timer_base(struct timer_base *base,
+					unsigned long basej)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
-
 	/*
 	 * Check whether we can forward the base. We can only do that when
 	 * @basej is past base->clk otherwise we might rewind base->clk.
 	 */
-	if (time_before_eq(jnow, base->clk))
+	if (time_before_eq(basej, base->clk))
 		return;
 
 	/*
 	 * If the next expiry value is > jiffies, then we fast forward to
 	 * jiffies otherwise we forward to the next expiry value.
 	 */
-	if (time_after(base->next_expiry, jnow)) {
-		base->clk = jnow;
+	if (time_after(base->next_expiry, basej)) {
+		base->clk = basej;
 	} else {
 		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
 			return;
 		base->clk = base->next_expiry;
 	}
+
 }
 
+static inline void forward_timer_base(struct timer_base *base)
+{
+	__forward_timer_base(base, READ_ONCE(jiffies));
+}
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
-- 
cgit v1.2.3


From 7a39a5080ef0e3cf233d92165f6a778f08a08244 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:32 +0100
Subject: timers: Use already existing function for forwarding timer base

There is an already existing function for forwarding the timer
base. Forwarding the timer base is implemented directly in
get_next_timer_interrupt() as well.

Remove the code duplication and invoke __forward_timer_base() instead.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-11-anna-maria@linutronix.de
---
 kernel/time/timer.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 5b02e169ab23..1a73d396101b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1939,15 +1939,9 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	/*
 	 * We have a fresh next event. Check whether we can forward the
-	 * base. We can only do that when @basej is past base->clk
-	 * otherwise we might rewind base->clk.
+	 * base.
 	 */
-	if (time_after(basej, base->clk)) {
-		if (time_after(nextevt, basej))
-			base->clk = basej;
-		else if (time_after(nextevt, base->clk))
-			base->clk = nextevt;
-	}
+	__forward_timer_base(base, basej);
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-- 
cgit v1.2.3


From bb8caad5083f8fbba70faf41f1d3bab7cf09da6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:33 +0100
Subject: timers: Rework idle logic

To improve readability of the code, split base->idle calculation and
expires calculation into separate parts. While at it, update the comment
about timer base idle marking.

Thereby the following subtle change happens if the next event is just one
jiffy ahead and the tick was already stopped: Originally base->is_idle
remains true in this situation. Now base->is_idle turns to false. This may
spare an IPI if a timer is enqueued remotely to an idle CPU that is going
to tick on the next jiffy.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-12-anna-maria@linutronix.de
---
 kernel/time/timer.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1a73d396101b..cf51655add64 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1924,6 +1924,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool was_idle;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1943,27 +1944,26 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	 */
 	__forward_timer_base(base, basej);
 
-	if (time_before_eq(nextevt, basej)) {
-		expires = basem;
-		if (base->is_idle) {
-			base->is_idle = false;
-			trace_timer_base_idle(false, base->cpu);
-		}
-	} else {
-		if (base->timers_pending)
-			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
-		/*
-		 * If we expect to sleep more than a tick, mark the base idle.
-		 * Also the tick is stopped so any added timer must forward
-		 * the base clk itself to keep granularity small. This idle
-		 * logic is only maintained for the BASE_STD base, deferrable
-		 * timers may still see large granularity skew (by design).
-		 */
-		if ((expires - basem) > TICK_NSEC && !base->is_idle) {
-			base->is_idle = true;
-			trace_timer_base_idle(true, base->cpu);
-		}
+	if (base->timers_pending) {
+		/* If we missed a tick already, force 0 delta */
+		if (time_before(nextevt, basej))
+			nextevt = basej;
+		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 	}
+
+	/*
+	 * Base is idle if the next event is more than a tick away.
+	 *
+	 * If the base is marked idle then any timer add operation must forward
+	 * the base clk itself to keep granularity small. This idle logic is
+	 * only maintained for the BASE_STD base, deferrable timers may still
+	 * see large granularity skew (by design).
+	 */
+	was_idle = base->is_idle;
+	base->is_idle = time_after(nextevt, basej + 1);
+	if (was_idle != base->is_idle)
+		trace_timer_base_idle(base->is_idle, base->cpu);
+
 	raw_spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
-- 
cgit v1.2.3


From da65f29dada7f7cbbf0d6375b88a0316f5f7d6f5 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Fri, 1 Dec 2023 10:26:34 +0100
Subject: timers: Fix nextevt calculation when no timers are pending

When no timer is queued into an empty timer base, the next_expiry will not
be updated. It was originally calculated as

  base->clk + NEXT_TIMER_MAX_DELTA

When the timer base stays empty long enough (> NEXT_TIMER_MAX_DELTA), the
next_expiry value of the empty base suggests that there is a timer pending
soon. This might be more a kind of a theoretical problem, but the fix
doesn't hurt.

Use only base->next_expiry value as nextevt when timers are
pending. Otherwise nextevt will be jiffies + NEXT_TIMER_MAX_DELTA. As all
information is in place, update base->next_expiry value of the empty timer
base as well.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20231201092654.34614-13-anna-maria@linutronix.de
---
 kernel/time/timer.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index cf51655add64..352b161113cd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1922,8 +1922,8 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
 	u64 expires = KTIME_MAX;
-	unsigned long nextevt;
 	bool was_idle;
 
 	/*
@@ -1936,7 +1936,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	raw_spin_lock(&base->lock);
 	if (base->next_expiry_recalc)
 		next_expiry_recalc(base);
-	nextevt = base->next_expiry;
 
 	/*
 	 * We have a fresh next event. Check whether we can forward the
@@ -1945,10 +1944,20 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	__forward_timer_base(base, basej);
 
 	if (base->timers_pending) {
+		nextevt = base->next_expiry;
+
 		/* If we missed a tick already, force 0 delta */
 		if (time_before(nextevt, basej))
 			nextevt = basej;
 		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
+	} else {
+		/*
+		 * Move next_expiry for the empty base into the future to
+		 * prevent a unnecessary raise of the timer softirq when the
+		 * next_expiry value will be reached even if there is no timer
+		 * pending.
+		 */
+		base->next_expiry = nextevt;
 	}
 
 	/*
-- 
cgit v1.2.3


From 7beb82b7d590e51f698b1cf590b0e2785db1c498 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 19 Dec 2023 22:12:26 -0800
Subject: tracing/synthetic: fix kernel-doc warnings

scripts/kernel-doc warns about using @args: for variadic arguments to
functions. Documentation/doc-guide/kernel-doc.rst says that this should
be written as @...: instead, so update the source code to match that,
preventing the warnings.

trace_events_synth.c:1165: warning: Excess function parameter 'args' description in '__synth_event_gen_cmd_start'
trace_events_synth.c:1714: warning: Excess function parameter 'args' description in 'synth_event_trace'

Link: https://lore.kernel.org/linux-trace-kernel/20231220061226.30962-1-rdunlap@infradead.org

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: 35ca5207c2d11 ("tracing: Add synthetic event command generation functions")
Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions")
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_synth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 846e02c0fb59..e7af286af4f1 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields);
  * @cmd: A pointer to the dynevent_cmd struct representing the new event
  * @name: The name of the synthetic event
  * @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
  *
  * NOTE: Users normally won't want to call this function directly, but
  * rather use the synth_event_gen_cmd_start() wrapper, which
@@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state)
  * synth_event_trace - Trace a synthetic event
  * @file: The trace_event_file representing the synthetic event
  * @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
+ * @...: Variable number of args containing the event values
  *
  * Trace a synthetic event using the values passed in the variable
  * argument list.
-- 
cgit v1.2.3


From e0f4bd26e29bf6162cdc9dc6fb7522bde7b74d07 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Wed, 20 Dec 2023 08:35:35 +0800
Subject: PM: sleep: Remove obsolete comment from unlock_system_sleep()

With the freezer changes introduced by commit f5d39b020809
("freezer,sched: Rewrite core freezer logic"), the comment in
unlock_system_sleep() has become obsolete, there is no need to
retain it.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/main.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6425ae3e8b0..b1ae9b677d03 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep);
 
 void unlock_system_sleep(unsigned int flags)
 {
-	/*
-	 * Don't use freezer_count() because we don't want the call to
-	 * try_to_freeze() here.
-	 *
-	 * Reason:
-	 * Fundamentally, we just don't need it, because freezing condition
-	 * doesn't come into effect until we release the
-	 * system_transition_mutex lock, since the freezer always works with
-	 * system_transition_mutex held.
-	 *
-	 * More importantly, in the case of hibernation,
-	 * unlock_system_sleep() gets called in snapshot_read() and
-	 * snapshot_write() when the freezing condition is still in effect.
-	 * Which means, if we use try_to_freeze() here, it would make them
-	 * enter the refrigerator, thus causing hibernation to lockup.
-	 */
 	if (!(flags & PF_NOFREEZE))
 		current->flags &= ~PF_NOFREEZE;
 	mutex_unlock(&system_transition_mutex);
-- 
cgit v1.2.3


From dadce3fbaf10250b35d540caff475ff93b259de0 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 19 Dec 2023 22:02:46 -0800
Subject: PM: hibernate: Repair excess function parameter description warning

Function swsusp_close() does not have any parameters, so remove the
description of parameter @exclusive to prevent this warning.

swap.c:1573: warning: Excess function parameter 'exclusive' description in 'swsusp_close'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/swap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 975e7195573b..6053ddddaf65 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1566,7 +1566,6 @@ put:
 
 /**
  * swsusp_close - close resume device.
- * @exclusive: Close the resume device which is exclusively opened.
  */
 
 void swsusp_close(void)
-- 
cgit v1.2.3


From 7ac5c53e00735d183a0f5e2cfce5eeb6c16319f2 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Sat, 16 Dec 2023 21:10:51 +0800
Subject: bpf: Use c->unit_size to select target cache during free
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At present, bpf memory allocator uses check_obj_size() to ensure that
ksize() of allocated pointer is equal with the unit_size of used
bpf_mem_cache. Its purpose is to prevent bpf_mem_free() from selecting
a bpf_mem_cache which has different unit_size compared with the
bpf_mem_cache used for allocation. But as reported by lkp, the return
value of ksize() or kmalloc_size_roundup() may change due to slab merge
and it will lead to the warning report in check_obj_size().

The reported warning happened as follows:
(1) in bpf_mem_cache_adjust_size(), kmalloc_size_roundup(96) returns the
object_size of kmalloc-96 instead of kmalloc-cg-96. The object_size of
kmalloc-96 is 96, so size_index for 96 is not adjusted accordingly.
(2) the object_size of kmalloc-cg-96 is adjust from 96 to 128 due to
slab merge in __kmem_cache_alias(). For SLAB, SLAB_HWCACHE_ALIGN is
enabled by default for kmalloc slab, so align is 64 and size is 128 for
kmalloc-cg-96. SLUB has a similar merge logic, but its object_size will
not be changed, because its align is 8 under x86-64.
(3) when unit_alloc() does kmalloc_node(96, __GFP_ACCOUNT, node),
ksize() returns 128 instead of 96 for the returned pointer.
(4) the warning in check_obj_size() is triggered.

Considering the slab merge can happen in anytime (e.g, a slab created in
a new module), the following case is also possible: during the
initialization of bpf_global_ma, there is no slab merge and ksize() for
a 96-bytes object returns 96. But after that a new slab created by a
kernel module is merged to kmalloc-cg-96 and the object_size of
kmalloc-cg-96 is adjust from 96 to 128 (which is possible for x86-64 +
CONFIG_SLAB, because its alignment requirement is 64 for 96-bytes slab).
So soon or later, when bpf_global_ma frees a 96-byte-sized pointer
which is allocated from bpf_mem_cache with unit_size=96, bpf_mem_free()
will free the pointer through a bpf_mem_cache in which unit_size is 128,
because the return value of ksize() changes. The warning for the
mismatch will be triggered again.

A feasible fix is introducing similar APIs compared with ksize() and
kmalloc_size_roundup() to return the actually-allocated size instead of
size which may change due to slab merge, but it will introduce
unnecessary dependency on the implementation details of mm subsystem.

As for now the pointer of bpf_mem_cache is saved in the 8-bytes area
(or 4-bytes under 32-bit host) above the returned pointer, using
unit_size in the saved bpf_mem_cache to select the target cache instead
of inferring the size from the pointer itself. Beside no extra
dependency on mm subsystem, the performance for bpf_mem_free_rcu() is
also improved as shown below.

Before applying the patch, the performances of bpf_mem_alloc() and
bpf_mem_free_rcu() on 8-CPUs VM with one producer are as follows:

kmalloc : alloc 11.69 ± 0.28M/s free 29.58 ± 0.93M/s
percpu  : alloc 14.11 ± 0.52M/s free 14.29 ± 0.99M/s

After apply the patch, the performance for bpf_mem_free_rcu() increases
9% and 146% for kmalloc memory and per-cpu memory respectively:

kmalloc: alloc 11.01 ± 0.03M/s free   32.42 ± 0.48M/s
percpu:  alloc 12.84 ± 0.12M/s free   35.24 ± 0.23M/s

After the fixes, there is no need to adjust size_index to fix the
mismatch between allocation and free, so remove it as well. Also return
NULL instead of ZERO_SIZE_PTR for zero-sized alloc in bpf_mem_alloc(),
because there is no bpf_mem_cache pointer saved above ZERO_SIZE_PTR.

Fixes: 9077fc228f09 ("bpf: Use kmalloc_size_roundup() to adjust size_index")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/bpf/202310302113.9f8fe705-oliver.sang@intel.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231216131052.27621-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 105 ++++++--------------------------------------------
 1 file changed, 11 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 6a51cfe4c2d6..aa0fbf000a12 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -490,27 +490,6 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
 }
 
-static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
-{
-	struct llist_node *first;
-	unsigned int obj_size;
-
-	first = c->free_llist.first;
-	if (!first)
-		return 0;
-
-	if (c->percpu_size)
-		obj_size = pcpu_alloc_size(((void **)first)[1]);
-	else
-		obj_size = ksize(first);
-	if (obj_size != c->unit_size) {
-		WARN_ONCE(1, "bpf_mem_cache[%u]: percpu %d, unexpected object size %u, expect %u\n",
-			  idx, c->percpu_size, obj_size, c->unit_size);
-		return -EINVAL;
-	}
-	return 0;
-}
-
 /* When size != 0 bpf_mem_cache for each cpu.
  * This is typical bpf hash map use case when all elements have equal size.
  *
@@ -521,10 +500,10 @@ static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 {
 	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
-	int cpu, i, err, unit_size, percpu_size = 0;
 	struct bpf_mem_caches *cc, __percpu *pcc;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
+	int cpu, i, unit_size, percpu_size = 0;
 
 	/* room for llist_node and per-cpu pointer */
 	if (percpu)
@@ -560,7 +539,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
 	if (!pcc)
 		return -ENOMEM;
-	err = 0;
 #ifdef CONFIG_MEMCG_KMEM
 	objcg = get_obj_cgroup_from_current();
 #endif
@@ -574,28 +552,12 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c->tgt = c;
 
 			init_refill_work(c);
-			/* Another bpf_mem_cache will be used when allocating
-			 * c->unit_size in bpf_mem_alloc(), so doesn't prefill
-			 * for the bpf_mem_cache because these free objects will
-			 * never be used.
-			 */
-			if (i != bpf_mem_cache_idx(c->unit_size))
-				continue;
 			prefill_mem_cache(c, cpu);
-			err = check_obj_size(c, i);
-			if (err)
-				goto out;
 		}
 	}
 
-out:
 	ma->caches = pcc;
-	/* refill_work is either zeroed or initialized, so it is safe to
-	 * call irq_work_sync().
-	 */
-	if (err)
-		bpf_mem_alloc_destroy(ma);
-	return err;
+	return 0;
 }
 
 static void drain_mem_cache(struct bpf_mem_cache *c)
@@ -869,7 +831,7 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
 	void *ret;
 
 	if (!size)
-		return ZERO_SIZE_PTR;
+		return NULL;
 
 	idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
 	if (idx < 0)
@@ -879,26 +841,17 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
 
-static notrace int bpf_mem_free_idx(void *ptr, bool percpu)
-{
-	size_t size;
-
-	if (percpu)
-		size = pcpu_alloc_size(*((void **)ptr));
-	else
-		size = ksize(ptr - LLIST_NODE_SZ);
-	return bpf_mem_cache_idx(size);
-}
-
 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 {
+	struct bpf_mem_cache *c;
 	int idx;
 
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_free_idx(ptr, ma->percpu);
-	if (idx < 0)
+	c = *(void **)(ptr - LLIST_NODE_SZ);
+	idx = bpf_mem_cache_idx(c->unit_size);
+	if (WARN_ON_ONCE(idx < 0))
 		return;
 
 	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
@@ -906,13 +859,15 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 
 void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
 {
+	struct bpf_mem_cache *c;
 	int idx;
 
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_free_idx(ptr, ma->percpu);
-	if (idx < 0)
+	c = *(void **)(ptr - LLIST_NODE_SZ);
+	idx = bpf_mem_cache_idx(c->unit_size);
+	if (WARN_ON_ONCE(idx < 0))
 		return;
 
 	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
@@ -986,41 +941,3 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
 
 	return !ret ? NULL : ret + LLIST_NODE_SZ;
 }
-
-/* The alignment of dynamic per-cpu area is 8, so c->unit_size and the
- * actual size of dynamic per-cpu area will always be matched and there is
- * no need to adjust size_index for per-cpu allocation. However for the
- * simplicity of the implementation, use an unified size_index for both
- * kmalloc and per-cpu allocation.
- */
-static __init int bpf_mem_cache_adjust_size(void)
-{
-	unsigned int size;
-
-	/* Adjusting the indexes in size_index() according to the object_size
-	 * of underlying slab cache, so bpf_mem_alloc() will select a
-	 * bpf_mem_cache with unit_size equal to the object_size of
-	 * the underlying slab cache.
-	 *
-	 * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is
-	 * 256-bytes, so only do adjustment for [8-bytes, 192-bytes].
-	 */
-	for (size = 192; size >= 8; size -= 8) {
-		unsigned int kmalloc_size, index;
-
-		kmalloc_size = kmalloc_size_roundup(size);
-		if (kmalloc_size == size)
-			continue;
-
-		if (kmalloc_size <= 192)
-			index = size_index[(kmalloc_size - 1) / 8];
-		else
-			index = fls(kmalloc_size - 1) - 1;
-		/* Only overwrite if necessary */
-		if (size_index[(size - 1) / 8] != index)
-			size_index[(size - 1) / 8] = index;
-	}
-
-	return 0;
-}
-subsys_initcall(bpf_mem_cache_adjust_size);
-- 
cgit v1.2.3


From c1ad12ee0efc07244be37f69311e6f7c4ac98e62 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 23 Oct 2023 13:01:54 +0200
Subject: kexec: fix KEXEC_FILE dependencies

The cleanup for the CONFIG_KEXEC Kconfig logic accidentally changed the
'depends on CRYPTO=y' dependency to a plain 'depends on CRYPTO', which
causes a link failure when all the crypto support is in a loadable module
and kexec_file support is built-in:

x86_64-linux-ld: vmlinux.o: in function `__x64_sys_kexec_file_load':
(.text+0x32e30a): undefined reference to `crypto_alloc_shash'
x86_64-linux-ld: (.text+0x32e58e): undefined reference to `crypto_shash_update'
x86_64-linux-ld: (.text+0x32e6ee): undefined reference to `crypto_shash_final'

Both s390 and x86 have this problem, while ppc64 and riscv have the
correct dependency already.  On riscv, the dependency is only used for the
purgatory, not for the kexec_file code itself, which may be a bit
surprising as it means that with CONFIG_CRYPTO=m, it is possible to enable
KEXEC_FILE but then the purgatory code is silently left out.

Move this into the common Kconfig.kexec file in a way that is correct
everywhere, using the dependency on CRYPTO_SHA256=y only when the
purgatory code is available.  This requires reversing the dependency
between ARCH_SUPPORTS_KEXEC_PURGATORY and KEXEC_FILE, but the effect
remains the same, other than making riscv behave like the other ones.

On s390, there is an additional dependency on CRYPTO_SHA256_S390, which
should technically not be required but gives better performance.  Remove
this dependency here, noting that it was not present in the initial
Kconfig code but was brought in without an explanation in commit
71406883fd357 ("s390/kexec_file: Add kexec_file_load system call").

[arnd@arndb.de: fix riscv build]
  Link: https://lkml.kernel.org/r/67ddd260-d424-4229-a815-e3fcfb864a77@app.fastmail.com
Link: https://lkml.kernel.org/r/20231023110308.1202042-1-arnd@kernel.org
Fixes: 6af5138083005 ("x86/kexec: refactor for kernel/Kconfig.kexec")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Eric DeVolder <eric_devolder@yahoo.com>
Tested-by: Eric DeVolder <eric_devolder@yahoo.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/Kconfig | 4 ++--
 arch/riscv/Kconfig   | 4 +---
 arch/s390/Kconfig    | 4 ++--
 arch/x86/Kconfig     | 4 ++--
 kernel/Kconfig.kexec | 1 +
 5 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f105ee4f3cf..1f11a62809f2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -608,10 +608,10 @@ config ARCH_SUPPORTS_KEXEC
 	def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
 
 config ARCH_SUPPORTS_KEXEC_FILE
-	def_bool PPC64 && CRYPTO=y && CRYPTO_SHA256=y
+	def_bool PPC64
 
 config ARCH_SUPPORTS_KEXEC_PURGATORY
-	def_bool KEXEC_FILE
+	def_bool y
 
 config ARCH_SELECTS_KEXEC_FILE
 	def_bool y
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 24c1799e2ec4..cd4c9a204d08 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -702,9 +702,7 @@ config ARCH_SELECTS_KEXEC_FILE
 	select KEXEC_ELF
 
 config ARCH_SUPPORTS_KEXEC_PURGATORY
-	def_bool KEXEC_FILE
-	depends on CRYPTO=y
-	depends on CRYPTO_SHA256=y
+	def_bool ARCH_SUPPORTS_KEXEC_FILE
 
 config ARCH_SUPPORTS_CRASH_DUMP
 	def_bool y
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 3bec98d20283..d5d8f99d1f25 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -254,13 +254,13 @@ config ARCH_SUPPORTS_KEXEC
 	def_bool y
 
 config ARCH_SUPPORTS_KEXEC_FILE
-	def_bool CRYPTO && CRYPTO_SHA256 && CRYPTO_SHA256_S390
+	def_bool y
 
 config ARCH_SUPPORTS_KEXEC_SIG
 	def_bool MODULE_SIG_FORMAT
 
 config ARCH_SUPPORTS_KEXEC_PURGATORY
-	def_bool KEXEC_FILE
+	def_bool y
 
 config ARCH_SUPPORTS_CRASH_DUMP
 	def_bool y
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3762f41bb092..1566748f16c4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2072,7 +2072,7 @@ config ARCH_SUPPORTS_KEXEC
 	def_bool y
 
 config ARCH_SUPPORTS_KEXEC_FILE
-	def_bool X86_64 && CRYPTO && CRYPTO_SHA256
+	def_bool X86_64
 
 config ARCH_SELECTS_KEXEC_FILE
 	def_bool y
@@ -2080,7 +2080,7 @@ config ARCH_SELECTS_KEXEC_FILE
 	select HAVE_IMA_KEXEC if IMA
 
 config ARCH_SUPPORTS_KEXEC_PURGATORY
-	def_bool KEXEC_FILE
+	def_bool y
 
 config ARCH_SUPPORTS_KEXEC_SIG
 	def_bool y
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 2fd510256604..92120e396008 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -36,6 +36,7 @@ config KEXEC
 config KEXEC_FILE
 	bool "Enable kexec file based system call"
 	depends on ARCH_SUPPORTS_KEXEC_FILE
+	depends on CRYPTO_SHA256=y || !ARCH_SUPPORTS_KEXEC_PURGATORY
 	select KEXEC_CORE
 	help
 	  This is new version of kexec system call. This system call is
-- 
cgit v1.2.3


From e63bde3d9417f8318d6dd0d0fafa35ebf307aabd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 23 Oct 2023 13:01:55 +0200
Subject: kexec: select CRYPTO from KEXEC_FILE instead of depending on it

All other users of crypto code use 'select' instead of 'depends on', so do
the same thing with KEXEC_FILE for consistency.

In practice this makes very little difference as kernels with kexec
support are very likely to also include some other feature that already
selects both crypto and crypto_sha256, but being consistent here helps for
usability as well as to avoid potential circular dependencies.

This reverts the dependency back to what it was originally before commit
74ca317c26a3f ("kexec: create a new config option CONFIG_KEXEC_FILE for
new syscall"), which changed changed it with the comment "This should be
safer as "select" is not recursive", but that appears to have been done in
error, as "select" is indeed recursive, and there are no other
dependencies that prevent CRYPTO_SHA256 from being selected here.

Link: https://lkml.kernel.org/r/20231023110308.1202042-2-arnd@kernel.org
Fixes: 74ca317c26a3f ("kexec: create a new config option CONFIG_KEXEC_FILE for new syscall")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Eric DeVolder <eric_devolder@yahoo.com>
Tested-by: Eric DeVolder <eric_devolder@yahoo.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Conor Dooley <conor@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/Kconfig.kexec | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 92120e396008..946dffa048b7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -36,7 +36,8 @@ config KEXEC
 config KEXEC_FILE
 	bool "Enable kexec file based system call"
 	depends on ARCH_SUPPORTS_KEXEC_FILE
-	depends on CRYPTO_SHA256=y || !ARCH_SUPPORTS_KEXEC_PURGATORY
+	select CRYPTO
+	select CRYPTO_SHA256
 	select KEXEC_CORE
 	help
 	  This is new version of kexec system call. This system call is
-- 
cgit v1.2.3


From cbc2fe9d9cb226347365753f50d81bc48cc3c52e Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 13 Dec 2023 13:57:41 +0800
Subject: kexec_file: add kexec_file flag to control debug printing

Patch series "kexec_file: print out debugging message if required", v4.

Currently, specifying '-d' on kexec command will print a lot of debugging
informationabout kexec/kdump loading with kexec_load interface.

However, kexec_file_load prints nothing even though '-d' is specified.
It's very inconvenient to debug or analyze the kexec/kdump loading when
something wrong happened with kexec/kdump itself or develper want to check
the kexec/kdump loading.

In this patchset, a kexec_file flag is KEXEC_FILE_DEBUG added and checked
in code.  If it's passed in, debugging message of kexec_file code will be
printed out and can be seen from console and dmesg.  Otherwise, the
debugging message is printed like beofre when pr_debug() is taken.

Note:
****
=====
1) The code in kexec-tools utility also need be changed to support
passing KEXEC_FILE_DEBUG to kernel when 'kexec -s -d' is specified.
The patch link is here:
=========
[PATCH] kexec_file: add kexec_file flag to support debug printing
http://lists.infradead.org/pipermail/kexec/2023-November/028505.html

2) s390 also has kexec_file code, while I am not sure what debugging
information is necessary. So leave it to s390 developer.

Test:
****
====
Testing was done in v1 on x86_64 and arm64. For v4, tested on x86_64
again. And on x86_64, the printed messages look like below:
--------------------------------------------------------------
kexec measurement buffer for the loaded kernel at 0x207fffe000.
Loaded purgatory at 0x207fff9000
Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 memsz=0x1180
Loaded 64bit kernel at 0x207c000000 bufsz=0xc88200 memsz=0x3c4a000
Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280
Final command line is: root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro
rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 crashkernel=256M
E820 memmap:
0000000000000000-000000000009a3ff (1)
000000000009a400-000000000009ffff (2)
00000000000e0000-00000000000fffff (2)
0000000000100000-000000006ff83fff (1)
000000006ff84000-000000007ac50fff (2)
......
000000207fff6150-000000207fff615f (128)
000000207fff6160-000000207fff714f (1)
000000207fff7150-000000207fff715f (128)
000000207fff7160-000000207fff814f (1)
000000207fff8150-000000207fff815f (128)
000000207fff8160-000000207fffffff (1)
nr_segments = 5
segment[0]: buf=0x000000004e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000
segment[1]: buf=0x000000009e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000
segment[2]: buf=0x00000000d879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000
segment[3]: buf=0x000000001101cd86 bufsz=0xc88200 mem=0x207c000000 memsz=0x3c4a000
segment[4]: buf=0x00000000c6e38ac7 bufsz=0x2186280 mem=0x2079e79000 memsz=0x2187000
kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8
---------------------------------------------------------------------------


This patch (of 7):

When specifying 'kexec -c -d', kexec_load interface will print loading
information, e.g the regions where kernel/initrd/purgatory/cmdline are
put, the memmap passed to 2nd kernel taken as system RAM ranges, and
printing all contents of struct kexec_segment, etc.  These are very
helpful for analyzing or positioning what's happening when kexec/kdump
itself failed.  The debugging printing for kexec_load interface is made in
user space utility kexec-tools.

Whereas, with kexec_file_load interface, 'kexec -s -d' print nothing.
Because kexec_file code is mostly implemented in kernel space, and the
debugging printing functionality is missed.  It's not convenient when
debugging kexec/kdump loading and jumping with kexec_file_load interface.

Now add KEXEC_FILE_DEBUG to kexec_file flag to control the debugging
message printing.  And add global variable kexec_file_dbg_print and macro
kexec_dprintk() to facilitate the printing.

This is a preparation, later kexec_dprintk() will be used to replace the
existing pr_debug().  Once 'kexec -s -d' is specified, it will print out
kexec/kdump loading information.  If '-d' is not specified, it regresses
to pr_debug().

Link: https://lkml.kernel.org/r/20231213055747.61826-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20231213055747.61826-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h      | 9 ++++++++-
 include/uapi/linux/kexec.h | 1 +
 kernel/kexec_core.c        | 2 ++
 kernel/kexec_file.c        | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 8227455192b7..400cb6c02176 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type);
 
 /* List of defined/legal kexec file flags */
 #define KEXEC_FILE_FLAGS	(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
-				 KEXEC_FILE_NO_INITRAMFS)
+				 KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG)
 
 /* flag to track if kexec reboot is in progress */
 extern bool kexec_in_progress;
@@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; }
 static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
 #endif
 
+extern bool kexec_file_dbg_print;
+
+#define kexec_dprintk(fmt, ...)					\
+	printk("%s" fmt,					\
+	       kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG,	\
+	       ##__VA_ARGS__)
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 01766dd839b0..c17bb096ea68 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -25,6 +25,7 @@
 #define KEXEC_FILE_UNLOAD	0x00000001
 #define KEXEC_FILE_ON_CRASH	0x00000002
 #define KEXEC_FILE_NO_INITRAMFS	0x00000004
+#define KEXEC_FILE_DEBUG	0x00000008
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bc4c096ab1f3..64072acef2b6 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,6 +52,8 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
 /* Flag to indicate we are going to kexec a new kernel */
 bool kexec_in_progress = false;
 
+bool kexec_file_dbg_print;
+
 int kexec_should_crash(struct task_struct *p)
 {
 	/*
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index ba3ef30921b8..3ee204474de6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 	 */
 	kfree(image->image_loader_data);
 	image->image_loader_data = NULL;
+
+	kexec_file_dbg_print = false;
 }
 
 #ifdef CONFIG_KEXEC_SIG
@@ -278,6 +280,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
 	if (!image)
 		return -ENOMEM;
 
+	kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
 	image->file_mode = 1;
 
 	if (kexec_on_panic) {
-- 
cgit v1.2.3


From a85ee18c7900f001f42082d2fabce4eaf57e655f Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Wed, 13 Dec 2023 13:57:42 +0800
Subject: kexec_file: print out debugging message if required

Then when specifying '-d' for kexec_file_load interface, loaded locations
of kernel/initrd/cmdline etc can be printed out to help debug.

Here replace pr_debug() with the newly added kexec_dprintk() in kexec_file
loading related codes.

And also print out type/start/head of kimage and flags to help debug.

Link: https://lkml.kernel.org/r/20231213055747.61826-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Joe Perches <joe@perches.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c                |  8 +++++---
 kernel/kexec_file.c                | 11 ++++++++---
 security/integrity/ima/ima_kexec.c |  4 ++--
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index d4313b53837e..c97e825a0fd9 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -551,9 +551,11 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
 		phdr->p_align = 0;
 		ehdr->e_phnum++;
-		pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
-			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
-			ehdr->e_phnum, phdr->p_offset);
+#ifdef CONFIG_KEXEC_FILE
+		kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+			      phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+			      ehdr->e_phnum, phdr->p_offset);
+#endif
 		phdr++;
 	}
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 3ee204474de6..aca5f3668f4c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -204,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 	if (ret < 0)
 		return ret;
 	image->kernel_buf_len = ret;
+	kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+		      image->kernel_buf, image->kernel_buf_len);
 
 	/* Call arch image probe handlers */
 	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -387,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 	if (ret)
 		goto out;
 
+	kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
 	for (i = 0; i < image->nr_segments; i++) {
 		struct kexec_segment *ksegment;
 
 		ksegment = &image->segment[i];
-		pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
-			 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
-			 ksegment->memsz);
+		kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+			      i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+			      ksegment->memsz);
 
 		ret = kimage_load_segment(image, &image->segment[i]);
 		if (ret)
@@ -406,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 	if (ret)
 		goto out;
 
+	kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n",
+		      image->type, image->start, image->head, flags);
 	/*
 	 * Free up any temporary buffers allocated which are not needed
 	 * after image has been loaded
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index ad133fe120db..dadc1d138118 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -129,8 +129,8 @@ void ima_add_kexec_buffer(struct kimage *image)
 	image->ima_buffer_size = kexec_segment_size;
 	image->ima_buffer = kexec_buffer;
 
-	pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
-		 kbuf.mem);
+	kexec_dprintk("kexec measurement buffer for the loaded kernel at 0x%lx.\n",
+		      kbuf.mem);
 }
 #endif /* IMA_KEXEC */
 
-- 
cgit v1.2.3


From a903904c5fa06e8d8472509741f79e8eb25ff864 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Fri, 8 Dec 2023 16:41:15 +0800
Subject: fork: remove redundant TASK_UNINTERRUPTIBLE

TASK_KILLABLE already includes TASK_UNINTERRUPTIBLE, so there is no
need to add a separate TASK_UNINTERRUPTIBLE.

Link: https://lkml.kernel.org/r/20231208084115.1973285-1-haokexin@gmail.com
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index ce8a4b8c04e2..d71c8ade8f9c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1562,7 +1562,7 @@ static void complete_vfork_done(struct task_struct *tsk)
 static int wait_for_vfork_done(struct task_struct *child,
 				struct completion *vfork)
 {
-	unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+	unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
 	int killed;
 
 	cgroup_enter_frozen();
-- 
cgit v1.2.3


From db6b6fb70193f0defe4d5785e940156c06e9abbe Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Tue, 12 Dec 2023 22:27:06 +0800
Subject: kexec: use ALIGN macro instead of open-coding it

Use ALIGN macro instead of open-coding it to improve code readability.

Link: https://lkml.kernel.org/r/20231212142706.25149-1-ytcoode@gmail.com
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 64072acef2b6..6e0f022987ff 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -432,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 
 	pages = NULL;
 	size = (1 << order) << PAGE_SHIFT;
-	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+	hole_start = ALIGN(image->control_page, size);
 	hole_end   = hole_start + size - 1;
 	while (hole_end <= crashk_res.end) {
 		unsigned long i;
@@ -449,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 			mend   = mstart + image->segment[i].memsz - 1;
 			if ((hole_end >= mstart) && (hole_start <= mend)) {
 				/* Advance the hole to the end of the segment */
-				hole_start = (mend + (size - 1)) & ~(size - 1);
+				hole_start = ALIGN(mend, size);
 				hole_end   = hole_start + size - 1;
 				break;
 			}
-- 
cgit v1.2.3


From 4459cd2e167e7208e57d517d16282408d9035dad Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Fri, 15 Dec 2023 16:54:51 +0800
Subject: crash_core: remove duplicated including of kexec.h

Remove second include of linux/kexec.h

Link: https://lkml.kernel.org/r/202312151654+0800-wangjinchao@xfusion.com
Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index c97e825a0fd9..6f074e112c1e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -13,7 +13,6 @@
 #include <linux/memory.h>
 #include <linux/cpuhotplug.h>
 #include <linux/memblock.h>
-#include <linux/kexec.h>
 #include <linux/kmemleak.h>
 
 #include <asm/page.h>
-- 
cgit v1.2.3


From a2bef835d39c5c9d611e9420db4221b0eeec9944 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 20:49:12 -0500
Subject: kernel/fork.c: add missing include

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..319e61297bfb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -53,6 +53,7 @@
 #include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
 #include <linux/jiffies.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
-- 
cgit v1.2.3


From d7a73e3f089204aee3393687e23fd45a22657b08 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:27:00 -0500
Subject: kernel/numa.c: Move logging out of numa.h

Moving these stub functions to a .c file means we can kill a sched.h
dependency on printk.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/numa.h | 19 ++++++-------------
 kernel/Makefile      |  1 +
 kernel/numa.c        | 26 ++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 13 deletions(-)
 create mode 100644 kernel/numa.c

(limited to 'kernel')

diff --git a/include/linux/numa.h b/include/linux/numa.h
index a904861de800..915033a75731 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_NUMA_H
 #define _LINUX_NUMA_H
+#include <linux/init.h>
 #include <linux/types.h>
 
 #ifdef CONFIG_NODES_SHIFT
@@ -22,34 +23,26 @@
 #endif
 
 #ifdef CONFIG_NUMA
-#include <linux/printk.h>
 #include <asm/sparsemem.h>
 
 /* Generic implementation available */
 int numa_nearest_node(int node, unsigned int state);
 
 #ifndef memory_add_physaddr_to_nid
-static inline int memory_add_physaddr_to_nid(u64 start)
-{
-	pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
-			start);
-	return 0;
-}
+int memory_add_physaddr_to_nid(u64 start);
 #endif
+
 #ifndef phys_to_target_node
-static inline int phys_to_target_node(u64 start)
-{
-	pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
-			start);
-	return 0;
-}
+int phys_to_target_node(u64 start);
 #endif
+
 #ifndef numa_fill_memblks
 static inline int __init numa_fill_memblks(u64 start, u64 end)
 {
 	return NUMA_NO_MEMBLK;
 }
 #endif
+
 #else /* !CONFIG_NUMA */
 static inline int numa_nearest_node(int node, unsigned int state)
 {
diff --git a/kernel/Makefile b/kernel/Makefile
index 3947122d618b..ce105a5558fc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
 obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
 obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
 obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_NUMA) += numa.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/numa.c b/kernel/numa.c
new file mode 100644
index 000000000000..67ca6b8585c0
--- /dev/null
+++ b/kernel/numa.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/printk.h>
+#include <linux/numa.h>
+
+/* Stub functions: */
+
+#ifndef memory_add_physaddr_to_nid
+int memory_add_physaddr_to_nid(u64 start)
+{
+	pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
+			start);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#ifndef phys_to_target_node
+int phys_to_target_node(u64 start)
+{
+	pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+			start);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+#endif
-- 
cgit v1.2.3


From f551103cb964e9e6f5c03b3b8723424723731e76 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 17:49:24 -0500
Subject: sched.h: move pid helpers to pid.h

This is needed for killing the sched.h dependency on rcupdate.h, and
pid.h is a better place for this code anyways.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/x86/um/sysrq_64.c          |   1 +
 drivers/gpu/drm/lima/lima_ctx.c |   1 +
 drivers/irqchip/irq-gic-v4.c    |   1 +
 include/linux/pid.h             | 125 ++++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h           | 122 ---------------------------------------
 include/linux/sched/signal.h    |   1 +
 ipc/util.h                      |   1 +
 kernel/async.c                  |   5 +-
 kernel/locking/spinlock_debug.c |   1 +
 9 files changed, 134 insertions(+), 124 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index ef1eb4f4f612..0bf6de40abff 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/pid.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/utsname.h>
diff --git a/drivers/gpu/drm/lima/lima_ctx.c b/drivers/gpu/drm/lima/lima_ctx.c
index 891d5cd5019a..8389f2d7d021 100644
--- a/drivers/gpu/drm/lima/lima_ctx.c
+++ b/drivers/gpu/drm/lima/lima_ctx.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR MIT
 /* Copyright 2018-2019 Qiang Yu <yuq825@gmail.com> */
 
+#include <linux/pid.h>
 #include <linux/slab.h>
 
 #include "lima_device.h"
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 94d56a03b175..ca32ac19d284 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -8,6 +8,7 @@
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/msi.h>
+#include <linux/pid.h>
 #include <linux/sched.h>
 
 #include <linux/irqchip/arm-gic-v4.h>
diff --git a/include/linux/pid.h b/include/linux/pid.h
index f254c3a45b9b..395cacce1179 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,7 +4,9 @@
 
 #include <linux/pid_types.h>
 #include <linux/rculist.h>
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
+#include <linux/sched.h>
 #include <linux/wait.h>
 
 /*
@@ -204,4 +206,127 @@ pid_t pid_vnr(struct pid *pid);
 		}							\
 		task = tg___;						\
 	} while_each_pid_task(pid, type, task)
+
+static inline struct pid *task_pid(struct task_struct *task)
+{
+	return task->thread_pid;
+}
+
+/*
+ * the helpers to get the task's different pids as they are seen
+ * from various namespaces
+ *
+ * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
+ * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
+ *                     current.
+ * task_xid_nr_ns()  : id seen from the ns specified;
+ *
+ * see also pid_nr() etc in include/linux/pid.h
+ */
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
+
+static inline pid_t task_pid_nr(struct task_struct *tsk)
+{
+	return tsk->pid;
+}
+
+static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+}
+
+static inline pid_t task_pid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
+}
+
+
+static inline pid_t task_tgid_nr(struct task_struct *tsk)
+{
+	return tsk->tgid;
+}
+
+/**
+ * pid_alive - check that a task structure is not stale
+ * @p: Task structure to be checked.
+ *
+ * Test if a process is not yet dead (at most zombie state)
+ * If pid_alive fails, then pointers within the task structure
+ * can be stale and must not be dereferenced.
+ *
+ * Return: 1 if the process is alive. 0 otherwise.
+ */
+static inline int pid_alive(const struct task_struct *p)
+{
+	return p->thread_pid != NULL;
+}
+
+static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
+}
+
+static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
+}
+
+
+static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
+}
+
+static inline pid_t task_session_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
+}
+
+static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
+}
+
+static inline pid_t task_tgid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
+}
+
+static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t pid = 0;
+
+	rcu_read_lock();
+	if (pid_alive(tsk))
+		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	rcu_read_unlock();
+
+	return pid;
+}
+
+static inline pid_t task_ppid_nr(const struct task_struct *tsk)
+{
+	return task_ppid_nr_ns(tsk, &init_pid_ns);
+}
+
+/* Obsolete, do not use: */
+static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+{
+	return task_pgrp_nr_ns(tsk, &init_pid_ns);
+}
+
+/**
+ * is_global_init - check if a task structure is init. Since init
+ * is free to have sub-threads we need to check tgid.
+ * @tsk: Task structure to be checked.
+ *
+ * Check if a task structure is the first user space task the kernel created.
+ *
+ * Return: 1 if the task structure is init. 0 otherwise.
+ */
+static inline int is_global_init(struct task_struct *tsk)
+{
+	return task_tgid_nr(tsk) == 1;
+}
+
 #endif /* _LINUX_PID_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 37cc9d257073..9e2708c2cfa6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1561,114 +1561,6 @@ struct task_struct {
 	 */
 };
 
-static inline struct pid *task_pid(struct task_struct *task)
-{
-	return task->thread_pid;
-}
-
-/*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
- *                     current.
- * task_xid_nr_ns()  : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
-pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
-static inline pid_t task_pid_nr(struct task_struct *tsk)
-{
-	return tsk->pid;
-}
-
-static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
-}
-
-static inline pid_t task_pid_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
-}
-
-
-static inline pid_t task_tgid_nr(struct task_struct *tsk)
-{
-	return tsk->tgid;
-}
-
-/**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
-static inline int pid_alive(const struct task_struct *p)
-{
-	return p->thread_pid != NULL;
-}
-
-static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
-}
-
-static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
-}
-
-
-static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
-}
-
-static inline pid_t task_session_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
-}
-
-static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
-}
-
-static inline pid_t task_tgid_vnr(struct task_struct *tsk)
-{
-	return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
-}
-
-static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
-{
-	pid_t pid = 0;
-
-	rcu_read_lock();
-	if (pid_alive(tsk))
-		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
-	rcu_read_unlock();
-
-	return pid;
-}
-
-static inline pid_t task_ppid_nr(const struct task_struct *tsk)
-{
-	return task_ppid_nr_ns(tsk, &init_pid_ns);
-}
-
-/* Obsolete, do not use: */
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
-{
-	return task_pgrp_nr_ns(tsk, &init_pid_ns);
-}
-
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
@@ -1712,20 +1604,6 @@ static inline char task_state_to_char(struct task_struct *tsk)
 	return task_index_to_char(task_state_index(tsk));
 }
 
-/**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
-static inline int is_global_init(struct task_struct *tsk)
-{
-	return task_tgid_nr(tsk) == 1;
-}
-
 extern struct pid *cad_pid;
 
 /*
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 3499c1a8b929..b847d8fa75a9 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -9,6 +9,7 @@
 #include <linux/sched/task.h>
 #include <linux/cred.h>
 #include <linux/refcount.h>
+#include <linux/pid.h>
 #include <linux/posix-timers.h>
 #include <linux/mm_types.h>
 #include <asm/ptrace.h>
diff --git a/ipc/util.h b/ipc/util.h
index 67bdd2aa2c28..a55d6cebe6d3 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -14,6 +14,7 @@
 #include <linux/unistd.h>
 #include <linux/err.h>
 #include <linux/ipc_namespace.h>
+#include <linux/pid.h>
 
 /*
  * The IPC ID contains 2 separate numbers - index and sequence number.
diff --git a/kernel/async.c b/kernel/async.c
index b2c4ba5686ee..79f6a3034b1f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -46,11 +46,12 @@ asynchronous and synchronous parts of the kernel.
 
 #include <linux/async.h>
 #include <linux/atomic.h>
-#include <linux/ktime.h>
 #include <linux/export.h>
-#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/pid.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/wait.h>
 #include <linux/workqueue.h>
 
 #include "workqueue_internal.h"
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 14235671a1a7..87b03d2e41db 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -12,6 +12,7 @@
 #include <linux/debug_locks.h>
 #include <linux/delay.h>
 #include <linux/export.h>
+#include <linux/pid.h>
 
 void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
 			  struct lock_class_key *key, short inner)
-- 
cgit v1.2.3


From 8b7787a543cde905e53eaf29172c9472fe8a6a75 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 11 Dec 2023 13:12:49 -0500
Subject: plist: Split out plist_types.h

Trimming down sched.h dependencies: we don't want to include more than
the base types.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/plist.h       | 12 +-----------
 include/linux/plist_types.h | 17 +++++++++++++++++
 include/linux/sched.h       |  2 +-
 init/init_task.c            |  1 +
 kernel/futex/core.c         |  1 +
 kernel/futex/requeue.c      |  1 +
 kernel/futex/waitwake.c     |  1 +
 mm/swapfile.c               |  1 +
 8 files changed, 24 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/plist_types.h

(limited to 'kernel')

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 0f352c1d3c80..8c1c8adf7fe9 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -75,20 +75,10 @@
 
 #include <linux/container_of.h>
 #include <linux/list.h>
-#include <linux/types.h>
+#include <linux/plist_types.h>
 
 #include <asm/bug.h>
 
-struct plist_head {
-	struct list_head node_list;
-};
-
-struct plist_node {
-	int			prio;
-	struct list_head	prio_list;
-	struct list_head	node_list;
-};
-
 /**
  * PLIST_HEAD_INIT - static struct plist_head initializer
  * @head:	struct plist_head variable name
diff --git a/include/linux/plist_types.h b/include/linux/plist_types.h
new file mode 100644
index 000000000000..c37e784330af
--- /dev/null
+++ b/include/linux/plist_types.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_PLIST_TYPES_H
+#define _LINUX_PLIST_TYPES_H
+
+#include <linux/types.h>
+
+struct plist_head {
+	struct list_head node_list;
+};
+
+struct plist_node {
+	int			prio;
+	struct list_head	prio_list;
+	struct list_head	node_list;
+};
+
+#endif /* _LINUX_PLIST_TYPES_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e2708c2cfa6..8c230f24688b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,7 +16,7 @@
 #include <linux/shm.h>
 #include <linux/kmsan_types.h>
 #include <linux/mutex_types.h>
-#include <linux/plist.h>
+#include <linux/plist_types.h>
 #include <linux/hrtimer_types.h>
 #include <linux/irqflags.h>
 #include <linux/seccomp.h>
diff --git a/init/init_task.c b/init/init_task.c
index 5727d42149c3..56220898a256 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -12,6 +12,7 @@
 #include <linux/audit.h>
 #include <linux/numa.h>
 #include <linux/scs.h>
+#include <linux/plist.h>
 
 #include <linux/uaccess.h>
 
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index dad981a865b8..e0e853412c15 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,6 +34,7 @@
 #include <linux/compat.h>
 #include <linux/jhash.h>
 #include <linux/pagemap.h>
+#include <linux/plist.h>
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
index eb21f065816b..b47bb764b352 100644
--- a/kernel/futex/requeue.c
+++ b/kernel/futex/requeue.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/plist.h>
 #include <linux/sched/signal.h>
 
 #include "futex.h"
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 61b112897a84..3a10375d9521 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <linux/plist.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
 #include <linux/freezer.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4bc70f459164..25019af07181 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
 #include <linux/completion.h>
 #include <linux/suspend.h>
 #include <linux/zswap.h>
+#include <linux/plist.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
-- 
cgit v1.2.3


From 6dfeff09d5ad331905c7066207053d286d58ac83 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 18:14:41 +0000
Subject: wait: Remove uapi header file from main header file

There's really no overlap between uapi/linux/wait.h and linux/wait.h.
There are two files which rely on the uapi file being implcitly included,
so explicitly include it there and remove it from the main header file.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/wait.h   | 1 -
 kernel/exit.c          | 4 +++-
 kernel/pid_namespace.c | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3473b663176f..8aa3372f21a0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 
 #include <asm/current.h>
-#include <uapi/linux/wait.h>
 
 typedef struct wait_queue_entry wait_queue_entry_t;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..2ef33047371b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,8 +69,10 @@
 #include <linux/rethook.h>
 #include <linux/sysfs.h>
 #include <linux/user_events.h>
-
 #include <linux/uaccess.h>
+
+#include <uapi/linux/wait.h>
+
 #include <asm/unistd.h>
 #include <asm/mmu_context.h>
 
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 3028b2218aa4..7ade20e95232 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
 #include <linux/idr.h>
+#include <uapi/linux/wait.h>
 #include "pid_sysctl.h"
 
 static DEFINE_MUTEX(pid_caches_mutex);
-- 
cgit v1.2.3


From a4aebe936554dac6a91e5d091179c934f8325708 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 19 Dec 2023 15:26:59 -0800
Subject: posix-timers: Get rid of [COMPAT_]SYS_NI() uses

Only the posix timer system calls use this (when the posix timer support
is disabled, which does not actually happen in any normal case), because
they had debug code to print out a warning about missing system calls.

Get rid of that special case, and just use the standard COND_SYSCALL
interface that creates weak system call stubs that return -ENOSYS for
when the system call does not exist.

This fixes a kCFI issue with the SYS_NI() hackery:

  CFI failure at int80_emulation+0x67/0xb0 (target: sys_ni_posix_timers+0x0/0x70; expected type: 0xb02b34d9)
  WARNING: CPU: 0 PID: 48 at int80_emulation+0x67/0xb0

Reported-by: kernel test robot <oliver.sang@intel.com>
Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
Tested-by: Sami Tolvanen <samitolvanen@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/syscall_wrapper.h |  4 ---
 arch/riscv/include/asm/syscall_wrapper.h |  5 ----
 arch/s390/include/asm/syscall_wrapper.h  | 13 +--------
 arch/x86/include/asm/syscall_wrapper.h   | 34 +++---------------------
 kernel/sys_ni.c                          | 14 ++++++++++
 kernel/time/posix-stubs.c                | 45 --------------------------------
 6 files changed, 19 insertions(+), 96 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/include/asm/syscall_wrapper.h b/arch/arm64/include/asm/syscall_wrapper.h
index d977713ec0ba..abb57bc54305 100644
--- a/arch/arm64/include/asm/syscall_wrapper.h
+++ b/arch/arm64/include/asm/syscall_wrapper.h
@@ -44,9 +44,6 @@
 		return sys_ni_syscall();						\
 	}
 
-#define COMPAT_SYS_NI(name) \
-	SYSCALL_ALIAS(__arm64_compat_sys_##name, sys_ni_posix_timers);
-
 #endif /* CONFIG_COMPAT */
 
 #define __SYSCALL_DEFINEx(x, name, ...)						\
@@ -81,6 +78,5 @@
 	}
 
 asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused);
-#define SYS_NI(name) SYSCALL_ALIAS(__arm64_sys_##name, sys_ni_posix_timers);
 
 #endif /* __ASM_SYSCALL_WRAPPER_H */
diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h
index 1d7942c8a6cb..eeec04b7dae6 100644
--- a/arch/riscv/include/asm/syscall_wrapper.h
+++ b/arch/riscv/include/asm/syscall_wrapper.h
@@ -46,9 +46,6 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *);
 		return sys_ni_syscall();						\
 	}
 
-#define COMPAT_SYS_NI(name) \
-	SYSCALL_ALIAS(__riscv_compat_sys_##name, sys_ni_posix_timers);
-
 #endif /* CONFIG_COMPAT */
 
 #define __SYSCALL_DEFINEx(x, name, ...)						\
@@ -82,6 +79,4 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *);
 		return sys_ni_syscall();					\
 	}
 
-#define SYS_NI(name) SYSCALL_ALIAS(__riscv_sys_##name, sys_ni_posix_timers);
-
 #endif /* __ASM_SYSCALL_WRAPPER_H */
diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h
index 9286430fe729..35c1d1b860d8 100644
--- a/arch/s390/include/asm/syscall_wrapper.h
+++ b/arch/s390/include/asm/syscall_wrapper.h
@@ -63,10 +63,6 @@
 	cond_syscall(__s390x_sys_##name);				\
 	cond_syscall(__s390_sys_##name)
 
-#define SYS_NI(name)							\
-	SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers);		\
-	SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers)
-
 #define COMPAT_SYSCALL_DEFINEx(x, name, ...)						\
 	long __s390_compat_sys##name(struct pt_regs *regs);				\
 	ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO);				\
@@ -85,15 +81,11 @@
 
 /*
  * As some compat syscalls may not be implemented, we need to expand
- * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in
- * kernel/time/posix-stubs.c to cover this case as well.
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well.
  */
 #define COND_SYSCALL_COMPAT(name)					\
 	cond_syscall(__s390_compat_sys_##name)
 
-#define COMPAT_SYS_NI(name)						\
-	SYSCALL_ALIAS(__s390_compat_sys_##name, sys_ni_posix_timers)
-
 #define __S390_SYS_STUBx(x, name, ...)						\
 	long __s390_sys##name(struct pt_regs *regs);				\
 	ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO);				\
@@ -124,9 +116,6 @@
 #define COND_SYSCALL(name)						\
 	cond_syscall(__s390x_sys_##name)
 
-#define SYS_NI(name)							\
-	SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers)
-
 #define __S390_SYS_STUBx(x, fullname, name, ...)
 
 #endif /* CONFIG_COMPAT */
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index fd2669b1cb2d..21f9407be5d3 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -86,9 +86,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 		return sys_ni_syscall();				\
 	}
 
-#define __SYS_NI(abi, name)						\
-	SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers);
-
 #ifdef CONFIG_X86_64
 #define __X64_SYS_STUB0(name)						\
 	__SYS_STUB0(x64, sys_##name)
@@ -100,13 +97,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 #define __X64_COND_SYSCALL(name)					\
 	__COND_SYSCALL(x64, sys_##name)
 
-#define __X64_SYS_NI(name)						\
-	__SYS_NI(x64, sys_##name)
 #else /* CONFIG_X86_64 */
 #define __X64_SYS_STUB0(name)
 #define __X64_SYS_STUBx(x, name, ...)
 #define __X64_COND_SYSCALL(name)
-#define __X64_SYS_NI(name)
 #endif /* CONFIG_X86_64 */
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -120,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 #define __IA32_COND_SYSCALL(name)					\
 	__COND_SYSCALL(ia32, sys_##name)
 
-#define __IA32_SYS_NI(name)						\
-	__SYS_NI(ia32, sys_##name)
 #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
 #define __IA32_SYS_STUB0(name)
 #define __IA32_SYS_STUBx(x, name, ...)
 #define __IA32_COND_SYSCALL(name)
-#define __IA32_SYS_NI(name)
 #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
 
 #ifdef CONFIG_IA32_EMULATION
@@ -135,8 +126,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
  * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the
  * ia32 regs in the proper order for shared or "common" syscalls. As some
  * syscalls may not be implemented, we need to expand COND_SYSCALL in
- * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this
- * case as well.
+ * kernel/sys_ni.c to cover this case as well.
  */
 #define __IA32_COMPAT_SYS_STUB0(name)					\
 	__SYS_STUB0(ia32, compat_sys_##name)
@@ -148,14 +138,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 #define __IA32_COMPAT_COND_SYSCALL(name)				\
 	__COND_SYSCALL(ia32, compat_sys_##name)
 
-#define __IA32_COMPAT_SYS_NI(name)					\
-	__SYS_NI(ia32, compat_sys_##name)
-
 #else /* CONFIG_IA32_EMULATION */
 #define __IA32_COMPAT_SYS_STUB0(name)
 #define __IA32_COMPAT_SYS_STUBx(x, name, ...)
 #define __IA32_COMPAT_COND_SYSCALL(name)
-#define __IA32_COMPAT_SYS_NI(name)
 #endif /* CONFIG_IA32_EMULATION */
 
 
@@ -175,13 +161,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 #define __X32_COMPAT_COND_SYSCALL(name)					\
 	__COND_SYSCALL(x64, compat_sys_##name)
 
-#define __X32_COMPAT_SYS_NI(name)					\
-	__SYS_NI(x64, compat_sys_##name)
 #else /* CONFIG_X86_X32_ABI */
 #define __X32_COMPAT_SYS_STUB0(name)
 #define __X32_COMPAT_SYS_STUBx(x, name, ...)
 #define __X32_COMPAT_COND_SYSCALL(name)
-#define __X32_COMPAT_SYS_NI(name)
 #endif /* CONFIG_X86_X32_ABI */
 
 
@@ -212,17 +195,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 
 /*
  * As some compat syscalls may not be implemented, we need to expand
- * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in
- * kernel/time/posix-stubs.c to cover this case as well.
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well.
  */
 #define COND_SYSCALL_COMPAT(name) 					\
 	__IA32_COMPAT_COND_SYSCALL(name)				\
 	__X32_COMPAT_COND_SYSCALL(name)
 
-#define COMPAT_SYS_NI(name)						\
-	__IA32_COMPAT_SYS_NI(name)					\
-	__X32_COMPAT_SYS_NI(name)
-
 #endif /* CONFIG_COMPAT */
 
 #define __SYSCALL_DEFINEx(x, name, ...)					\
@@ -243,8 +221,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
  * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
  * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
  * hurt, we only need to re-define it here to keep the naming congruent to
- * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI()
- * macros to work correctly.
+ * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro
+ * to work correctly.
  */
 #define SYSCALL_DEFINE0(sname)						\
 	SYSCALL_METADATA(_##sname, 0);					\
@@ -257,10 +235,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
 	__X64_COND_SYSCALL(name)					\
 	__IA32_COND_SYSCALL(name)
 
-#define SYS_NI(name)							\
-	__X64_SYS_NI(name)						\
-	__IA32_SYS_NI(name)
-
 
 /*
  * For VSYSCALLS, we need to declare these three syscalls with the new
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e1a6e3c675c0..9a846439b36a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -201,6 +201,20 @@ COND_SYSCALL(recvmmsg_time32);
 COND_SYSCALL_COMPAT(recvmmsg_time32);
 COND_SYSCALL_COMPAT(recvmmsg_time64);
 
+/* Posix timer syscalls may be configured out */
+COND_SYSCALL(timer_create);
+COND_SYSCALL(timer_gettime);
+COND_SYSCALL(timer_getoverrun);
+COND_SYSCALL(timer_settime);
+COND_SYSCALL(timer_delete);
+COND_SYSCALL(clock_adjtime);
+COND_SYSCALL(getitimer);
+COND_SYSCALL(setitimer);
+COND_SYSCALL(alarm);
+COND_SYSCALL_COMPAT(timer_create);
+COND_SYSCALL_COMPAT(getitimer);
+COND_SYSCALL_COMPAT(setitimer);
+
 /*
  * Architecture specific syscalls: see further below
  */
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 828aeecbd1e8..9b6fcb8d85e7 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,40 +17,6 @@
 #include <linux/time_namespace.h>
 #include <linux/compat.h>
 
-#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
-/* Architectures may override SYS_NI and COMPAT_SYS_NI */
-#include <asm/syscall_wrapper.h>
-#endif
-
-asmlinkage long sys_ni_posix_timers(void)
-{
-	pr_err_once("process %d (%s) attempted a POSIX timer syscall "
-		    "while CONFIG_POSIX_TIMERS is not set\n",
-		    current->pid, current->comm);
-	return -ENOSYS;
-}
-
-#ifndef SYS_NI
-#define SYS_NI(name)  SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
-#endif
-
-#ifndef COMPAT_SYS_NI
-#define COMPAT_SYS_NI(name)  SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
-#endif
-
-SYS_NI(timer_create);
-SYS_NI(timer_gettime);
-SYS_NI(timer_getoverrun);
-SYS_NI(timer_settime);
-SYS_NI(timer_delete);
-SYS_NI(clock_adjtime);
-SYS_NI(getitimer);
-SYS_NI(setitimer);
-SYS_NI(clock_adjtime32);
-#ifdef __ARCH_WANT_SYS_ALARM
-SYS_NI(alarm);
-#endif
-
 /*
  * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
  * as it is easy to remain compatible with little code. CLOCK_BOOTTIME
@@ -158,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 				 which_clock);
 }
 
-#ifdef CONFIG_COMPAT
-COMPAT_SYS_NI(timer_create);
-#endif
-
-#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
-#endif
-
 #ifdef CONFIG_COMPAT_32BIT_TIME
-SYS_NI(timer_settime32);
-SYS_NI(timer_gettime32);
 
 SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
 		struct old_timespec32 __user *, tp)
-- 
cgit v1.2.3


From 1bfc466b13cf6652ba227c282c27a30ffede69a5 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov <dmantipov@yandex.ru>
Date: Thu, 21 Dec 2023 12:01:21 +0300
Subject: watch_queue: fix kcalloc() arguments order

When compiling with gcc version 14.0.0 20231220 (experimental)
and W=1, I've noticed the following warning:

kernel/watch_queue.c: In function 'watch_queue_set_size':
kernel/watch_queue.c:273:32: warning: 'kcalloc' sizes specified with 'sizeof'
in the earlier argument and not in the later argument [-Wcalloc-transposed-args]
  273 |         pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
      |                                ^~~~~~

Since 'n' and 'size' arguments of 'kcalloc()' are multiplied to
calculate the final size, their actual order doesn't affect the
result and so this is not a bug. But it's still worth to fix it.

Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Link: https://lore.kernel.org/r/20231221090139.12579-1-dmantipov@yandex.ru
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/watch_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 778b4056700f..03b90d7d2175 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
 		goto error;
 
 	ret = -ENOMEM;
-	pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		goto error;
 
-- 
cgit v1.2.3


From b08c8fc0411dce0fc44b78ce4d67f1b67c35c196 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 20 Dec 2023 14:38:05 +0100
Subject: bpf: Re-support uid and gid when mounting bpffs

For a clean, conflict-free revert of the token-related patches in commit
d17aff807f84 ("Revert BPF token-related functionality"), the bpf fs commit
750e785796bb ("bpf: Support uid and gid when mounting bpffs") was undone
temporarily as well.

This patch manually re-adds the functionality from the original one back
in 750e785796bb, no other functional changes intended.

Testing:

  # mount -t bpf -o uid=65534,gid=65534 bpffs ./foo
  # ls -la . | grep foo
  drwxrwxrwt   2 nobody nogroup          0 Dec 20 13:16 foo
  # mount -t bpf
  bpffs on /root/foo type bpf (rw,relatime,uid=65534,gid=65534)

Also, passing invalid arguments for uid/gid are properly rejected as expected.

Fixes: d17aff807f84 ("Revert BPF token-related functionality")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Cc: Jie Jiang <jiejiang@chromium.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/bpf/20231220133805.20953-1-daniel@iogearbox.net
---
 kernel/bpf/inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 1aafb2ff2e95..41e0a55c35f5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -599,8 +599,15 @@ EXPORT_SYMBOL(bpf_prog_get_type_path);
  */
 static int bpf_show_options(struct seq_file *m, struct dentry *root)
 {
-	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
-
+	struct inode *inode = d_inode(root);
+	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
+
+	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, inode->i_uid));
+	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, inode->i_gid));
 	if (mode != S_IRWXUGO)
 		seq_printf(m, ",mode=%o", mode);
 	return 0;
@@ -625,15 +632,21 @@ static const struct super_operations bpf_super_ops = {
 };
 
 enum {
+	OPT_UID,
+	OPT_GID,
 	OPT_MODE,
 };
 
 static const struct fs_parameter_spec bpf_fs_parameters[] = {
+	fsparam_u32	("uid",				OPT_UID),
+	fsparam_u32	("gid",				OPT_GID),
 	fsparam_u32oct	("mode",			OPT_MODE),
 	{}
 };
 
 struct bpf_mount_opts {
+	kuid_t uid;
+	kgid_t gid;
 	umode_t mode;
 };
 
@@ -641,6 +654,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct bpf_mount_opts *opts = fc->fs_private;
 	struct fs_parse_result result;
+	kuid_t uid;
+	kgid_t gid;
 	int opt;
 
 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
@@ -662,12 +677,42 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	}
 
 	switch (opt) {
+	case OPT_UID:
+		uid = make_kuid(current_user_ns(), result.uint_32);
+		if (!uid_valid(uid))
+			goto bad_value;
+
+		/*
+		 * The requested uid must be representable in the
+		 * filesystem's idmapping.
+		 */
+		if (!kuid_has_mapping(fc->user_ns, uid))
+			goto bad_value;
+
+		opts->uid = uid;
+		break;
+	case OPT_GID:
+		gid = make_kgid(current_user_ns(), result.uint_32);
+		if (!gid_valid(gid))
+			goto bad_value;
+
+		/*
+		 * The requested gid must be representable in the
+		 * filesystem's idmapping.
+		 */
+		if (!kgid_has_mapping(fc->user_ns, gid))
+			goto bad_value;
+
+		opts->gid = gid;
+		break;
 	case OPT_MODE:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
 	}
 
 	return 0;
+bad_value:
+	return invalfc(fc, "Bad value for '%s'", param->key);
 }
 
 struct bpf_preload_ops *bpf_preload_ops;
@@ -750,6 +795,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_op = &bpf_super_ops;
 
 	inode = sb->s_root->d_inode;
+	inode->i_uid = opts->uid;
+	inode->i_gid = opts->gid;
 	inode->i_op = &bpf_dir_iops;
 	inode->i_mode &= ~S_IALLUGO;
 	populate_bpffs(sb->s_root);
@@ -785,6 +832,8 @@ static int bpf_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 
 	opts->mode = S_IRWXUGO;
+	opts->uid = current_fsuid();
+	opts->gid = current_fsgid();
 
 	fc->fs_private = opts;
 	fc->ops = &bpf_context_ops;
-- 
cgit v1.2.3


From 88b30c7f5d27e1594d70dc2bd7199b18f2b57fa9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 20 Dec 2023 11:15:25 -0500
Subject: tracing / synthetic: Disable events after testing in
 synth_event_gen_test_init()

The synth_event_gen_test module can be built in, if someone wants to run
the tests at boot up and not have to load them.

The synth_event_gen_test_init() function creates and enables the synthetic
events and runs its tests.

The synth_event_gen_test_exit() disables the events it created and
destroys the events.

If the module is builtin, the events are never disabled. The issue is, the
events should be disable after the tests are run. This could be an issue
if the rest of the boot up tests are enabled, as they expect the events to
be in a known state before testing. That known state happens to be
disabled.

When CONFIG_SYNTH_EVENT_GEN_TEST=y and CONFIG_EVENT_TRACE_STARTUP_TEST=y
a warning will trigger:

 Running tests on trace events:
 Testing event create_synth_test:
 Enabled event during self test!
 ------------[ cut here ]------------
 WARNING: CPU: 2 PID: 1 at kernel/trace/trace_events.c:4150 event_trace_self_tests+0x1c2/0x480
 Modules linked in:
 CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc2-test-00031-gb803d7c664d5-dirty #276
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
 RIP: 0010:event_trace_self_tests+0x1c2/0x480
 Code: bb e8 a2 ab 5d fc 48 8d 7b 48 e8 f9 3d 99 fc 48 8b 73 48 40 f6 c6 01 0f 84 d6 fe ff ff 48 c7 c7 20 b6 ad bb e8 7f ab 5d fc 90 <0f> 0b 90 48 89 df e8 d3 3d 99 fc 48 8b 1b 4c 39 f3 0f 85 2c ff ff
 RSP: 0000:ffffc9000001fdc0 EFLAGS: 00010246
 RAX: 0000000000000029 RBX: ffff88810399ca80 RCX: 0000000000000000
 RDX: 0000000000000000 RSI: ffffffffb9f19478 RDI: ffff88823c734e64
 RBP: ffff88810399f300 R08: 0000000000000000 R09: fffffbfff79eb32a
 R10: ffffffffbcf59957 R11: 0000000000000001 R12: ffff888104068090
 R13: ffffffffbc89f0a0 R14: ffffffffbc8a0f08 R15: 0000000000000078
 FS:  0000000000000000(0000) GS:ffff88823c700000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000000 CR3: 00000001f6282001 CR4: 0000000000170ef0
 Call Trace:
  <TASK>
  ? __warn+0xa5/0x200
  ? event_trace_self_tests+0x1c2/0x480
  ? report_bug+0x1f6/0x220
  ? handle_bug+0x6f/0x90
  ? exc_invalid_op+0x17/0x50
  ? asm_exc_invalid_op+0x1a/0x20
  ? tracer_preempt_on+0x78/0x1c0
  ? event_trace_self_tests+0x1c2/0x480
  ? __pfx_event_trace_self_tests_init+0x10/0x10
  event_trace_self_tests_init+0x27/0xe0
  do_one_initcall+0xd6/0x3c0
  ? __pfx_do_one_initcall+0x10/0x10
  ? kasan_set_track+0x25/0x30
  ? rcu_is_watching+0x38/0x60
  kernel_init_freeable+0x324/0x450
  ? __pfx_kernel_init+0x10/0x10
  kernel_init+0x1f/0x1e0
  ? _raw_spin_unlock_irq+0x33/0x50
  ret_from_fork+0x34/0x60
  ? __pfx_kernel_init+0x10/0x10
  ret_from_fork_asm+0x1b/0x30
  </TASK>

This is because the synth_event_gen_test_init() left the synthetic events
that it created enabled. By having it disable them after testing, the
other selftests will run fine.

Link: https://lore.kernel.org/linux-trace-kernel/20231220111525.2f0f49b0@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module")
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reported-by: Alexander Graf <graf@amazon.com>
Tested-by: Alexander Graf <graf@amazon.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/synth_event_gen_test.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8dfe85499d4a..354c2117be43 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void)
 
 	ret = test_trace_synth_event();
 	WARN_ON(ret);
+
+	/* Disable when done */
+	trace_array_set_clr_event(gen_synth_test->tr,
+				  "synthetic",
+				  "gen_synth_test", false);
+	trace_array_set_clr_event(empty_synth_test->tr,
+				  "synthetic",
+				  "empty_synth_test", false);
+	trace_array_set_clr_event(create_synth_test->tr,
+				  "synthetic",
+				  "create_synth_test", false);
  out:
 	return ret;
 }
-- 
cgit v1.2.3


From 22887dfba0633c09444562722f0cf68f61ec9a50 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:20 -0500
Subject: ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set()
 failure

On failure to allocate ring buffer pages, the pointer to the CPU buffer
pages is freed, but the pages that were allocated previously were not.
Make sure they are freed too.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.179352802@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Fixes: f9b94daa542a ("tracing: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c2afcf98ea91..3c11e8e811ed 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5927,6 +5927,7 @@ error:
 	for_each_buffer_cpu(buffer, cpu) {
 		if (!cpu_buffers[cpu])
 			continue;
+		rb_free_cpu_buffer(cpu_buffers[cpu]);
 		kfree(cpu_buffers[cpu]);
 	}
 	kfree(cpu_buffers);
-- 
cgit v1.2.3


From b81e03a24966dca0b119eff0549a4e44befff419 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:21 -0500
Subject: ring-buffer: Do no swap cpu buffers if order is different

As all the subbuffer order (subbuffer sizes) must be the same throughout
the ring buffer, check the order of the buffers that are doing a CPU
buffer swap in ring_buffer_swap_cpu() to make sure they are the same.

If the are not the same, then fail to do the swap, otherwise the ring
buffer will think the CPU buffer has a specific subbuffer size when it
does not.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.467894710@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3c11e8e811ed..fdcd171b09b5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5417,6 +5417,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
 	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
 		goto out;
 
+	if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+		goto out;
+
 	ret = -EAGAIN;
 
 	if (atomic_read(&buffer_a->record_disabled))
-- 
cgit v1.2.3


From 4e958db34fd5324421ef9562c31c15e2d152dc63 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:22 -0500
Subject: ring-buffer: Make sure the spare sub buffer used for reads has same
 size

Now that the ring buffer specifies the size of its sub buffers, they all
need to be the same size. When doing a read, a swap is done with a spare
page. Make sure they are the same size before doing the swap, otherwise
the read will fail.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185629.763664788@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 711095aa731d..4dcdc30aa110 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7582,6 +7582,7 @@ struct ftrace_buffer_info {
 	struct trace_iterator	iter;
 	void			*spare;
 	unsigned int		spare_cpu;
+	unsigned int		spare_size;
 	unsigned int		read;
 };
 
@@ -8301,6 +8302,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
 
+	/* Make sure the spare matches the current sub buffer size */
+	if (info->spare) {
+		if (page_size != info->spare_size) {
+			ring_buffer_free_read_page(iter->array_buffer->buffer,
+						   info->spare_cpu, info->spare);
+			info->spare = NULL;
+		}
+	}
+
 	if (!info->spare) {
 		info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
 							  iter->cpu_file);
@@ -8309,6 +8319,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 			info->spare = NULL;
 		} else {
 			info->spare_cpu = iter->cpu_file;
+			info->spare_size = page_size;
 		}
 	}
 	if (!info->spare)
-- 
cgit v1.2.3


From aa067682adf19ca78f81cd57539282dbfd7cce21 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:23 -0500
Subject: tracing: Update snapshot order along with main buffer order

When updating the order of the sub buffers for the main buffer, make sure
that if the snapshot buffer exists, that it gets its order updated as
well.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.054668186@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4dcdc30aa110..2439e00aa4ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1263,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
 
 int tracing_alloc_snapshot_instance(struct trace_array *tr)
 {
+	int order;
 	int ret;
 
 	if (!tr->allocated_snapshot) {
 
+		/* Make the snapshot buffer have the same order as main buffer */
+		order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+		ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+		if (ret < 0)
+			return ret;
+
 		/* allocate spare buffer */
 		ret = resize_buffer_duplicate_size(&tr->max_buffer,
 				   &tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1286,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr)
 	 * The max_tr ring buffer has some state (e.g. ring->clock) and
 	 * we want preserve it.
 	 */
+	ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
 	ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
 	set_buffer_entries(&tr->max_buffer, 1);
 	tracing_reset_online_cpus(&tr->max_buffer);
@@ -9393,6 +9401,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 {
 	struct trace_array *tr = filp->private_data;
 	unsigned long val;
+	int old_order;
 	int ret;
 
 	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
@@ -9403,12 +9412,44 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 	if (val < 0 || val > 7)
 		return -EINVAL;
 
+	old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	if (old_order == val)
+		return 0;
+
 	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
 	if (ret)
-		return ret;
+		return 0;
 
-	(*ppos)++;
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+	if (!tr->allocated_snapshot)
+		goto out_max;
 
+	ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val);
+	if (ret) {
+		/* Put back the old order */
+		cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+		if (WARN_ON_ONCE(cnt)) {
+			/*
+			 * AARGH! We are left with different orders!
+			 * The max buffer is our "snapshot" buffer.
+			 * When a tracer needs a snapshot (one of the
+			 * latency tracers), it swaps the max buffer
+			 * with the saved snap shot. We succeeded to
+			 * update the order of the main buffer, but failed to
+			 * update the order of the max buffer. But when we tried
+			 * to reset the main buffer to the original size, we
+			 * failed there too. This is very unlikely to
+			 * happen, but if it does, warn and kill all
+			 * tracing.
+			 */
+			tracing_disabled = 1;
+		}
+		return ret;
+	}
+ out_max:
+#endif
+	(*ppos)++;
 	return cnt;
 }
 
-- 
cgit v1.2.3


From fa4b54af5ba17a59ce082e9d710eb638bdce7637 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:24 -0500
Subject: tracing: Stop the tracing while changing the ring buffer subbuf size

Because the main buffer and the snapshot buffer need to be the same for
some tracers, otherwise it will fail and disable all tracing, the tracers
need to be stopped while updating the sub buffer sizes so that the tracers
see the main and snapshot buffers with the same sub buffer size.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.353222794@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Fixes: 2808e31ec12e ("ring-buffer: Add interface for configuring trace sub buffer size")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2439e00aa4ce..82303bd2bba1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9412,13 +9412,16 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 	if (val < 0 || val > 7)
 		return -EINVAL;
 
+	/* Do not allow tracing while changing the order of the ring buffer */
+	tracing_stop_tr(tr);
+
 	old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
 	if (old_order == val)
-		return 0;
+		goto out;
 
 	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
 	if (ret)
-		return 0;
+		goto out;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 
@@ -9445,11 +9448,15 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 			 */
 			tracing_disabled = 1;
 		}
-		return ret;
+		goto out;
 	}
  out_max:
 #endif
 	(*ppos)++;
+ out:
+	if (ret)
+		cnt = ret;
+	tracing_start_tr(tr);
 	return cnt;
 }
 
-- 
cgit v1.2.3


From 353cc219372935805218e05a7754a059ab104641 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:25 -0500
Subject: ring-buffer: Keep the same size when updating the order

The function ring_buffer_subbuf_order_set() just updated the sub-buffers
to the new size, but this also changes the size of the buffer in doing so.
As the size is determined by nr_pages * subbuf_size. If the subbuf_size is
increased without decreasing the nr_pages, this causes the total size of
the buffer to increase.

This broke the latency tracers as the snapshot needs to be the same size
as the main buffer. The size of the snapshot buffer is only expanded when
needed, and because the order is still the same, the size becomes out of
sync with the main buffer, as the main buffer increased in size without
the tracing system knowing.

Calculate the nr_pages to allocate with the new subbuf_size to be
buffer_size / new_subbuf_size.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.649397785@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Fixes: f9b94daa542a ("ring-buffer: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fdcd171b09b5..23ead7602da0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5897,7 +5897,10 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
 			continue;
 
-		nr_pages = buffer->buffers[cpu]->nr_pages;
+		/* Update the number of pages to match the new size */
+		nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+		nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
 		cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
 		if (!cpu_buffers[cpu]) {
 			err = -ENOMEM;
-- 
cgit v1.2.3


From 8e7b58c27b3c567316a51079b375b846f9223bba Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:26 -0500
Subject: ring-buffer: Just update the subbuffers when changing their
 allocation order

The ring_buffer_subbuf_order_set() was creating ring_buffer_per_cpu
cpu_buffers with the new subbuffers with the updated order, and if they
all successfully were created, then they the ring_buffer's per_cpu buffers
would be freed and replaced by them.

The problem is that the freed per_cpu buffers contains state that would be
lost. Running the following commands:

1. # echo 3 > /sys/kernel/tracing/buffer_subbuf_order
2. # echo 0 > /sys/kernel/tracing/tracing_cpumask
3. # echo 1 > /sys/kernel/tracing/snapshot
4. # echo ff > /sys/kernel/tracing/tracing_cpumask
5. # echo test > /sys/kernel/tracing/trace_marker

Would result in:

 -bash: echo: write error: Bad file descriptor

That's because the state of the per_cpu buffers of the snapshot buffer is
lost when the order is changed (the order of a freed snapshot buffer goes
to 0 to save memory, and when the snapshot buffer is allocated again, it
goes back to what the main buffer is).

In operation 2, the snapshot buffers were set to "disable" (as all the
ring buffers CPUs were disabled).

In operation 3, the snapshot is allocated and a call to
ring_buffer_subbuf_order_set() replaced the per_cpu buffers losing the
"record_disable" count.

When it was enabled again, the atomic_dec(&cpu_buffer->record_disable) was
decrementing a zero, setting it to -1. Writing 1 into the snapshot would
swap the snapshot buffer with the main buffer, so now the main buffer is
"disabled", and nothing can write to the ring buffer anymore.

Instead of creating new per_cpu buffers and losing the state of the old
buffers, basically do what the resize does and just allocate new subbuf
pages into the new_pages link list of the per_cpu buffer and if they all
succeed, then replace the old sub buffers with the new ones. This keeps
the per_cpu buffer descriptor in tact and by doing so, keeps its state.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185630.944104939@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Fixes: f9b94daa542a ("ring-buffer: Set new size of the ring buffer sub page")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 88 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 23ead7602da0..7ee6779bf292 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5856,11 +5856,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
  */
 int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 {
-	struct ring_buffer_per_cpu **cpu_buffers;
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct buffer_page *bpage, *tmp;
 	int old_order, old_size;
 	int nr_pages;
 	int psize;
-	int bsize;
 	int err;
 	int cpu;
 
@@ -5874,11 +5874,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 	if (psize <= BUF_PAGE_HDR_SIZE)
 		return -EINVAL;
 
-	bsize = sizeof(void *) * buffer->cpus;
-	cpu_buffers = kzalloc(bsize, GFP_KERNEL);
-	if (!cpu_buffers)
-		return -ENOMEM;
-
 	old_order = buffer->subbuf_order;
 	old_size = buffer->subbuf_size;
 
@@ -5894,33 +5889,88 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 
 	/* Make sure all new buffers are allocated, before deleting the old ones */
 	for_each_buffer_cpu(buffer, cpu) {
+
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
 			continue;
 
+		cpu_buffer = buffer->buffers[cpu];
+
 		/* Update the number of pages to match the new size */
 		nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
 		nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
 
-		cpu_buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
-		if (!cpu_buffers[cpu]) {
+		/* we need a minimum of two pages */
+		if (nr_pages < 2)
+			nr_pages = 2;
+
+		cpu_buffer->nr_pages_to_update = nr_pages;
+
+		/* Include the reader page */
+		nr_pages++;
+
+		/* Allocate the new size buffer */
+		INIT_LIST_HEAD(&cpu_buffer->new_pages);
+		if (__rb_allocate_pages(cpu_buffer, nr_pages,
+					&cpu_buffer->new_pages)) {
+			/* not enough memory for new pages */
 			err = -ENOMEM;
 			goto error;
 		}
 	}
 
 	for_each_buffer_cpu(buffer, cpu) {
+
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
 			continue;
 
-		rb_free_cpu_buffer(buffer->buffers[cpu]);
-		buffer->buffers[cpu] = cpu_buffers[cpu];
+		cpu_buffer = buffer->buffers[cpu];
+
+		/* Clear the head bit to make the link list normal to read */
+		rb_head_page_deactivate(cpu_buffer);
+
+		/* Now walk the list and free all the old sub buffers */
+		list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+			list_del_init(&bpage->list);
+			free_buffer_page(bpage);
+		}
+		/* The above loop stopped an the last page needing to be freed */
+		bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+		free_buffer_page(bpage);
+
+		/* Free the current reader page */
+		free_buffer_page(cpu_buffer->reader_page);
+
+		/* One page was allocated for the reader page */
+		cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+						     struct buffer_page, list);
+		list_del_init(&cpu_buffer->reader_page->list);
+
+		/* The cpu_buffer pages are a link list with no head */
+		cpu_buffer->pages = cpu_buffer->new_pages.next;
+		cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+		cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+		/* Clear the new_pages list */
+		INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+		cpu_buffer->head_page
+			= list_entry(cpu_buffer->pages, struct buffer_page, list);
+		cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+		cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+		cpu_buffer->nr_pages_to_update = 0;
+
+		free_pages((unsigned long)cpu_buffer->free_page, old_order);
+		cpu_buffer->free_page = NULL;
+
+		rb_head_page_activate(cpu_buffer);
+
+		rb_check_pages(cpu_buffer);
 	}
 
 	atomic_dec(&buffer->record_disabled);
 	mutex_unlock(&buffer->mutex);
 
-	kfree(cpu_buffers);
-
 	return 0;
 
 error:
@@ -5931,12 +5981,16 @@ error:
 	mutex_unlock(&buffer->mutex);
 
 	for_each_buffer_cpu(buffer, cpu) {
-		if (!cpu_buffers[cpu])
+		cpu_buffer = buffer->buffers[cpu];
+
+		if (!cpu_buffer->nr_pages_to_update)
 			continue;
-		rb_free_cpu_buffer(cpu_buffers[cpu]);
-		kfree(cpu_buffers[cpu]);
+
+		list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+			list_del_init(&bpage->list);
+			free_buffer_page(bpage);
+		}
 	}
-	kfree(cpu_buffers);
 
 	return err;
 }
-- 
cgit v1.2.3


From 2f84b39f48476615186af5b3220ad5a2c756679b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 13:54:29 -0500
Subject: tracing: Update subbuffer with kilobytes not page order

Using page order for deciding what the size of the ring buffer sub buffers
are is exposing a bit too much of the implementation. Although the sub
buffers are only allocated in orders of pages, allow the user to specify
the minimum size of each sub-buffer via kilobytes like they can with the
buffer size itself.

If the user specifies 3 via:

  echo 3 > buffer_subbuf_size_kb

Then the sub-buffer size will round up to 4kb (on a 4kb page size system).

If they specify:

  echo 6 > buffer_subbuf_size_kb

The sub-buffer size will become 8kb.

and so on.

Link: https://lore.kernel.org/linux-trace-kernel/20231219185631.809766769@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.rst                     | 46 +++++------
 kernel/trace/trace.c                               | 38 ++++++---
 .../ftrace/test.d/00basic/ringbuffer_order.tc      | 95 ----------------------
 .../test.d/00basic/ringbuffer_subbuf_size.tc       | 95 ++++++++++++++++++++++
 4 files changed, 140 insertions(+), 134 deletions(-)
 delete mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
 create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc

(limited to 'kernel')

diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 231d26ceedb0..933e7efb9f1b 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -203,32 +203,26 @@ of ftrace. Here is a list of some of the key files:
 
 	This displays the total combined size of all the trace buffers.
 
-  buffer_subbuf_order:
-
-	This sets or displays the sub buffer page size order. The ring buffer
-	is broken up into several same size "sub buffers". An event can not be
-	bigger than the size of the sub buffer. Normally, the sub buffer is
-	the size of the architecture's page (4K on x86). The sub buffer also
-	contains meta data at the start which also limits the size of an event.
-	That means when the sub buffer is a page size, no event can be larger
-	than the page size minus the sub buffer meta data.
-
-	The buffer_subbuf_order allows the user to change the size of the sub
-	buffer. As the sub buffer is a set of pages by the power of 2, thus
-	the sub buffer total size is defined by the order:
-
-	order		size
-	----		----
-	0		PAGE_SIZE
-	1		PAGE_SIZE * 2
-	2		PAGE_SIZE * 4
-	3		PAGE_SIZE * 8
-
-	Changing the order will change the sub buffer size allowing for events
-	to be larger than the page size.
-
-	Note: When changing the order, tracing is stopped and any data in the
-	ring buffer and the snapshot buffer will be discarded.
+  buffer_subbuf_size_kb:
+
+	This sets or displays the sub buffer size. The ring buffer is broken up
+	into several same size "sub buffers". An event can not be bigger than
+	the size of the sub buffer. Normally, the sub buffer is the size of the
+	architecture's page (4K on x86). The sub buffer also contains meta data
+	at the start which also limits the size of an event.  That means when
+	the sub buffer is a page size, no event can be larger than the page
+	size minus the sub buffer meta data.
+
+	Note, the buffer_subbuf_size_kb is a way for the user to specify the
+	minimum size of the subbuffer. The kernel may make it bigger due to the
+	implementation details, or simply fail the operation if the kernel can
+	not handle the request.
+
+	Changing the sub buffer size allows for events to be larger than the
+	page size.
+
+	Note: When changing the sub-buffer size, tracing is stopped and any
+	data in the ring buffer and the snapshot buffer will be discarded.
 
   free_buffer:
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82303bd2bba1..46dbe22121e9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9384,42 +9384,54 @@ static const struct file_operations buffer_percent_fops = {
 };
 
 static ssize_t
-buffer_order_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
+	size_t size;
 	char buf[64];
+	int order;
 	int r;
 
-	r = sprintf(buf, "%d\n", ring_buffer_subbuf_order_get(tr->array_buffer.buffer));
+	order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+	size = (PAGE_SIZE << order) / 1024;
+
+	r = sprintf(buf, "%zd\n", size);
 
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
 static ssize_t
-buffer_order_write(struct file *filp, const char __user *ubuf,
-		   size_t cnt, loff_t *ppos)
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+			 size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
 	unsigned long val;
 	int old_order;
+	int order;
+	int pages;
 	int ret;
 
 	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
 	if (ret)
 		return ret;
 
+	val *= 1024; /* value passed in is in KB */
+
+	pages = DIV_ROUND_UP(val, PAGE_SIZE);
+	order = fls(pages - 1);
+
 	/* limit between 1 and 128 system pages */
-	if (val < 0 || val > 7)
+	if (order < 0 || order > 7)
 		return -EINVAL;
 
 	/* Do not allow tracing while changing the order of the ring buffer */
 	tracing_stop_tr(tr);
 
 	old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
-	if (old_order == val)
+	if (old_order == order)
 		goto out;
 
-	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, val);
+	ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
 	if (ret)
 		goto out;
 
@@ -9428,7 +9440,7 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 	if (!tr->allocated_snapshot)
 		goto out_max;
 
-	ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, val);
+	ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
 	if (ret) {
 		/* Put back the old order */
 		cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
@@ -9460,10 +9472,10 @@ buffer_order_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static const struct file_operations buffer_order_fops = {
+static const struct file_operations buffer_subbuf_size_fops = {
 	.open		= tracing_open_generic_tr,
-	.read		= buffer_order_read,
-	.write		= buffer_order_write,
+	.read		= buffer_subbuf_size_read,
+	.write		= buffer_subbuf_size_write,
 	.release	= tracing_release_generic_tr,
 	.llseek		= default_llseek,
 };
@@ -9934,8 +9946,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
 			tr, &buffer_percent_fops);
 
-	trace_create_file("buffer_subbuf_order", TRACE_MODE_WRITE, d_tracer,
-			  tr, &buffer_order_fops);
+	trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+			  tr, &buffer_subbuf_size_fops);
 
 	create_trace_options_dir(tr);
 
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
deleted file mode 100644
index ecbcc810e6c1..000000000000
--- a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_order.tc
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-# description: Change the ringbuffer sub-buffer order
-# requires: buffer_subbuf_order
-# flags: instance
-
-get_buffer_data_size() {
-	sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
-}
-
-get_buffer_data_offset() {
-	sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page
-}
-
-get_event_header_size() {
-	type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
-	time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
-	array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
-	total_bits=$((type_len+time_len+array_len))
-	total_bits=$((total_bits+7))
-	echo $((total_bits/8))
-}
-
-get_print_event_buf_offset() {
-	sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format
-}
-
-event_header_size=`get_event_header_size`
-print_header_size=`get_print_event_buf_offset`
-
-data_offset=`get_buffer_data_offset`
-
-marker_meta=$((event_header_size+print_header_size))
-
-make_str() {
-        cnt=$1
-	printf -- 'X%.0s' $(seq $cnt)
-}
-
-write_buffer() {
-	size=$1
-
-	str=`make_str $size`
-
-	# clear the buffer
-	echo > trace
-
-	# write the string into the marker
-	echo $str > trace_marker
-
-	echo $str
-}
-
-test_buffer() {
-	orde=$1
-	page_size=$((4096<<order))
-
-	size=`get_buffer_data_size`
-
-	# the size must be greater than or equal to page_size - data_offset
-	page_size=$((page_size-data_offset))
-	if [ $size -lt $page_size ]; then
-		exit fail
-	fi
-
-	# Now add a little more the meta data overhead will overflow
-
-	str=`write_buffer $size`
-
-	# Make sure the line was broken
-	new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace`
-
-	if [ "$new_str" = "$str" ]; then
-		exit fail;
-	fi
-
-	# Make sure the entire line can be found
-	new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace`
-
-	if [ "$new_str" != "$str" ]; then
-		exit fail;
-	fi
-}
-
-ORIG=`cat buffer_subbuf_order`
-
-# Could test bigger orders than 3, but then creating the string
-# to write into the ring buffer takes too long
-for a in 0 1 2 3 ; do
-	echo $a > buffer_subbuf_order
-	test_buffer $a
-done
-
-echo $ORIG > buffer_subbuf_order
-
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
new file mode 100644
index 000000000000..d44d09a33a74
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
@@ -0,0 +1,95 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Change the ringbuffer sub-buffer size
+# requires: buffer_subbuf_size_kb
+# flags: instance
+
+get_buffer_data_size() {
+	sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_buffer_data_offset() {
+	sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_event_header_size() {
+	type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+	time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+	array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+	total_bits=$((type_len+time_len+array_len))
+	total_bits=$((total_bits+7))
+	echo $((total_bits/8))
+}
+
+get_print_event_buf_offset() {
+	sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format
+}
+
+event_header_size=`get_event_header_size`
+print_header_size=`get_print_event_buf_offset`
+
+data_offset=`get_buffer_data_offset`
+
+marker_meta=$((event_header_size+print_header_size))
+
+make_str() {
+        cnt=$1
+	printf -- 'X%.0s' $(seq $cnt)
+}
+
+write_buffer() {
+	size=$1
+
+	str=`make_str $size`
+
+	# clear the buffer
+	echo > trace
+
+	# write the string into the marker
+	echo $str > trace_marker
+
+	echo $str
+}
+
+test_buffer() {
+	size_kb=$1
+	page_size=$((size_kb*1024))
+
+	size=`get_buffer_data_size`
+
+	# the size must be greater than or equal to page_size - data_offset
+	page_size=$((page_size-data_offset))
+	if [ $size -lt $page_size ]; then
+		exit fail
+	fi
+
+	# Now add a little more the meta data overhead will overflow
+
+	str=`write_buffer $size`
+
+	# Make sure the line was broken
+	new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace`
+
+	if [ "$new_str" = "$str" ]; then
+		exit fail;
+	fi
+
+	# Make sure the entire line can be found
+	new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace`
+
+	if [ "$new_str" != "$str" ]; then
+		exit fail;
+	fi
+}
+
+ORIG=`cat buffer_subbuf_size_kb`
+
+# Could test bigger sizes than 32K, but then creating the string
+# to write into the ring buffer takes too long
+for a in 4 8 16 32 ; do
+	echo $a > buffer_subbuf_size_kb
+	test_buffer $a
+done
+
+echo $ORIG > buffer_subbuf_size_kb
+
-- 
cgit v1.2.3


From 3cb3091138ca0921c4569bcf7ffa062519639b6a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 19 Dec 2023 17:38:00 -0500
Subject: ring-buffer: Use subbuf_order for buffer page masking

The comparisons to PAGE_SIZE were all converted to use the
buffer->subbuf_order, but the use of PAGE_MASK was missed.

Convert all the PAGE_MASK usages over to:

  (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1

Link: https://lore.kernel.org/linux-trace-kernel/20231219173800.66eefb7a@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Fixes: 139f84002145 ("ring-buffer: Page size per ring buffer")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7ee6779bf292..173d2595ce2d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2269,11 +2269,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 }
 
 static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
 {
 	unsigned long addr = (unsigned long)event;
 
-	return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+	addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+	return addr - BUF_PAGE_HDR_SIZE;
 }
 
 static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2646,7 +2648,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 /* Slow path */
 static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event, u64 delta, bool abs)
 {
 	if (abs)
 		event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2654,7 +2657,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 		event->type_len = RINGBUF_TYPE_TIME_EXTEND;
 
 	/* Not the first event on the page, or not delta? */
-	if (abs || rb_event_index(event)) {
+	if (abs || rb_event_index(cpu_buffer, event)) {
 		event->time_delta = delta & TS_MASK;
 		event->array[0] = delta >> TS_SHIFT;
 	} else {
@@ -2728,7 +2731,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		if (!abs)
 			info->delta = 0;
 	}
-	*event = rb_add_time_stamp(*event, info->delta, abs);
+	*event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
 	*length -= RB_LEN_TIME_EXTEND;
 	*delta = 0;
 }
@@ -2812,10 +2815,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	struct buffer_page *bpage;
 	unsigned long addr;
 
-	new_index = rb_event_index(event);
+	new_index = rb_event_index(cpu_buffer, event);
 	old_index = new_index + rb_event_ts_length(event);
 	addr = (unsigned long)event;
-	addr &= PAGE_MASK;
+	addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
 
 	bpage = READ_ONCE(cpu_buffer->tail_page);
 
@@ -3726,7 +3729,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
 	struct buffer_page *bpage = cpu_buffer->commit_page;
 	struct buffer_page *start;
 
-	addr &= PAGE_MASK;
+	addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
 
 	/* Do the likely case first */
 	if (likely(bpage->page == (void *)addr)) {
-- 
cgit v1.2.3


From 5abde62465222edd3080b70099bd809f166d5d7d Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Thu, 21 Dec 2023 18:03:52 +0100
Subject: bpf: Avoid unnecessary use of comma operator in verifier

Although it does not seem to have any untoward side-effects, the use
of ';' to separate to assignments seems more appropriate than ','.

Flagged by clang-17 -Wcomma

No functional change intended. Compile tested only.

Signed-off-by: Simon Horman <horms@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/bpf/20231221-bpf-verifier-comma-v1-1-cde2530912e9@kernel.org
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f13008d27f35..a376eb609c41 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9616,7 +9616,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
 	__mark_reg_known_zero(&callee->regs[BPF_REG_2]);
 	callee->regs[BPF_REG_2].btf =  btf_vmlinux;
-	callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+	callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
 
 	/* pointer to stack or null */
 	callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
-- 
cgit v1.2.3


From d68019471995ba47e56a9da355df13a1cdb5bf7e Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:18 +0100
Subject: entry: Move exit to usermode functions to header file

To allow inlining, move exit_to_user_mode() to
entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-2-svens@linux.ibm.com
---
 include/linux/entry-common.h | 53 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/entry/common.c        | 52 ++++++++-----------------------------------
 2 files changed, 61 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index d95ab85f96ba..6a6e98f3805f 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -7,6 +7,10 @@
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/context_tracking.h>
+#include <linux/livepatch.h>
+#include <linux/resume_user_mode.h>
+#include <linux/tick.h>
 
 #include <asm/entry-common.h>
 
@@ -258,6 +262,43 @@ static __always_inline void arch_exit_to_user_mode(void) { }
  */
 void arch_do_signal_or_restart(struct pt_regs *regs);
 
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+				     unsigned long ti_work);
+
+/**
+ * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs:	Pointer to pt_regs on entry stack
+ *
+ * 1) check that interrupts are disabled
+ * 2) call tick_nohz_user_enter_prepare()
+ * 3) call exit_to_user_mode_loop() if any flags from
+ *    EXIT_TO_USER_MODE_WORK are set
+ * 4) check that interrupts are still disabled
+ */
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	unsigned long ti_work;
+
+	lockdep_assert_irqs_disabled();
+
+	/* Flush pending rcuog wakeup before the last need_resched() check */
+	tick_nohz_user_enter_prepare();
+
+	ti_work = read_thread_flags();
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+		ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+	arch_exit_to_user_mode_prepare(regs, ti_work);
+
+	/* Ensure that kernel state is sane for a return to userspace */
+	kmap_assert_nomap();
+	lockdep_assert_irqs_disabled();
+	lockdep_sys_exit();
+}
+
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *
@@ -276,7 +317,17 @@ void arch_do_signal_or_restart(struct pt_regs *regs);
  * non-instrumentable.
  * The caller has to invoke syscall_exit_to_user_mode_work() before this.
  */
-void exit_to_user_mode(void);
+static __always_inline void exit_to_user_mode(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare();
+	instrumentation_end();
+
+	user_enter_irqoff();
+	arch_exit_to_user_mode();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
 
 /**
  * syscall_exit_to_user_mode_work - Handle work before returning to user mode
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d7ee4bc3f2ba..7f8f8c16140a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -123,29 +123,16 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 	instrumentation_end();
 }
 
-/* See comment for exit_to_user_mode() in entry-common.h */
-static __always_inline void __exit_to_user_mode(void)
-{
-	instrumentation_begin();
-	trace_hardirqs_on_prepare();
-	lockdep_hardirqs_on_prepare();
-	instrumentation_end();
-
-	user_enter_irqoff();
-	arch_exit_to_user_mode();
-	lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-void noinstr exit_to_user_mode(void)
-{
-	__exit_to_user_mode();
-}
-
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
-static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-					    unsigned long ti_work)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs:	Pointer to pt_regs on entry stack
+ * @ti_work:	TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+						     unsigned long ti_work)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
@@ -190,27 +177,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 	return ti_work;
 }
 
-static void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
-	unsigned long ti_work;
-
-	lockdep_assert_irqs_disabled();
-
-	/* Flush pending rcuog wakeup before the last need_resched() check */
-	tick_nohz_user_enter_prepare();
-
-	ti_work = read_thread_flags();
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
-
-	arch_exit_to_user_mode_prepare(regs, ti_work);
-
-	/* Ensure that kernel state is sane for a return to userspace */
-	kmap_assert_nomap();
-	lockdep_assert_irqs_disabled();
-	lockdep_sys_exit();
-}
-
 /*
  * If SYSCALL_EMU is set, then the only reason to report is when
  * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
@@ -295,7 +261,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
 	instrumentation_begin();
 	__syscall_exit_to_user_mode_work(regs);
 	instrumentation_end();
-	__exit_to_user_mode();
+	exit_to_user_mode();
 }
 
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
@@ -308,7 +274,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
 	instrumentation_begin();
 	exit_to_user_mode_prepare(regs);
 	instrumentation_end();
-	__exit_to_user_mode();
+	exit_to_user_mode();
 }
 
 noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
-- 
cgit v1.2.3


From caf4062e35b21cd7d3d35ac2f58f9765d02d32a0 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:19 +0100
Subject: entry: Move enter_from_user_mode() to header file

To allow inlining of enter_from_user_mode(), move it to
entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-3-svens@linux.ibm.com
---
 include/linux/entry-common.h | 15 ++++++++++++++-
 kernel/entry/common.c        | 26 +++-----------------------
 2 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 6a6e98f3805f..c4205390448e 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -11,6 +11,7 @@
 #include <linux/livepatch.h>
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
+#include <linux/kmsan.h>
 
 #include <asm/entry-common.h>
 
@@ -102,7 +103,19 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
  * done between establishing state and enabling interrupts. The caller must
  * enable interrupts before invoking syscall_enter_from_user_mode_work().
  */
-void enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+	arch_enter_from_user_mode(regs);
+	lockdep_hardirqs_off(CALLER_ADDR0);
+
+	CT_WARN_ON(__ct_state() != CONTEXT_USER);
+	user_exit_irqoff();
+
+	instrumentation_begin();
+	kmsan_unpoison_entry_regs(regs);
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
 
 /**
  * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 7f8f8c16140a..0616f239da4b 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -15,26 +15,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
-/* See comment for enter_from_user_mode() in entry-common.h */
-static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
-{
-	arch_enter_from_user_mode(regs);
-	lockdep_hardirqs_off(CALLER_ADDR0);
-
-	CT_WARN_ON(__ct_state() != CONTEXT_USER);
-	user_exit_irqoff();
-
-	instrumentation_begin();
-	kmsan_unpoison_entry_regs(regs);
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-
-void noinstr enter_from_user_mode(struct pt_regs *regs)
-{
-	__enter_from_user_mode(regs);
-}
-
 static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
 {
 	if (unlikely(audit_context())) {
@@ -105,7 +85,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
 {
 	long ret;
 
-	__enter_from_user_mode(regs);
+	enter_from_user_mode(regs);
 
 	instrumentation_begin();
 	local_irq_enable();
@@ -117,7 +97,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
 
 noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 {
-	__enter_from_user_mode(regs);
+	enter_from_user_mode(regs);
 	instrumentation_begin();
 	local_irq_enable();
 	instrumentation_end();
@@ -266,7 +246,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
 
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 {
-	__enter_from_user_mode(regs);
+	enter_from_user_mode(regs);
 }
 
 noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-- 
cgit v1.2.3


From 221a164035fd8b554a44bd7c4bf8e7715a497561 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 18 Dec 2023 08:45:20 +0100
Subject: entry: Move syscall_enter_from_user_mode() to header file

To allow inlining of syscall_enter_from_user_mode(), move it
to entry-common.h.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20231218074520.1998026-4-svens@linux.ibm.com
---
 include/linux/entry-common.h | 27 +++++++++++++++++++++++++--
 kernel/entry/common.c        | 32 +-------------------------------
 2 files changed, 26 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index c4205390448e..b0fb775a600d 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -134,6 +134,9 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
  */
 void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
 
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+			 unsigned long work);
+
 /**
  * syscall_enter_from_user_mode_work - Check and handle work before invoking
  *				       a syscall
@@ -157,7 +160,15 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
  *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
  *  2) Invocation of audit_syscall_entry()
  */
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
+
+	if (work & SYSCALL_WORK_ENTER)
+		syscall = syscall_trace_enter(regs, syscall, work);
+
+	return syscall;
+}
 
 /**
  * syscall_enter_from_user_mode - Establish state and check and handle work
@@ -176,7 +187,19 @@ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
  * Returns: The original or a modified syscall number. See
  * syscall_enter_from_user_mode_work() for further explanation.
  */
-long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
+static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+	long ret;
+
+	enter_from_user_mode(regs);
+
+	instrumentation_begin();
+	local_irq_enable();
+	ret = syscall_enter_from_user_mode_work(regs, syscall);
+	instrumentation_end();
+
+	return ret;
+}
 
 /**
  * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0616f239da4b..88cb3c88aaa5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -25,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
 	}
 }
 
-static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
 				unsigned long work)
 {
 	long ret = 0;
@@ -65,36 +65,6 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 	return ret ? : syscall;
 }
 
-static __always_inline long
-__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
-{
-	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
-
-	if (work & SYSCALL_WORK_ENTER)
-		syscall = syscall_trace_enter(regs, syscall, work);
-
-	return syscall;
-}
-
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
-{
-	return __syscall_enter_from_user_work(regs, syscall);
-}
-
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
-{
-	long ret;
-
-	enter_from_user_mode(regs);
-
-	instrumentation_begin();
-	local_irq_enable();
-	ret = __syscall_enter_from_user_work(regs, syscall);
-	instrumentation_end();
-
-	return ret;
-}
-
 noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
 {
 	enter_from_user_mode(regs);
-- 
cgit v1.2.3


From b3edde44e5d4504c23a176819865cd603fd16d6c Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Mon, 11 Dec 2023 11:48:51 +0100
Subject: cpufreq/schedutil: Use a fixed reference frequency

cpuinfo.max_freq can change at runtime because of boost as an example. This
implies that the value could be different than the one that has been
used when computing the capacity of a CPU.

The new arch_scale_freq_ref() returns a fixed and coherent reference
frequency that can be used when computing a frequency based on utilization.

Use this arch_scale_freq_ref() when available and fallback to
policy otherwise.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20231211104855.558096-4-vincent.guittot@linaro.org
---
 kernel/sched/cpufreq_schedutil.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 4ee8ad70be99..95c3c097083e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -114,6 +114,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy)
 	}
 }
 
+/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
+{
+	unsigned int freq = arch_scale_freq_ref(policy->cpu);
+
+	if (freq)
+		return freq;
+
+	if (arch_scale_freq_invariant())
+		return policy->cpuinfo.max_freq;
+
+	return policy->cur;
+}
+
 /**
  * get_next_freq - Compute a new frequency for a given cpufreq policy.
  * @sg_policy: schedutil policy object to compute the new frequency for.
@@ -140,9 +162,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 				  unsigned long util, unsigned long max)
 {
 	struct cpufreq_policy *policy = sg_policy->policy;
-	unsigned int freq = arch_scale_freq_invariant() ?
-				policy->cpuinfo.max_freq : policy->cur;
+	unsigned int freq;
 
+	freq = get_capacity_ref_freq(policy);
 	freq = map_util_freq(util, freq, max);
 
 	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
-- 
cgit v1.2.3


From 7736ae5572eb344c090fbef9621a228e7e3d6276 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 1 Dec 2023 17:16:51 +0100
Subject: sched/fair: Remove SCHED_FEAT(UTIL_EST_FASTUP, true)

sched_feat(UTIL_EST_FASTUP) has been added to easily disable the feature
in order to check for possibly related regressions. After 3 years, it has
never been used and no regression has been reported. Let's remove it
and make fast increase a permanent behavior.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Reviewed-by: Yanteng Si <siyanteng@loongson.cn> [for the Chinese translation]
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-2-vincent.guittot@linaro.org
---
 Documentation/scheduler/schedutil.rst                    | 7 +++----
 Documentation/translations/zh_CN/scheduler/schedutil.rst | 7 +++----
 kernel/sched/fair.c                                      | 8 +++-----
 kernel/sched/features.h                                  | 1 -
 4 files changed, 9 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/scheduler/schedutil.rst b/Documentation/scheduler/schedutil.rst
index 32c7d69fc86c..803fba8fc714 100644
--- a/Documentation/scheduler/schedutil.rst
+++ b/Documentation/scheduler/schedutil.rst
@@ -90,8 +90,8 @@ For more detail see:
  - Documentation/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization"
 
 
-UTIL_EST / UTIL_EST_FASTUP
-==========================
+UTIL_EST
+========
 
 Because periodic tasks have their averages decayed while they sleep, even
 though when running their expected utilization will be the same, they suffer a
@@ -99,8 +99,7 @@ though when running their expected utilization will be the same, they suffer a
 
 To alleviate this (a default enabled option) UTIL_EST drives an Infinite
 Impulse Response (IIR) EWMA with the 'running' value on dequeue -- when it is
-highest. A further default enabled option UTIL_EST_FASTUP modifies the IIR
-filter to instantly increase and only decay on decrease.
+highest. UTIL_EST filters to instantly increase and only decay on decrease.
 
 A further runqueue wide sum (of runnable tasks) is maintained of:
 
diff --git a/Documentation/translations/zh_CN/scheduler/schedutil.rst b/Documentation/translations/zh_CN/scheduler/schedutil.rst
index d1ea68007520..7c8d87f21c42 100644
--- a/Documentation/translations/zh_CN/scheduler/schedutil.rst
+++ b/Documentation/translations/zh_CN/scheduler/schedutil.rst
@@ -89,16 +89,15 @@ r_cpu被定义为当前CPU的最高性能水平与系统中任何其它CPU的最
  - Documentation/translations/zh_CN/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization"
 
 
-UTIL_EST / UTIL_EST_FASTUP
-==========================
+UTIL_EST
+========
 
 由于周期性任务的平均数在睡眠时会衰减，而在运行时其预期利用率会和睡眠前相同，
 因此它们在再次运行后会面临（DVFS）的上涨。
 
 为了缓解这个问题，（一个默认使能的编译选项）UTIL_EST驱动一个无限脉冲响应
 （Infinite Impulse Response，IIR）的EWMA，“运行”值在出队时是最高的。
-另一个默认使能的编译选项UTIL_EST_FASTUP修改了IIR滤波器，使其允许立即增加，
-仅在利用率下降时衰减。
+UTIL_EST滤波使其在遇到更高值时立刻增加，而遇到低值时会缓慢衰减。
 
 进一步，运行队列的（可运行任务的）利用率之和由下式计算：
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..e94d65da8d66 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4870,11 +4870,9 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * to smooth utilization decreases.
 	 */
 	ue.enqueued = task_util(p);
-	if (sched_feat(UTIL_EST_FASTUP)) {
-		if (ue.ewma < ue.enqueued) {
-			ue.ewma = ue.enqueued;
-			goto done;
-		}
+	if (ue.ewma < ue.enqueued) {
+		ue.ewma = ue.enqueued;
+		goto done;
 	}
 
 	/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3ddf84de430..143f55df890b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,7 +83,6 @@ SCHED_FEAT(WA_BIAS, true)
  * UtilEstimation. Use estimated CPU utilization.
  */
 SCHED_FEAT(UTIL_EST, true)
-SCHED_FEAT(UTIL_EST_FASTUP, true)
 
 SCHED_FEAT(LATENCY_WARN, false)
 
-- 
cgit v1.2.3


From 11137d384996bb05cf33c8163db271e1bac3f4bf Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 1 Dec 2023 17:16:52 +0100
Subject: sched/fair: Simplify util_est

With UTIL_EST_FASTUP now being permanent, we can take advantage of the
fact that the ewma jumps directly to a higher utilization at dequeue to
simplify util_est and remove the enqueued field.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
---
 include/linux/sched.h | 49 ++++++++----------------------
 kernel/sched/debug.c  |  7 ++---
 kernel/sched/fair.c   | 82 +++++++++++++++++++--------------------------------
 kernel/sched/pelt.h   |  4 +--
 4 files changed, 48 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0..03bfe9ab2951 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -415,42 +415,6 @@ struct load_weight {
 	u32				inv_weight;
 };
 
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma:     the Exponential Weighted Moving Average (EWMA)
- *            utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task:   the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
-	unsigned int			enqueued;
-	unsigned int			ewma;
-#define UTIL_EST_WEIGHT_SHIFT		2
-#define UTIL_AVG_UNCHANGED		0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -505,9 +469,20 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
-	struct util_est			util_est;
+	unsigned int			util_est;
 } ____cacheline_aligned;
 
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
+
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
 	u64				wait_start;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 168eecc209b4..8d5d98a5834d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
-	SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
-			cfs_rq->avg.util_est.enqueued);
+	SEQ_printf(m, "  .%-30s: %u\n", "util_est",
+			cfs_rq->avg.util_est);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
-	P(se.avg.util_est.ewma);
-	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+	PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e94d65da8d66..823dd76d0546 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p)
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
-	struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 		return;
 
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued += _task_util_est(p);
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
@@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
 		return;
 
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
 
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
-	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
 static inline void util_est_update(struct cfs_rq *cfs_rq,
 				   struct task_struct *p,
 				   bool task_sleep)
 {
-	long last_ewma_diff, last_enqueued_diff;
-	struct util_est ue;
+	unsigned int ewma, dequeued, last_ewma_diff;
 
 	if (!sched_feat(UTIL_EST))
 		return;
@@ -4855,23 +4839,25 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	if (!task_sleep)
 		return;
 
+	/* Get current estimate of utilization */
+	ewma = READ_ONCE(p->se.avg.util_est);
+
 	/*
 	 * If the PELT values haven't changed since enqueue time,
 	 * skip the util_est update.
 	 */
-	ue = p->se.avg.util_est;
-	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+	if (ewma & UTIL_AVG_UNCHANGED)
 		return;
 
-	last_enqueued_diff = ue.enqueued;
+	/* Get utilization at dequeue */
+	dequeued = task_util(p);
 
 	/*
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = task_util(p);
-	if (ue.ewma < ue.enqueued) {
-		ue.ewma = ue.enqueued;
+	if (ewma <= dequeued) {
+		ewma = dequeued;
 		goto done;
 	}
 
@@ -4879,27 +4865,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Skip update of task's estimated utilization when its members are
 	 * already ~1% close to its last activation value.
 	 */
-	last_ewma_diff = ue.enqueued - ue.ewma;
-	last_enqueued_diff -= ue.enqueued;
-	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
-		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
-			goto done;
-
-		return;
-	}
+	last_ewma_diff = ewma - dequeued;
+	if (last_ewma_diff < UTIL_EST_MARGIN)
+		goto done;
 
 	/*
 	 * To avoid overestimation of actual task utilization, skip updates if
 	 * we cannot grant there is idle time in this CPU.
 	 */
-	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+	if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 
 	/*
 	 * To avoid underestimate of task utilization, skip updates of EWMA if
 	 * we cannot grant that thread got all CPU time it wanted.
 	 */
-	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
 		goto done;
 
 
@@ -4907,25 +4888,24 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Update Task's estimated utilization
 	 *
 	 * When *p completes an activation we can consolidate another sample
-	 * of the task size. This is done by storing the current PELT value
-	 * as ue.enqueued and by using this value to update the Exponential
-	 * Weighted Moving Average (EWMA):
+	 * of the task size. This is done by using this value to update the
+	 * Exponential Weighted Moving Average (EWMA):
 	 *
 	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
 	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
 	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
-	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
-	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
+	 *          = w * (-last_ewma_diff +  ewma(t-1) / w)
 	 *
 	 * Where 'w' is the weight of new samples, which is configured to be
 	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 	 */
-	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
-	ue.ewma  += last_ewma_diff;
-	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+	ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ewma  -= last_ewma_diff;
+	ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
-	ue.enqueued |= UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(p->se.avg.util_est, ue);
+	ewma |= UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(p->se.avg.util_est, ewma);
 
 	trace_sched_util_est_se_tp(&p->se);
 }
@@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 	if (sched_feat(UTIL_EST)) {
 		unsigned long util_est;
 
-		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+		util_est = READ_ONCE(cfs_rq->avg.util_est);
 
 		/*
 		 * During wake-up @p isn't enqueued yet and doesn't contribute
-		 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+		 * to any cpu_rq(cpu)->cfs.avg.util_est.
 		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
 		 * has been enqueued.
 		 *
 		 * During exec (@dst_cpu = -1) @p is enqueued and does
-		 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+		 * contribute to cpu_rq(cpu)->cfs.util_est.
 		 * Remove it to "simulate" cpu_util without @p's contribution.
 		 *
 		 * Despite the task_on_rq_queued(@p) check there is still a
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721..9e1083465fbc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 		return;
 
 	/* Avoid store if the flag has been already reset */
-	enqueued = avg->util_est.enqueued;
+	enqueued = avg->util_est;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
 
 	/* Reset flag to report util_avg has been updated */
 	enqueued &= ~UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(avg->util_est.enqueued, enqueued);
+	WRITE_ONCE(avg->util_est, enqueued);
 }
 
 static inline u64 rq_clock_pelt(struct rq *rq)
-- 
cgit v1.2.3


From 3af7524b14198f5159a86692d57a9f28ec9375ce Mon Sep 17 00:00:00 2001
From: Pierre Gondois <pierre.gondois@arm.com>
Date: Wed, 6 Dec 2023 10:00:43 +0100
Subject: sched/fair: Use all little CPUs for CPU-bound workloads

Running N CPU-bound tasks on an N CPUs platform:

- with asymmetric CPU capacity

- not being a DynamIq system (i.e. having a PKG level sched domain
  without the SD_SHARE_PKG_RESOURCES flag set)

.. might result in a task placement where two tasks run on a big CPU
and none on a little CPU. This placement could be more optimal by
using all CPUs.

Testing platform:

  Juno-r2:
    - 2 big CPUs (1-2), maximum capacity of 1024
    - 4 little CPUs (0,3-5), maximum capacity of 383

Testing workload ([1]):

  Spawn 6 CPU-bound tasks. During the first 100ms (step 1), each tasks
  is affine to a CPU, except for:

    - one little CPU which is left idle.
    - one big CPU which has 2 tasks affine.

  After the 100ms (step 2), remove the cpumask affinity.

Behavior before the patch:

  During step 2, the load balancer running from the idle CPU tags sched
  domains as:

  - little CPUs: 'group_has_spare'. Cf. group_has_capacity() and
    group_is_overloaded(), 3 CPU-bound tasks run on a 4 CPUs
    sched-domain, and the idle CPU provides enough spare capacity
    regarding the imbalance_pct

  - big CPUs: 'group_overloaded'. Indeed, 3 tasks run on a 2 CPUs
    sched-domain, so the following path is used:

      group_is_overloaded()
      \-if (sgs->sum_nr_running <= sgs->group_weight) return true;

    The following path which would change the migration type to
    'migrate_task' is not taken:

      calculate_imbalance()
      \-if (env->idle != CPU_NOT_IDLE && env->imbalance == 0)

    as the local group has some spare capacity, so the imbalance
    is not 0.

  The migration type requested is 'migrate_util' and the busiest
  runqueue is the big CPU's runqueue having 2 tasks (each having a
  utilization of 512). The idle little CPU cannot pull one of these
  task as its capacity is too small for the task. The following path
  is used:

   detach_tasks()
   \-case migrate_util:
     \-if (util > env->imbalance) goto next;

After the patch:

As the number of failed balancing attempts grows (with
'nr_balance_failed'), progressively make it easier to migrate
a big task to the idling little CPU. A similar mechanism is
used for the 'migrate_load' migration type.

Improvement:

Running the testing workload [1] with the step 2 representing
a ~10s load for a big CPU:

  Before patch: ~19.3s
  After patch:  ~18s (-6.7%)

Similar issue reported at:

  https://lore.kernel.org/lkml/20230716014125.139577-1-qyousef@layalina.io/

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Pierre Gondois <pierre.gondois@arm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Acked-by: Qais Yousef <qyousef@layalina.io>
Link: https://lore.kernel.org/r/20231206090043.634697-1-pierre.gondois@arm.com
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 823dd76d0546..1d561b5aae2a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9071,7 +9071,7 @@ static int detach_tasks(struct lb_env *env)
 		case migrate_util:
 			util = task_util_est(p);
 
-			if (util > env->imbalance)
+			if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
 				goto next;
 
 			env->imbalance -= util;
-- 
cgit v1.2.3


From fbb66ce0b1d670c72def736a13ac9176b860df4e Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Thu, 14 Dec 2023 13:20:29 +0800
Subject: sched/fair: Remove unused 'next_buddy_marked' local variable in
 check_preempt_wakeup_fair()

This variable became unused in:

    5e963f2bd465 ("sched/fair: Commit to EEVDF")

Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/202312141319+0800-wangjinchao@xfusion.com
---
 kernel/sched/fair.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d561b5aae2a..9cc20855dc2b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8221,7 +8221,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	int next_buddy_marked = 0;
 	int cse_is_idle, pse_is_idle;
 
 	if (unlikely(se == pse))
@@ -8238,7 +8237,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 
 	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
 		set_next_buddy(pse);
-		next_buddy_marked = 1;
 	}
 
 	/*
-- 
cgit v1.2.3


From 932562a6045ed613d45bd100db37114273c22077 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 15:58:20 -0500
Subject: rseq: Split out rseq.h from sched.h

We're trying to get sched.h down to more or less just types only, not
code - rseq can live in its own header.

This helps us kill the dependency on preempt.h in sched.h.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/arm64/kernel/ptrace.c       |   1 +
 arch/powerpc/kernel/interrupt.c  |   1 +
 arch/s390/kernel/signal.c        |   1 +
 arch/x86/kernel/signal.c         |   1 +
 fs/exec.c                        |   1 +
 include/linux/resume_user_mode.h |   1 +
 include/linux/rseq.h             | 131 +++++++++++++++++++++++++++++++++++++++
 include/linux/sched.h            | 125 +------------------------------------
 kernel/fork.c                    |   1 +
 kernel/sched/core.c              |   1 +
 10 files changed, 140 insertions(+), 124 deletions(-)
 create mode 100644 include/linux/rseq.h

(limited to 'kernel')

diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 20d7ef82de90..09bb7fc7d3c2 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -28,6 +28,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
+#include <linux/rseq.h>
 
 #include <asm/compat.h>
 #include <asm/cpufeature.h>
diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index c4f6d3c69ba9..eca293794a1e 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -3,6 +3,7 @@
 #include <linux/context_tracking.h>
 #include <linux/err.h>
 #include <linux/compat.h>
+#include <linux/rseq.h>
 #include <linux/sched/debug.h> /* for show_regs */
 
 #include <asm/kup.h>
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index d63557d3868c..48e46809ee27 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -12,6 +12,7 @@
 
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
+#include <linux/rseq.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 65fe2094da59..31b6f5dddfc2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
 #include <linux/syscalls.h>
+#include <linux/rseq.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..41773af7e3dc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,6 +66,7 @@
 #include <linux/coredump.h>
 #include <linux/time_namespace.h>
 #include <linux/user_events.h>
+#include <linux/rseq.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index f8f3e958e9cf..e0135e0adae0 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/rseq.h>
 #include <linux/blk-cgroup.h>
 
 /**
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
new file mode 100644
index 000000000000..bc8af3eb5598
--- /dev/null
+++ b/include/linux/rseq.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _LINUX_RSEQ_H
+#define _LINUX_RSEQ_H
+
+#ifdef CONFIG_RSEQ
+
+#include <linux/preempt.h>
+#include <linux/sched.h>
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
+	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
+	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+	if (t->rseq)
+		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct ksignal *ksig,
+					     struct pt_regs *regs)
+{
+	if (current->rseq)
+		__rseq_handle_notify_resume(ksig, regs);
+}
+
+static inline void rseq_signal_deliver(struct ksignal *ksig,
+				       struct pt_regs *regs)
+{
+	preempt_disable();
+	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+	preempt_enable();
+	rseq_handle_notify_resume(ksig, regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+	rseq_set_notify_resume(t);
+}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_VM) {
+		t->rseq = NULL;
+		t->rseq_len = 0;
+		t->rseq_sig = 0;
+		t->rseq_event_mask = 0;
+	} else {
+		t->rseq = current->rseq;
+		t->rseq_len = current->rseq_len;
+		t->rseq_sig = current->rseq_sig;
+		t->rseq_event_mask = current->rseq_event_mask;
+	}
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+	t->rseq = NULL;
+	t->rseq_len = 0;
+	t->rseq_sig = 0;
+	t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct ksignal *ksig,
+					     struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct ksignal *ksig,
+				       struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
+#endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dd002d193726..a588b94988bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,7 +35,7 @@
 #include <linux/task_io_accounting.h>
 #include <linux/posix-timers_types.h>
 #include <linux/restart_block.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
@@ -2181,129 +2181,6 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 unsigned long sched_cpu_util(int cpu);
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_RSEQ
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
-	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
-	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
-	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
-	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
-	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
-	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-	if (t->rseq)
-		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
-
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-	if (current->rseq)
-		__rseq_handle_notify_resume(ksig, regs);
-}
-
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-	preempt_disable();
-	__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
-	preempt_enable();
-	rseq_handle_notify_resume(ksig, regs);
-}
-
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
-	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
-	rseq_set_notify_resume(t);
-}
-
-/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-	if (clone_flags & CLONE_VM) {
-		t->rseq = NULL;
-		t->rseq_len = 0;
-		t->rseq_sig = 0;
-		t->rseq_event_mask = 0;
-	} else {
-		t->rseq = current->rseq;
-		t->rseq_len = current->rseq_len;
-		t->rseq_sig = current->rseq_sig;
-		t->rseq_event_mask = current->rseq_event_mask;
-	}
-}
-
-static inline void rseq_execve(struct task_struct *t)
-{
-	t->rseq = NULL;
-	t->rseq_len = 0;
-	t->rseq_sig = 0;
-	t->rseq_event_mask = 0;
-}
-
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-					     struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
-				       struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
-}
-
-#endif
-
-#ifdef CONFIG_DEBUG_RSEQ
-
-void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
-
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 319e61297bfb..53816393995b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -100,6 +100,7 @@
 #include <linux/stackprotector.h>
 #include <linux/user_events.h>
 #include <linux/iommu.h>
+#include <linux/rseq.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..d04cf3c47899 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -57,6 +57,7 @@
 #include <linux/profile.h>
 #include <linux/psi.h>
 #include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
 #include <linux/sched/wake_q.h>
 #include <linux/scs.h>
 #include <linux/slab.h>
-- 
cgit v1.2.3


From 1e2f2d31997a9496f99e2b43255d6a48b06fbcc2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 15 Dec 2023 15:51:54 -0500
Subject: Kill sched.h dependency on rcupdate.h

by moving cond_resched_rcu() to rcupdate_wait.h, we can kill another big
sched.h dependency.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 arch/powerpc/kvm/book3s_64_vio.c        |  1 +
 include/linux/rcupdate_wait.h           | 10 ++++++++++
 include/linux/sched.h                   | 15 +++++----------
 include/linux/sched/task.h              |  1 +
 kernel/bpf/hashtab.c                    |  1 +
 lib/test_rhashtable.c                   |  1 +
 mm/filemap.c                            |  1 +
 mm/khugepaged.c                         |  1 +
 mm/shmem.c                              |  1 +
 net/ipv4/fib_trie.c                     |  1 +
 net/netfilter/ipset/ip_set_bitmap_gen.h |  2 ++
 net/netfilter/ipset/ip_set_hash_gen.h   |  1 +
 net/netfilter/ipvs/ip_vs_conn.c         |  1 +
 net/netfilter/ipvs/ip_vs_est.c          |  1 +
 14 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 14c6d7e318da..b569ebaa590e 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -20,6 +20,7 @@
 #include <linux/iommu.h>
 #include <linux/file.h>
 #include <linux/mm.h>
+#include <linux/rcupdate_wait.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 5e0f74f2f8ca..d07f0848802e 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -8,6 +8,7 @@
 
 #include <linux/rcupdate.h>
 #include <linux/completion.h>
+#include <linux/sched.h>
 
 /*
  * Structure allowing asynchronous waiting on RCU.
@@ -55,4 +56,13 @@ do {									\
 #define synchronize_rcu_mult(...) \
 	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
 
+static inline void cond_resched_rcu(void)
+{
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+#endif
+}
+
 #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a588b94988bc..814bfdafbc1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -10,9 +10,14 @@
 #include <uapi/linux/sched.h>
 
 #include <asm/current.h>
+#include <asm/processor.h>
+#include <linux/thread_info.h>
+#include <linux/preempt.h>
+#include <linux/cpumask.h>
 
 #include <linux/cache.h>
 #include <linux/irqflags_types.h>
+#include <linux/smp_types.h>
 #include <linux/pid_types.h>
 #include <linux/sem_types.h>
 #include <linux/shm.h>
@@ -23,7 +28,6 @@
 #include <linux/timer_types.h>
 #include <linux/seccomp_types.h>
 #include <linux/nodemask_types.h>
-#include <linux/rcupdate.h>
 #include <linux/refcount_types.h>
 #include <linux/resource.h>
 #include <linux/latencytop.h>
@@ -2059,15 +2063,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_rwlock_write(lock);					\
 })
 
-static inline void cond_resched_rcu(void)
-{
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
-	rcu_read_unlock();
-	cond_resched();
-	rcu_read_lock();
-#endif
-}
-
 #ifdef CONFIG_PREEMPT_DYNAMIC
 
 extern bool preempt_model_none(void);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 1880ae21a9cb..538cdfbe895f 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -7,6 +7,7 @@
  * functionality:
  */
 
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fd8d4b0addfc..3ba98ed2b3f9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -7,6 +7,7 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/random.h>
 #include <uapi/linux/btf.h>
 #include <linux/rcupdate_trace.h>
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index c20f6cb4bf55..42b585208249 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -16,6 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/rhashtable.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index f1c8c278310f..1219ffc04a26 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -45,6 +45,7 @@
 #include <linux/migrate.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/splice.h>
+#include <linux/rcupdate_wait.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "internal.h"
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 064654717843..47a20a4ece09 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,6 +17,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/ksm.h>
diff --git a/mm/shmem.c b/mm/shmem.c
index 91e2620148b2..98f6ca7bdae1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -79,6 +79,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
 #include <linux/rmap.h>
 #include <linux/uuid.h>
 #include <linux/quotaops.h>
+#include <linux/rcupdate_wait.h>
 
 #include <linux/uaccess.h>
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9bdfdab906fe..3ff35f811765 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -52,6 +52,7 @@
 #include <linux/if_arp.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 26ab0e9612d8..21f7860e8fa1 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -4,6 +4,8 @@
 #ifndef __IP_SET_BITMAP_IP_GEN_H
 #define __IP_SET_BITMAP_IP_GEN_H
 
+#include <linux/rcupdate_wait.h>
+
 #define mtype_do_test		IPSET_TOKEN(MTYPE, _do_test)
 #define mtype_gc_test		IPSET_TOKEN(MTYPE, _gc_test)
 #define mtype_is_filled		IPSET_TOKEN(MTYPE, _is_filled)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7c2399541771..cbf80da9a01c 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -5,6 +5,7 @@
 #define _IP_SET_HASH_GEN_H
 
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
 #include <linux/jhash.h>
 #include <linux/types.h>
 #include <linux/netfilter/nfnetlink.h>
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9065da3cdd12..a743db073887 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -31,6 +31,7 @@
 #include <linux/seq_file.h>
 #include <linux/jhash.h>
 #include <linux/random.h>
+#include <linux/rcupdate_wait.h>
 
 #include <net/net_namespace.h>
 #include <net/ip_vs.h>
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index c5970ba416ae..f821ad2e19b3 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/sysctl.h>
 #include <linux/list.h>
+#include <linux/rcupdate_wait.h>
 
 #include <net/ip_vs.h>
 
-- 
cgit v1.2.3


From f60a631ab9ed5df15e446269ea515f2b8948ba0c Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Thu, 21 Dec 2023 17:40:14 +0100
Subject: sched/fair: Fix tg->load when offlining a CPU

When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
load may remain and will negatively impact the calculation of the share of
the online CPUs.

To fix this bug, clear the contribution of an offlining CPU to task groups'
load and skip its contribution while it is inactive.

Here's the reproducer of the anomaly, by Imran Khan:

	"So far I have encountered only one rather lengthy way of reproducing this issue,
	which is as follows:

	1. Take a KVM guest (booted with 4 CPUs and can be scaled up to 124 CPUs) and
	   create 2 custom cgroups: /sys/fs/cgroup/cpu/test_group_1 and /sys/fs/cgroup/
	   cpu/test_group_2

	2. Assign a CPU intensive workload to each of these cgroups and start the
	   workload.

	For my tests I am using following app:

	int main(int argc, char *argv[])
	{
		unsigned long count, i, val;
		if (argc != 2) {
		      printf("usage: ./a.out <number of random nums to generate> \n");
		      return 0;
		}

		count = strtoul(argv[1], NULL, 10);

		printf("Generating %lu random numbers \n", count);
		for (i = 0; i < count; i++) {
			val = rand();
			val = val % 2;
			//usleep(1);
		}
		printf("Generated %lu random numbers \n", count);
		return 0;
	}

	Also since the system is booted with 4 CPUs, in order to completely load the
	system I am also launching 4 instances of same test app under:

	   /sys/fs/cgroup/cpu/

	3. We can see that both of the cgroups get similar CPU time:

        # systemd-cgtop --depth 1
	Path                                 Tasks    %CPU  Memory  Input/s    Output/s
	/                                      659      -     5.5G        -        -
	/system.slice                            -      -     5.7G        -        -
	/test_group_1                            4      -        -        -        -
	/test_group_2                            3      -        -        -        -
	/user.slice                             31      -    56.5M        -        -

	Path                                 Tasks   %CPU   Memory  Input/s    Output/s
	/                                      659  394.6     5.5G        -        -
	/test_group_2                            3   65.7        -        -        -
	/user.slice                             29   55.1    48.0M        -        -
	/test_group_1                            4   47.3        -        -        -
	/system.slice                            -    2.2     5.7G        -        -

	Path                                 Tasks  %CPU    Memory  Input/s    Output/s
	/                                      659  394.8     5.5G        -        -
	/test_group_1                            4   62.9        -        -        -
	/user.slice                             28   44.9    54.2M        -        -
	/test_group_2                            3   44.7        -        -        -
	/system.slice                            -    0.9     5.7G        -        -

	Path                                 Tasks  %CPU    Memory  Input/s     Output/s
	/                                      659  394.4     5.5G        -        -
	/test_group_2                            3   58.8        -        -        -
	/test_group_1                            4   51.9        -        -        -
	/user.slice                              30   39.3    59.6M        -        -
	/system.slice                            -    1.9     5.7G        -        -

	Path                                 Tasks  %CPU     Memory  Input/s    Output/s
	/                                      659  394.7     5.5G        -        -
	/test_group_1                            4   60.9        -        -        -
	/test_group_2                            3   57.9        -        -        -
	/user.slice                             28   43.5    36.9M        -        -
	/system.slice                            -    3.0     5.7G        -        -

	Path                                 Tasks  %CPU     Memory  Input/s     Output/s
	/                                      659  395.0     5.5G        -        -
	/test_group_1                            4   66.8        -        -        -
	/test_group_2                            3   56.3        -        -        -
	/user.slice                             29   43.1    51.8M        -        -
	/system.slice                            -    0.7     5.7G        -        -

	4. Now move systemd-udevd to one of these test groups, say test_group_1, and
	   perform scale up to 124 CPUs followed by scale down back to 4 CPUs from the
	   host side.

	5. Run the same workload i.e 4 instances of CPU hogger under /sys/fs/cgroup/cpu
	   and one instance of  CPU hogger each in /sys/fs/cgroup/cpu/test_group_1 and
	   /sys/fs/cgroup/test_group_2.

	It can be seen that test_group_1 (the one where systemd-udevd was moved) is getting
	much less CPU time than the test_group_2, even though at this point of time both of
	these groups have only CPU hogger running:

        # systemd-cgtop --depth 1
	Path                                   Tasks   %CPU   Memory  Input/s   Output/s
	/                                      1219     -     5.4G        -        -
	/system.slice                           -       -     5.6G        -        -
	/test_group_1                           4       -        -        -        -
	/test_group_2                           3       -        -        -        -
	/user.slice                            26       -    91.3M        -        -

	Path                                   Tasks  %CPU     Memory  Input/s   Output/s
	/                                      1221  394.3     5.4G        -        -
	/test_group_2                             3   82.7        -        -        -
	/test_group_1                             4   14.3        -        -        -
	/system.slice                             -    0.8     5.6G        -        -
	/user.slice                              26    0.4    91.2M        -        -

	Path                                   Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  394.6     5.4G        -        -
	/test_group_2                             3   67.4        -        -        -
	/system.slice                             -   24.6     5.6G        -        -
	/test_group_1                             4   12.5        -        -        -
	/user.slice                              26    0.4    91.2M        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                     1221  395.2     5.4G        -        -
	/test_group_2                            3   60.9        -        -        -
	/system.slice                            -   27.9     5.6G        -        -
	/test_group_1                            4   12.2        -        -        -
	/user.slice                             26    0.4    91.2M        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                     1221  395.2     5.4G        -        -
	/test_group_2                            3   69.4        -        -        -
	/test_group_1                            4   13.9        -        -        -
	/user.slice                             28    1.6    92.0M        -        -
	/system.slice                            -    1.0     5.6G        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  395.6     5.4G        -        -
	/test_group_2                             3   59.3        -        -        -
	/test_group_1                             4   14.1        -        -        -
	/user.slice                              28    1.3    92.2M        -        -
	/system.slice                             -    0.7     5.6G        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  395.5     5.4G        -        -
	/test_group_2                            3   67.2        -        -        -
	/test_group_1                            4   11.5        -        -        -
	/user.slice                             28    1.3    92.5M        -        -
	/system.slice                            -    0.6     5.6G        -        -

	Path                                  Tasks  %CPU    Memory  Input/s    Output/s
	/                                      1221  395.1     5.4G        -        -
	/test_group_2                             3   76.8        -        -        -
	/test_group_1                             4   12.9        -        -        -
	/user.slice                              28    1.3    92.8M        -        -
	/system.slice                             -    1.2     5.6G        -        -

	From sched_debug data it can be seen that in bad case the load.weight of per-CPU
	sched entities corresponding to test_group_1 has reduced significantly and
	also load_avg of test_group_1 remains much higher than that of test_group_2,
	even though systemd-udevd stopped running long time back and at this point of
	time both cgroups just have the CPU hogger app as running entity."

[ mingo: Added details from the original discussion, plus minor edits to the patch. ]

Reported-by: Imran Khan <imran.f.khan@oracle.com>
Tested-by: Imran Khan <imran.f.khan@oracle.com>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Imran Khan <imran.f.khan@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lore.kernel.org/r/20231223111545.62135-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..43c1216898cb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4096,6 +4096,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
+	/* rq has been offline and doesn't contribute to the share anymore: */
+	if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+		return;
+
 	/*
 	 * For migration heavy workloads, access to tg->load_avg can be
 	 * unbound. Limit the update rate to at most once per ms.
@@ -4112,6 +4116,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 	}
 }
 
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+	long delta;
+	u64 now;
+
+	/*
+	 * No need to update load_avg for root_task_group, as it is not used.
+	 */
+	if (cfs_rq->tg == &root_task_group)
+		return;
+
+	now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+	delta = 0 - cfs_rq->tg_load_avg_contrib;
+	atomic_long_add(delta, &cfs_rq->tg->load_avg);
+	cfs_rq->tg_load_avg_contrib = 0;
+	cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+	struct task_group *tg;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * The rq clock has already been updated in
+	 * set_rq_offline(), so we should skip updating
+	 * the rq clock again in unthrottle_cfs_rq().
+	 */
+	rq_clock_start_loop_update(rq);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+		clear_tg_load_avg(cfs_rq);
+	}
+	rcu_read_unlock();
+
+	rq_clock_stop_loop_update(rq);
+}
+
 /*
  * Called within set_task_rq() right before setting a task's CPU. The
  * caller only guarantees p->pi_lock is held; no other assumptions,
@@ -4408,6 +4455,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
 	return 0;
@@ -12413,6 +12462,9 @@ static void rq_offline_fair(struct rq *rq)
 
 	/* Ensure any throttled groups are reachable by pick_next_task */
 	unthrottle_offline_cfs_rqs(rq);
+
+	/* Ensure that we remove rq contribution to group share: */
+	clear_tg_offline_cfs_rqs(rq);
 }
 
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 623b1f896fa8a669a277ee5a258307a16c7377a3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 26 Dec 2023 12:59:02 -0500
Subject: ring-buffer: Fix wake ups when buffer_percent is set to 100

The tracefs file "buffer_percent" is to allow user space to set a
water-mark on how much of the tracing ring buffer needs to be filled in
order to wake up a blocked reader.

 0 - is to wait until any data is in the buffer
 1 - is to wait for 1% of the sub buffers to be filled
 50 - would be half of the sub buffers are filled with data
 100 - is not to wake the waiter until the ring buffer is completely full

Unfortunately the test for being full was:

	dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
	return (dirty * 100) > (full * nr_pages);

Where "full" is the value for "buffer_percent".

There is two issues with the above when full == 100.

1. dirty * 100 > 100 * nr_pages will never be true
   That is, the above is basically saying that if the user sets
   buffer_percent to 100, more pages need to be dirty than exist in the
   ring buffer!

2. The page that the writer is on is never considered dirty, as dirty
   pages are only those that are full. When the writer goes to a new
   sub-buffer, it clears the contents of that sub-buffer.

That is, even if the check was ">=" it would still not be equal as the
most pages that can be considered "dirty" is nr_pages - 1.

To fix this, add one to dirty and use ">=" in the compare.

Link: https://lore.kernel.org/linux-trace-kernel/20231226125902.4a057f1d@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Fixes: 03329f9939781 ("tracing: Add tracefs file buffer_percentage")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 83eab547f1d1..32c0dd2fd1c3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -881,9 +881,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f
 	if (!nr_pages || !full)
 		return true;
 
-	dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+	/*
+	 * Add one as dirty will never equal nr_pages, as the sub-buffer
+	 * that the writer is on is not counted as dirty.
+	 * This is needed if "buffer_percent" is set to 100.
+	 */
+	dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
 
-	return (dirty * 100) > (full * nr_pages);
+	return (dirty * 100) >= (full * nr_pages);
 }
 
 /*
-- 
cgit v1.2.3


From 39a7dc23a1ed0fe81141792a09449d124c5953bd Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 28 Dec 2023 09:51:49 -0500
Subject: tracing: Fix blocked reader of snapshot buffer

If an application blocks on the snapshot or snapshot_raw files, expecting
to be woken up when a snapshot occurs, it will not happen. Or it may
happen with an unexpected result.

That result is that the application will be reading the main buffer
instead of the snapshot buffer. That is because when the snapshot occurs,
the main and snapshot buffers are swapped. But the reader has a descriptor
still pointing to the buffer that it originally connected to.

This is fine for the main buffer readers, as they may be blocked waiting
for a watermark to be hit, and when a snapshot occurs, the data that the
main readers want is now on the snapshot buffer.

But for waiters of the snapshot buffer, they are waiting for an event to
occur that will trigger the snapshot and they can then consume it quickly
to save the snapshot before the next snapshot occurs. But to do this, they
need to read the new snapshot buffer, not the old one that is now
receiving new data.

Also, it does not make sense to have a watermark "buffer_percent" on the
snapshot buffer, as the snapshot buffer is static and does not receive new
data except all at once.

Link: https://lore.kernel.org/linux-trace-kernel/20231228095149.77f5b45d@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Fixes: debdd57f5145f ("tracing: Make a snapshot feature available from userspace")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c |  3 ++-
 kernel/trace/trace.c       | 20 +++++++++++++++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 32c0dd2fd1c3..9286f88fcd32 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -949,7 +949,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
 	/* make sure the waiters see the new index */
 	smp_wmb();
 
-	rb_wake_up_waiters(&rbwork->work);
+	/* This can be called in any context */
+	irq_work_queue(&rbwork->work);
 }
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 199df497db07..a0defe156b57 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1894,6 +1894,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
 	__update_max_tr(tr, tsk, cpu);
 
 	arch_spin_unlock(&tr->max_lock);
+
+	/* Any waiters on the old snapshot buffer need to wake up */
+	ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
 }
 
 /**
@@ -1945,12 +1948,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 static int wait_on_pipe(struct trace_iterator *iter, int full)
 {
+	int ret;
+
 	/* Iterators are static, they should be filled or empty */
 	if (trace_buffer_iter(iter, iter->cpu_file))
 		return 0;
 
-	return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
-				full);
+	ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	/*
+	 * Make sure this is still the snapshot buffer, as if a snapshot were
+	 * to happen, this would now be the main buffer.
+	 */
+	if (iter->snapshot)
+		iter->array_buffer = &iter->tr->max_buffer;
+#endif
+	return ret;
 }
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -8517,7 +8531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
 		wait_index = READ_ONCE(iter->wait_index);
 
-		ret = wait_on_pipe(iter, iter->tr->buffer_percent);
+		ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
 		if (ret)
 			goto out;
 
-- 
cgit v1.2.3


From 2853b66b601a265306be709b4d86aaff7d92a0fc Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 11 Dec 2023 16:22:09 +0000
Subject: mm: remove some calls to page_add_new_anon_rmap()

We already have the folio in these functions, we just need to use it.
folio_add_new_anon_rmap() didn't exist at the time they were converted to
folios.

Link: https://lkml.kernel.org/r/20231211162214.2146080-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/events/uprobes.c | 2 +-
 mm/memory.c             | 2 +-
 mm/userfaultfd.c        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 435aac1d8c27..8b115fc43f04 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	if (new_page) {
 		folio_get(new_folio);
-		page_add_new_anon_rmap(new_page, vma, addr);
+		folio_add_new_anon_rmap(new_folio, vma, addr);
 		folio_add_lru_vma(new_folio, vma);
 	} else
 		/* no new page, just dec_mm_counter for old_page */
diff --git a/mm/memory.c b/mm/memory.c
index 7649cb9eb7f5..5c023bef2adb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4071,7 +4071,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 	/* ksm created a completely new copy */
 	if (unlikely(folio != swapcache && swapcache)) {
-		page_add_new_anon_rmap(page, vma, vmf->address);
+		folio_add_new_anon_rmap(folio, vma, vmf->address);
 		folio_add_lru_vma(folio, vma);
 	} else {
 		page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9ec814e47e99..203cda9192c2 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -116,7 +116,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 			folio_add_lru(folio);
 		page_add_file_rmap(page, dst_vma, false);
 	} else {
-		page_add_new_anon_rmap(page, dst_vma, dst_addr);
+		folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
 		folio_add_lru_vma(folio, dst_vma);
 	}
 
-- 
cgit v1.2.3


From 5cc9695f06b065168f5c893c8e006b6a8a2c9c91 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 20 Dec 2023 23:44:48 +0100
Subject: kernel/events/uprobes: page_remove_rmap() -> folio_remove_rmap_pte()

Let's convert __replace_page().

Link: https://lkml.kernel.org/r/20231220224504.646757-25-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/events/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8b115fc43f04..485bb0389b48 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -198,7 +198,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 		set_pte_at_notify(mm, addr, pvmw.pte,
 				  mk_pte(new_page, vma->vm_page_prot));
 
-	page_remove_rmap(old_page, vma, false);
+	folio_remove_rmap_pte(old_folio, old_page, vma);
 	if (!folio_mapped(old_folio))
 		folio_free_swap(old_folio);
 	page_vma_mapped_walk_done(&pvmw);
-- 
cgit v1.2.3


From 816d334afa85c836080b41bb6238aea845615ad9 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sun, 17 Dec 2023 11:35:26 +0800
Subject: kexec: modify the meaning of the end parameter in
 kimage_is_destination_range()

The end parameter received by kimage_is_destination_range() should be the
last valid byte address of the target memory segment plus 1.  However, in
the locate_mem_hole_bottom_up() and locate_mem_hole_top_down() functions,
the corresponding value passed to kimage_is_destination_range() is the
last valid byte address of the target memory segment, which is 1 less.

There are two ways to fix this bug.  We can either correct the logic of
the locate_mem_hole_bottom_up() and locate_mem_hole_top_down() functions,
or we can fix kimage_is_destination_range() by making the end parameter
represent the last valid byte address of the target memory segment.  Here,
we choose the second approach.

Due to the modification to kimage_is_destination_range(), we also need to
adjust its callers, such as kimage_alloc_normal_control_pages() and
kimage_alloc_page().

Link: https://lkml.kernel.org/r/20231217033528.303333-2-ytcoode@gmail.com
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 6e0f022987ff..2f039a7d9af9 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -278,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image,
 		unsigned long mstart, mend;
 
 		mstart = image->segment[i].mem;
-		mend = mstart + image->segment[i].memsz;
-		if ((end > mstart) && (start < mend))
+		mend = mstart + image->segment[i].memsz - 1;
+		if ((end >= mstart) && (start <= mend))
 			return 1;
 	}
 
@@ -372,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 		pfn   = page_to_boot_pfn(pages);
 		epfn  = pfn + count;
 		addr  = pfn << PAGE_SHIFT;
-		eaddr = epfn << PAGE_SHIFT;
+		eaddr = (epfn << PAGE_SHIFT) - 1;
 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 			      kimage_is_destination_range(image, addr, eaddr)) {
 			list_add(&pages->lru, &extra_pages);
@@ -718,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
 
 		/* If the page is not a destination page use it */
 		if (!kimage_is_destination_range(image, addr,
-						  addr + PAGE_SIZE))
+						  addr + PAGE_SIZE - 1))
 			break;
 
 		/*
-- 
cgit v1.2.3


From 18d565ea95fe553f442c5bbc5050415bab3c3fa4 Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Sun, 17 Dec 2023 11:35:27 +0800
Subject: kexec_file: fix incorrect temp_start value in
 locate_mem_hole_top_down()

temp_end represents the address of the last available byte.  Therefore,
the starting address of the memory segment with temp_end as its last
available byte and a size of `kbuf->memsz`, that is, the value of
temp_start, should be `temp_end - kbuf->memsz + 1` instead of `temp_end -
kbuf->memsz`.

Additionally, use the ALIGN_DOWN macro instead of open-coding it directly
in locate_mem_hole_top_down() to improve code readability.

Link: https://lkml.kernel.org/r/20231217033528.303333-3-ytcoode@gmail.com
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index aca5f3668f4c..bef2f6f2571b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -434,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
 	unsigned long temp_start, temp_end;
 
 	temp_end = min(end, kbuf->buf_max);
-	temp_start = temp_end - kbuf->memsz;
+	temp_start = temp_end - kbuf->memsz + 1;
 
 	do {
 		/* align down start */
-		temp_start = temp_start & (~(kbuf->buf_align - 1));
+		temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align);
 
 		if (temp_start < start || temp_start < kbuf->buf_min)
 			return 0;
-- 
cgit v1.2.3


From d391615618e8b2c30ef1e09c1705a7b1751f74f0 Mon Sep 17 00:00:00 2001
From: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Date: Tue, 19 Dec 2023 23:24:14 +0100
Subject: kernel: relay: remove relay_file_splice_read dead code, doesn't work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Documentation/filesystems/relay.rst says to use
	return debugfs_create_file(filename, mode, parent, buf,
	                           &relay_file_operations);
and this is the only way relay_file_operations is used.

Thus: debugfs_create_file(&relay_file_operations)
   -> __debugfs_create_file(&debugfs_full_proxy_file_operations,
                            &relay_file_operations)
   -> dentry{inode: {i_fop: &debugfs_full_proxy_file_operations},
             d_fsdata: &relay_file_operations
                       | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT}

debugfs_full_proxy_file_operations.open is full_proxy_open, which extracts
the &relay_file_operations from the dentry, and allocates via
__full_proxy_fops_init() new fops, with trivial wrappers around release,
llseek, read, write, poll, and unlocked_ioctl, then replaces the fops on
the opened file therewith.

Naturally, all thusly-created debugfs files have .splice_read = NULL.
This was introduced in commit 49d200deaa68 ("debugfs: prevent access to
removed files' private data") from 2016-03-22.

AFAICT, relay_file_operations is the only struct file_operations used for
debugfs which defines a .splice_read callback.  Hooking it up with

>	diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
>	index 5063434be0fc..952fcf5b2afa 100644
>	--- a/fs/debugfs/file.c
>	+++ b/fs/debugfs/file.c
>	@@ -328,6 +328,11 @@ FULL_PROXY_FUNC(write, ssize_t, filp,
>	 			loff_t *ppos),
>	 		ARGS(filp, buf, size, ppos));
>
>	+FULL_PROXY_FUNC(splice_read, long, in,
>	+		PROTO(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe,
>	+			size_t len, unsigned int flags),
>	+		ARGS(in, ppos, pipe, len, flags));
>	+
>	 FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
>	 		PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
>	 		ARGS(filp, cmd, arg));
>	@@ -382,6 +387,8 @@ static void __full_proxy_fops_init(struct file_operations *proxy_fops,
>	 		proxy_fops->write = full_proxy_write;
>	 	if (real_fops->poll)
>	 		proxy_fops->poll = full_proxy_poll;
>	+	if (real_fops->splice_read)
>	+		proxy_fops->splice_read = full_proxy_splice_read;
>	 	if (real_fops->unlocked_ioctl)
>	 		proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
>	 }

shows it just doesn't work, and splicing always instantly returns empty
(subsequent reads actually return the contents).

No-one noticed it became dead code in 2016, who knows if it worked back
then. Clearly no-one cares; just delete it.

Link: https://lkml.kernel.org/r/dtexwpw6zcdx7dkx3xj5gyjp5syxmyretdcbcdtvrnukd4vvuh@tarta.nabijaczleweli.xyz
Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Li kunyu <kunyu@nfschina.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zhang Zhengming <zhang.zhengming@h3c.com>
Cc: Zhao Lei <zhao_lei1@hoperun.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/relay.c | 162 ---------------------------------------------------------
 1 file changed, 162 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 83fe0325cde1..a8e90e98bf2c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1073,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp,
 	return written;
 }
 
-static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
-{
-	rbuf->bytes_consumed += bytes_consumed;
-
-	if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
-		relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
-		rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
-	}
-}
-
-static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
-				   struct pipe_buffer *buf)
-{
-	struct rchan_buf *rbuf;
-
-	rbuf = (struct rchan_buf *)page_private(buf->page);
-	relay_consume_bytes(rbuf, buf->private);
-}
-
-static const struct pipe_buf_operations relay_pipe_buf_ops = {
-	.release	= relay_pipe_buf_release,
-	.try_steal	= generic_pipe_buf_try_steal,
-	.get		= generic_pipe_buf_get,
-};
-
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
-/*
- *	subbuf_splice_actor - splice up to one subbuf's worth of data
- */
-static ssize_t subbuf_splice_actor(struct file *in,
-			       loff_t *ppos,
-			       struct pipe_inode_info *pipe,
-			       size_t len,
-			       unsigned int flags,
-			       int *nonpad_ret)
-{
-	unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
-	struct rchan_buf *rbuf = in->private_data;
-	unsigned int subbuf_size = rbuf->chan->subbuf_size;
-	uint64_t pos = (uint64_t) *ppos;
-	uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
-	size_t read_start = (size_t) do_div(pos, alloc_size);
-	size_t read_subbuf = read_start / subbuf_size;
-	size_t padding = rbuf->padding[read_subbuf];
-	size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
-	struct page *pages[PIPE_DEF_BUFFERS];
-	struct partial_page partial[PIPE_DEF_BUFFERS];
-	struct splice_pipe_desc spd = {
-		.pages = pages,
-		.nr_pages = 0,
-		.nr_pages_max = PIPE_DEF_BUFFERS,
-		.partial = partial,
-		.ops = &relay_pipe_buf_ops,
-		.spd_release = relay_page_release,
-	};
-	ssize_t ret;
-
-	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
-		return 0;
-	if (splice_grow_spd(pipe, &spd))
-		return -ENOMEM;
-
-	/*
-	 * Adjust read len, if longer than what is available
-	 */
-	if (len > (subbuf_size - read_start % subbuf_size))
-		len = subbuf_size - read_start % subbuf_size;
-
-	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
-	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
-	poff = read_start & ~PAGE_MASK;
-	nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
-
-	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
-		unsigned int this_len, this_end, private;
-		unsigned int cur_pos = read_start + total_len;
-
-		if (!len)
-			break;
-
-		this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
-		private = this_len;
-
-		spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
-		spd.partial[spd.nr_pages].offset = poff;
-
-		this_end = cur_pos + this_len;
-		if (this_end >= nonpad_end) {
-			this_len = nonpad_end - cur_pos;
-			private = this_len + padding;
-		}
-		spd.partial[spd.nr_pages].len = this_len;
-		spd.partial[spd.nr_pages].private = private;
-
-		len -= this_len;
-		total_len += this_len;
-		poff = 0;
-		pidx = (pidx + 1) % subbuf_pages;
-
-		if (this_end >= nonpad_end) {
-			spd.nr_pages++;
-			break;
-		}
-	}
-
-	ret = 0;
-	if (!spd.nr_pages)
-		goto out;
-
-	ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
-	if (ret < 0 || ret < total_len)
-		goto out;
-
-        if (read_start + ret == nonpad_end)
-                ret += padding;
-
-out:
-	splice_shrink_spd(&spd);
-	return ret;
-}
-
-static ssize_t relay_file_splice_read(struct file *in,
-				      loff_t *ppos,
-				      struct pipe_inode_info *pipe,
-				      size_t len,
-				      unsigned int flags)
-{
-	ssize_t spliced;
-	int ret;
-	int nonpad_ret = 0;
-
-	ret = 0;
-	spliced = 0;
-
-	while (len && !spliced) {
-		ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
-		if (ret < 0)
-			break;
-		else if (!ret) {
-			if (flags & SPLICE_F_NONBLOCK)
-				ret = -EAGAIN;
-			break;
-		}
-
-		*ppos += ret;
-		if (ret > len)
-			len = 0;
-		else
-			len -= ret;
-		spliced += nonpad_ret;
-		nonpad_ret = 0;
-	}
-
-	if (spliced)
-		return spliced;
-
-	return ret;
-}
 
 const struct file_operations relay_file_operations = {
 	.open		= relay_file_open,
@@ -1242,6 +1081,5 @@ const struct file_operations relay_file_operations = {
 	.read		= relay_file_read,
 	.llseek		= no_llseek,
 	.release	= relay_file_release,
-	.splice_read	= relay_file_splice_read,
 };
 EXPORT_SYMBOL_GPL(relay_file_operations);
-- 
cgit v1.2.3


From 5f981878c71eaa8dc5563805152bd129a73a90be Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 19 Dec 2023 21:49:45 -0800
Subject: stacktrace: fix kernel-doc typo

Change @task to @tsk to prevent kernel-doc warnings:

kernel/stacktrace.c:138: warning: Excess function parameter 'task' description in 'stack_trace_save_tsk'
kernel/stacktrace.c:138: warning: Function parameter or member 'tsk' not described in 'stack_trace_save_tsk'

Link: https://lkml.kernel.org/r/20231220054945.17663-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/stacktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 4f65824879ab..afb3c116da91 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save);
 
 /**
  * stack_trace_save_tsk - Save a task stack trace into a storage array
- * @task:	The task to examine
+ * @tsk:	The task to examine
  * @store:	Pointer to storage array
  * @size:	Size of the storage array
  * @skipnr:	Number of entries to skip at the start of the stack trace
-- 
cgit v1.2.3


From 2861b37732627d7d115d77585ce4853f25cf332d Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Thu, 21 Dec 2023 12:23:08 +0800
Subject: kexec_core: fix the assignment to kimage->control_page

image->control_page represents the starting address for allocating the
next control page, while hole_end represents the address of the last valid
byte of the currently allocated control page.

This bug actually does not affect the correctness of allocating control
pages, because image->control_page is currently only used in
kimage_alloc_crash_control_pages(), and this function, when allocating
control pages, will first align image->control_page up to the nearest
`(1 << order) << PAGE_SHIFT` boundary, then use this value as the
starting address of the next control page.  This ensures that the newly
allocated control page will use the correct starting address and not
overlap with previously allocated control pages.

Although it does not affect the correctness of the final result, it is
better for us to set image->control_page to the correct value, in case
it might be used elsewhere in the future, potentially causing errors.

Therefore, after successfully allocating a control page,
image->control_page should be updated to `hole_end + 1`, rather than
hole_end.

Link: https://lkml.kernel.org/r/20231221042308.11076-1-ytcoode@gmail.com
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 2f039a7d9af9..a08031b57a61 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -457,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 		/* If I don't overlap any segments I have found my hole! */
 		if (i == image->nr_segments) {
 			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
-			image->control_page = hole_end;
+			image->control_page = hole_end + 1;
 			break;
 		}
 	}
-- 
cgit v1.2.3


From 6dcde5d5f248291c5ff6cbe00a7fa6ae400d1aa9 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 20 Dec 2023 13:15:34 -0800
Subject: watchdog/hardlockup: adopt softlockup logic avoiding double-dumps

Patch series "watchdog: Better handling of concurrent lockups".

When we get multiple lockups at roughly the same time, the output in the
kernel logs can be very confusing since the reports about the lockups end
up interleaved in the logs.  There is some code in the kernel to try to
handle this but it wasn't that complete.

Li Zhe recently made this a bit better for softlockups (specifically for
the case where `kernel.softlockup_all_cpu_backtrace` is not set) in commit
9d02330abd3e ("softlockup: serialized softlockup's log"), but that only
handled softlockup reports.  Hardlockup reports still had similar issues.

This series also has a small fix to avoid dumping all stacks a second time
in the case of a panic.  This is a bit unrelated to the interleaving fixes
but it does also improve the clarity of lockup reports.


This patch (of 4):

The hardlockup detector and softlockup detector both have the ability to
dump the stack of all CPUs (`kernel.hardlockup_all_cpu_backtrace` and
`kernel.softlockup_all_cpu_backtrace`).  Both detectors also have some
logic to attempt to avoid interleaving printouts if two CPUs were trying
to do dumps of all CPUs at the same time.  However:

- The hardlockup detector's logic still allowed interleaving some
  information. Specifically another CPU could print modules and dump
  the stack of the locked CPU at the same time we were dumping all
  CPUs.

- In the case where `kernel.hardlockup_panic` was set in addition to
  `kernel.hardlockup_all_cpu_backtrace`, when two CPUs both detected
  hardlockups at the same time the second CPU could call panic() while
  the first was still dumping stacks. This was especially bad if the
  locked up CPU wasn't responding to the request for a backtrace since
  the function nmi_trigger_cpumask_backtrace() can wait up to 10
  seconds.

Let's resolve this by adopting the softlockup logic in the hardlockup
handler.

NOTES:

- As part of this, one might think that we should make a helper
  function that both the hard and softlockup detectors call. This
  turns out not to be super trivial since it would have to be
  parameterized quite a bit since there are separate global variables
  controlling each lockup detector and they print log messages that
  are just different enough that it would be a pain. We probably don't
  want to change the messages that are printed without good reason to
  avoid throwing log parsers for a loop.

- One might also think that it would be a good idea to have the
  hardlockup and softlockup detector use the same global variable to
  prevent interleaving. This would make sure that softlockups and
  hardlockups can't interleave each other. That _almost_ works but has
  a dangerous flaw if `kernel.hardlockup_panic` is not the same as
  `kernel.softlockup_panic` because we might skip a call to panic() if
  one type of lockup was detected at the same time as another.

Link: https://lkml.kernel.org/r/20231220211640.2023645-1-dianders@chromium.org
Link: https://lkml.kernel.org/r/20231220131534.1.I4f35a69fbb124b5f0c71f75c631e11fabbe188ff@changeid
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Lecopzer Chen <lecopzer.chen@mediatek.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Pingfan Liu <kernelfans@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/watchdog.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index bf30a6fac665..b4fd2f12137f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -91,7 +91,7 @@ static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
 static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
 static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
-static unsigned long watchdog_hardlockup_all_cpu_dumped;
+static unsigned long hard_lockup_nmi_warn;
 
 notrace void arch_touch_nmi_watchdog(void)
 {
@@ -156,6 +156,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 		if (per_cpu(watchdog_hardlockup_warned, cpu))
 			return;
 
+		/*
+		 * Prevent multiple hard-lockup reports if one cpu is already
+		 * engaged in dumping all cpu back traces.
+		 */
+		if (sysctl_hardlockup_all_cpu_backtrace) {
+			if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
+				return;
+		}
+
 		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
 		print_modules();
 		print_irqtrace_events(current);
@@ -168,13 +177,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 			trigger_single_cpu_backtrace(cpu);
 		}
 
-		/*
-		 * Perform multi-CPU dump only once to avoid multiple
-		 * hardlockups generating interleaving traces
-		 */
-		if (sysctl_hardlockup_all_cpu_backtrace &&
-		    !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
+		if (sysctl_hardlockup_all_cpu_backtrace) {
 			trigger_allbutcpu_cpu_backtrace(cpu);
+			clear_bit_unlock(0, &hard_lockup_nmi_warn);
+		}
 
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
-- 
cgit v1.2.3


From 896260a6d69d01ceb1aad8cfe4298bd837a67432 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 20 Dec 2023 13:15:35 -0800
Subject: watchdog/softlockup: use printk_cpu_sync_get_irqsave() to serialize
 reporting

Instead of introducing a spinlock, use printk_cpu_sync_get_irqsave() and
printk_cpu_sync_put_irqrestore() to serialize softlockup reporting.  Alone
this doesn't have any real advantage over the spinlock, but this will
allow us to use the same function in a future change to also serialize
hardlockup crawls.

NOTE: for the most part this serialization is important because we often
end up in the show_regs() path and that has no built-in serialization if
there are multiple callers at once.  However, even in the case where we
end up in the dump_stack() path this still has some advantages because the
stack will be guaranteed to be together in the logs with the lockup
message with no interleaving.

NOTE: the fact that printk_cpu_sync_get_irqsave() is allowed to be called
multiple times on the same CPU is important here.  Specifically we hold
the "lock" while calling dump_stack() which also gets the same "lock".
This is explicitly documented to be OK and means we don't need to
introduce a variant of dump_stack() that doesn't grab the lock.

Link: https://lkml.kernel.org/r/20231220131534.2.Ia5906525d440d8e8383cde31b7c61c2aadc8f907@changeid
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Li Zhe <lizhe.67@bytedance.com>
Cc: Lecopzer Chen <lecopzer.chen@mediatek.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Pingfan Liu <kernelfans@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/watchdog.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b4fd2f12137f..526041a1100a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -454,7 +454,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	struct pt_regs *regs = get_irq_regs();
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
-	static DEFINE_SPINLOCK(watchdog_output_lock);
+	unsigned long flags;
 
 	if (!watchdog_enabled)
 		return HRTIMER_NORESTART;
@@ -521,7 +521,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		/* Start period for the next softlockup warning. */
 		update_report_ts();
 
-		spin_lock(&watchdog_output_lock);
+		printk_cpu_sync_get_irqsave(flags);
 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -531,7 +531,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 			show_regs(regs);
 		else
 			dump_stack();
-		spin_unlock(&watchdog_output_lock);
+		printk_cpu_sync_put_irqrestore(flags);
 
 		if (softlockup_all_cpu_backtrace) {
 			trigger_allbutcpu_cpu_backtrace(smp_processor_id());
-- 
cgit v1.2.3


From ee6bdb3f4bf046ff7878c6103b8c88bb4ccfb11d Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 20 Dec 2023 13:15:36 -0800
Subject: watchdog/hardlockup: use printk_cpu_sync_get_irqsave() to serialize
 reporting

If two CPUs end up reporting a hardlockup at the same time then their logs
could get interleaved which is hard to read.

The interleaving problem was especially bad with the "perf" hardlockup
detector where the locked up CPU is always the same as the running CPU and
we end up in show_regs().  show_regs() has no inherent serialization so we
could mix together two crawls if two hardlockups happened at the same time
(and if we didn't have `sysctl_hardlockup_all_cpu_backtrace` set).  With
this change we'll fully serialize hardlockups when using the "perf"
hardlockup detector.

The interleaving problem was less bad with the "buddy" hardlockup
detector.  With "buddy" we always end up calling
`trigger_single_cpu_backtrace(cpu)` on some CPU other than the running
one.  trigger_single_cpu_backtrace() always at least serializes the
individual stack crawls because it eventually uses
printk_cpu_sync_get_irqsave().  Unfortunately the fact that
trigger_single_cpu_backtrace() eventually calls
printk_cpu_sync_get_irqsave() (on a different CPU) means that we have to
drop the "lock" before calling it and we can't fully serialize all
printouts associated with a given hardlockup.  However, we still do get
the advantage of serializing the output of print_modules() and
print_irqtrace_events().

Aside from serializing hardlockups from each other, this change also has
the advantage of serializing hardlockups and softlockups from each other
if they happen to happen at the same time since they are both using the
same "lock".

Even though nobody is expected to hang while holding the lock associated
with printk_cpu_sync_get_irqsave(), out of an abundance of caution, we
don't call printk_cpu_sync_get_irqsave() until after we print out about
the hardlockup.  This makes extra sure that, even if
printk_cpu_sync_get_irqsave() somehow never runs we at least print that we
saw the hardlockup.  This is different than the choice made for softlockup
because hardlockup is really our last resort.

Link: https://lkml.kernel.org/r/20231220131534.3.I6ff691b3b40f0379bc860f80c6e729a0485b5247@changeid
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Cc: Lecopzer Chen <lecopzer.chen@mediatek.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Pingfan Liu <kernelfans@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/watchdog.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 526041a1100a..11f9577accca 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -151,6 +151,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 	 */
 	if (is_hardlockup(cpu)) {
 		unsigned int this_cpu = smp_processor_id();
+		unsigned long flags;
 
 		/* Only print hardlockups once. */
 		if (per_cpu(watchdog_hardlockup_warned, cpu))
@@ -165,7 +166,17 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 				return;
 		}
 
+		/*
+		 * NOTE: we call printk_cpu_sync_get_irqsave() after printing
+		 * the lockup message. While it would be nice to serialize
+		 * that printout, we really want to make sure that if some
+		 * other CPU somehow locked up while holding the lock associated
+		 * with printk_cpu_sync_get_irqsave() that we can still at least
+		 * get the message about the lockup out.
+		 */
 		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+		printk_cpu_sync_get_irqsave(flags);
+
 		print_modules();
 		print_irqtrace_events(current);
 		if (cpu == this_cpu) {
@@ -173,7 +184,9 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 				show_regs(regs);
 			else
 				dump_stack();
+			printk_cpu_sync_put_irqrestore(flags);
 		} else {
+			printk_cpu_sync_put_irqrestore(flags);
 			trigger_single_cpu_backtrace(cpu);
 		}
 
-- 
cgit v1.2.3


From 55efe4abf927aca3692870a1851067f309e9a374 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 20 Dec 2023 13:15:37 -0800
Subject: watchdog: if panicking and we dumped everything, don't re-enable
 dumping

If, as part of handling a hardlockup or softlockup, we've already dumped
all CPUs and we're just about to panic, don't reenable dumping and give
some other CPU a chance to hop in there and add some confusing logs right
as the panic is happening.

Link: https://lkml.kernel.org/r/20231220131534.4.Id3a9c7ec2d7d83e4080da6f8662ba2226b40543f@changeid
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Lecopzer Chen <lecopzer.chen@mediatek.com>
Cc: Li Zhe <lizhe.67@bytedance.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Pingfan Liu <kernelfans@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/watchdog.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 11f9577accca..81a8862295d6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -192,7 +192,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
 
 		if (sysctl_hardlockup_all_cpu_backtrace) {
 			trigger_allbutcpu_cpu_backtrace(cpu);
-			clear_bit_unlock(0, &hard_lockup_nmi_warn);
+			if (!hardlockup_panic)
+				clear_bit_unlock(0, &hard_lockup_nmi_warn);
 		}
 
 		if (hardlockup_panic)
@@ -548,7 +549,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 
 		if (softlockup_all_cpu_backtrace) {
 			trigger_allbutcpu_cpu_backtrace(smp_processor_id());
-			clear_bit_unlock(0, &soft_lockup_nmi_warn);
+			if (!softlockup_panic)
+				clear_bit_unlock(0, &soft_lockup_nmi_warn);
 		}
 
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
-- 
cgit v1.2.3


From d05cb470663a2a1879277e544f69e660208f08f2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 29 Dec 2023 11:51:34 -0500
Subject: ftrace: Fix modification of direct_function hash while in use

Masami Hiramatsu reported a memory leak in register_ftrace_direct() where
if the number of new entries are added is large enough to cause two
allocations in the loop:

        for (i = 0; i < size; i++) {
                hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
                        new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
                        if (!new)
                                goto out_remove;
                        entry->direct = addr;
                }
        }

Where ftrace_add_rec_direct() has:

        if (ftrace_hash_empty(direct_functions) ||
            direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
                struct ftrace_hash *new_hash;
                int size = ftrace_hash_empty(direct_functions) ? 0 :
                        direct_functions->count + 1;

                if (size < 32)
                        size = 32;

                new_hash = dup_hash(direct_functions, size);
                if (!new_hash)
                        return NULL;

                *free_hash = direct_functions;
                direct_functions = new_hash;
        }

The "*free_hash = direct_functions;" can happen twice, losing the previous
allocation of direct_functions.

But this also exposed a more serious bug.

The modification of direct_functions above is not safe. As
direct_functions can be referenced at any time to find what direct caller
it should call, the time between:

                new_hash = dup_hash(direct_functions, size);
 and
                direct_functions = new_hash;

can have a race with another CPU (or even this one if it gets interrupted),
and the entries being moved to the new hash are not referenced.

That's because the "dup_hash()" is really misnamed and is really a
"move_hash()". It moves the entries from the old hash to the new one.

Now even if that was changed, this code is not proper as direct_functions
should not be updated until the end. That is the best way to handle
function reference changes, and is the way other parts of ftrace handles
this.

The following is done:

 1. Change add_hash_entry() to return the entry it created and inserted
    into the hash, and not just return success or not.

 2. Replace ftrace_add_rec_direct() with add_hash_entry(), and remove
    the former.

 3. Allocate a "new_hash" at the start that is made for holding both the
    new hash entries as well as the existing entries in direct_functions.

 4. Copy (not move) the direct_function entries over to the new_hash.

 5. Copy the entries of the added hash to the new_hash.

 6. If everything succeeds, then use rcu_pointer_assign() to update the
    direct_functions with the new_hash.

This simplifies the code and fixes both the memory leak as well as the
race condition mentioned above.

Link: https://lore.kernel.org/all/170368070504.42064.8960569647118388081.stgit@devnote2/
Link: https://lore.kernel.org/linux-trace-kernel/20231229115134.08dd5174@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 100 ++++++++++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8de8bec5f366..b01ae7d36021 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash,
 	hash->count++;
 }
 
-static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
 {
 	struct ftrace_func_entry *entry;
 
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 	if (!entry)
-		return -ENOMEM;
+		return NULL;
 
 	entry->ip = ip;
 	__add_hash_entry(hash, entry);
 
-	return 0;
+	return entry;
 }
 
 static void
@@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
 	struct ftrace_func_entry *entry;
 	struct ftrace_hash *new_hash;
 	int size;
-	int ret;
 	int i;
 
 	new_hash = alloc_ftrace_hash(size_bits);
@@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
 	size = 1 << hash->size_bits;
 	for (i = 0; i < size; i++) {
 		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
-			ret = add_hash_entry(new_hash, entry->ip);
-			if (ret < 0)
+			if (add_hash_entry(new_hash, entry->ip) == NULL)
 				goto free_hash;
 		}
 	}
@@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 /* Protected by rcu_tasks for reading, and direct_mutex for writing */
-static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
 static DEFINE_MUTEX(direct_mutex);
 int ftrace_direct_func_count;
 
@@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
 	return entry->direct;
 }
 
-static struct ftrace_func_entry*
-ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
-		      struct ftrace_hash **free_hash)
-{
-	struct ftrace_func_entry *entry;
-
-	if (ftrace_hash_empty(direct_functions) ||
-	    direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
-		struct ftrace_hash *new_hash;
-		int size = ftrace_hash_empty(direct_functions) ? 0 :
-			direct_functions->count + 1;
-
-		if (size < 32)
-			size = 32;
-
-		new_hash = dup_hash(direct_functions, size);
-		if (!new_hash)
-			return NULL;
-
-		*free_hash = direct_functions;
-		direct_functions = new_hash;
-	}
-
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return NULL;
-
-	entry->ip = ip;
-	entry->direct = addr;
-	__add_hash_entry(direct_functions, entry);
-	return entry;
-}
-
 static void call_direct_funcs(unsigned long ip, unsigned long pip,
 			      struct ftrace_ops *ops, struct ftrace_regs *fregs)
 {
@@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
 		/* Do nothing if it exists */
 		if (entry)
 			return 0;
-
-		ret = add_hash_entry(hash, rec->ip);
+		if (add_hash_entry(hash, rec->ip) == NULL)
+			ret = -ENOMEM;
 	}
 	return ret;
 }
@@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
 		return 0;
 	}
 
-	return add_hash_entry(hash, ip);
+	entry = add_hash_entry(hash, ip);
+	return entry ? 0 :  -ENOMEM;
 }
 
 static int
@@ -5410,7 +5377,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
  */
 int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 {
-	struct ftrace_hash *hash, *free_hash = NULL;
+	struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
 	struct ftrace_func_entry *entry, *new;
 	int err = -EBUSY, size, i;
 
@@ -5436,17 +5403,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 		}
 	}
 
-	/* ... and insert them to direct_functions hash. */
 	err = -ENOMEM;
+
+	/* Make a copy hash to place the new and the old entries in */
+	size = hash->count + direct_functions->count;
+	if (size > 32)
+		size = 32;
+	new_hash = alloc_ftrace_hash(fls(size));
+	if (!new_hash)
+		goto out_unlock;
+
+	/* Now copy over the existing direct entries */
+	size = 1 << direct_functions->size_bits;
+	for (i = 0; i < size; i++) {
+		hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
+			new = add_hash_entry(new_hash, entry->ip);
+			if (!new)
+				goto out_unlock;
+			new->direct = entry->direct;
+		}
+	}
+
+	/* ... and add the new entries */
+	size = 1 << hash->size_bits;
 	for (i = 0; i < size; i++) {
 		hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
-			new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+			new = add_hash_entry(new_hash, entry->ip);
 			if (!new)
-				goto out_remove;
+				goto out_unlock;
+			/* Update both the copy and the hash entry */
+			new->direct = addr;
 			entry->direct = addr;
 		}
 	}
 
+	free_hash = direct_functions;
+	rcu_assign_pointer(direct_functions, new_hash);
+	new_hash = NULL;
+
 	ops->func = call_direct_funcs;
 	ops->flags = MULTI_FLAGS;
 	ops->trampoline = FTRACE_REGS_ADDR;
@@ -5454,17 +5448,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 
 	err = register_ftrace_function_nolock(ops);
 
- out_remove:
-	if (err)
-		remove_direct_functions_hash(hash, addr);
-
  out_unlock:
 	mutex_unlock(&direct_mutex);
 
-	if (free_hash) {
+	if (free_hash && free_hash != EMPTY_HASH) {
 		synchronize_rcu_tasks();
 		free_ftrace_hash(free_hash);
 	}
+
+	if (new_hash)
+		free_ftrace_hash(new_hash);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_direct);
@@ -6309,7 +6303,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
 
 				if (entry)
 					continue;
-				if (add_hash_entry(hash, rec->ip) < 0)
+				if (add_hash_entry(hash, rec->ip) == NULL)
 					goto out;
 			} else {
 				if (entry) {
-- 
cgit v1.2.3


From 7c223098212957a1ecd8768e8e747ae2cf88e880 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Fri, 29 Dec 2023 20:53:49 +0000
Subject: locking/osq_lock: Move the definition of optimistic_spin_node into
 osq_lock.c

struct optimistic_spin_node is private to the implementation.
Move it into the C file to ensure nothing is accessing it.

Signed-off-by: David Laight <david.laight@aculab.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/osq_lock.h  | 5 -----
 kernel/locking/osq_lock.c | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 5581dbd3bd34..ea8fb31379e3 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -6,11 +6,6 @@
  * An MCS like lock especially tailored for optimistic spinning for sleeping
  * lock implementations (mutex, rwsem, etc).
  */
-struct optimistic_spin_node {
-	struct optimistic_spin_node *next, *prev;
-	int locked; /* 1 if lock acquired */
-	int cpu; /* encoded CPU # + 1 value */
-};
 
 struct optimistic_spin_queue {
 	/*
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d5610ad52b92..d414eef4bec6 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
  * called from interrupt context and we have preemption disabled while
  * spinning.
  */
+
+struct optimistic_spin_node {
+	struct optimistic_spin_node *next, *prev;
+	int locked; /* 1 if lock acquired */
+	int cpu; /* encoded CPU # + 1 value */
+};
+
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
 
 /*
-- 
cgit v1.2.3


From 563adbfc351b2af9f1406b809ba60b9f1673a882 Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Fri, 29 Dec 2023 20:56:03 +0000
Subject: locking/osq_lock: Clarify osq_wait_next() calling convention

osq_wait_next() is passed 'prev' from osq_lock() and NULL from
osq_unlock() but only needs the 'cpu' value to write to lock->tail.

Just pass prev->cpu or OSQ_UNLOCKED_VAL instead.

Should have no effect on the generated code since gcc manages to assume
that 'prev != NULL' due to an earlier dereference.

Signed-off-by: David Laight <david.laight@aculab.com>
[ Changed 'old' to 'old_cpu' by request from Waiman Long  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/locking/osq_lock.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d414eef4bec6..15955ce35c53 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -44,26 +44,23 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 /*
  * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
  * Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
  */
 static inline struct optimistic_spin_node *
 osq_wait_next(struct optimistic_spin_queue *lock,
 	      struct optimistic_spin_node *node,
-	      struct optimistic_spin_node *prev)
+	      int old_cpu)
 {
 	struct optimistic_spin_node *next = NULL;
 	int curr = encode_cpu(smp_processor_id());
-	int old;
-
-	/*
-	 * If there is a prev node in queue, then the 'old' value will be
-	 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
-	 * we're currently last in queue, then the queue will then become empty.
-	 */
-	old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
 
 	for (;;) {
 		if (atomic_read(&lock->tail) == curr &&
-		    atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+		    atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
 			/*
 			 * We were the last queued, we moved @lock back. @prev
 			 * will now observe @lock and will complete its
@@ -193,7 +190,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * back to @prev.
 	 */
 
-	next = osq_wait_next(lock, node, prev);
+	next = osq_wait_next(lock, node, prev->cpu);
 	if (!next)
 		return false;
 
@@ -233,7 +230,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 		return;
 	}
 
-	next = osq_wait_next(lock, node, NULL);
+	next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
 	if (next)
 		WRITE_ONCE(next->locked, 1);
 }
-- 
cgit v1.2.3


From b106bcf0f99ae0459f3c8c2f0af575ef9f5d9bde Mon Sep 17 00:00:00 2001
From: David Laight <David.Laight@ACULAB.COM>
Date: Fri, 29 Dec 2023 20:56:03 +0000
Subject: locking/osq_lock: Clarify osq_wait_next()

Directly return NULL or 'next' instead of breaking out of the loop.

Signed-off-by: David Laight <david.laight@aculab.com>
[ Split original patch into two independent parts  - Linus ]
Link: https://lore.kernel.org/lkml/7c8828aec72e42eeb841ca0ee3397e9a@AcuMS.aculab.com/
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/locking/osq_lock.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 15955ce35c53..75a6f6133866 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -55,7 +55,6 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 	      struct optimistic_spin_node *node,
 	      int old_cpu)
 {
-	struct optimistic_spin_node *next = NULL;
 	int curr = encode_cpu(smp_processor_id());
 
 	for (;;) {
@@ -66,7 +65,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 			 * will now observe @lock and will complete its
 			 * unlock()/unqueue().
 			 */
-			break;
+			return NULL;
 		}
 
 		/*
@@ -80,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 		 * wait for a new @node->next from its Step-C.
 		 */
 		if (node->next) {
+			struct optimistic_spin_node *next;
+
 			next = xchg(&node->next, NULL);
 			if (next)
-				break;
+				return next;
 		}
 
 		cpu_relax();
 	}
-
-	return next;
 }
 
 bool osq_lock(struct optimistic_spin_queue *lock)
-- 
cgit v1.2.3


From 5a0e241003b80247de59727c945bc94c848f893d Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Wed, 29 Nov 2023 09:43:28 -0300
Subject: thermal/core: Prepare for introduction of thermal reboot

Add some helper functions to make it easier introducing the support
for thermal reboot.

No functional change.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20231129124330.519423-2-festevam@gmail.com
---
 drivers/thermal/thermal_core.c | 14 ++++++++++----
 include/linux/reboot.h         |  7 ++++++-
 kernel/reboot.c                |  8 ++++----
 3 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5e5fcbd81dda..859f62e9d779 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -311,18 +311,24 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 		       def_governor->throttle(tz, trip);
 }
 
-void thermal_zone_device_critical(struct thermal_zone_device *tz)
+static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdown)
 {
 	/*
 	 * poweroff_delay_ms must be a carefully profiled positive value.
 	 * Its a must for forced_emergency_poweroff_work to be scheduled.
 	 */
 	int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS;
+	const char *msg = "Temperature too high";
+
+	dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type);
 
-	dev_emerg(&tz->device, "%s: critical temperature reached, "
-		  "shutting down\n", tz->type);
+	if (shutdown)
+		hw_protection_shutdown(msg, poweroff_delay_ms);
+}
 
-	hw_protection_shutdown("Temperature too high", poweroff_delay_ms);
+void thermal_zone_device_critical(struct thermal_zone_device *tz)
+{
+	thermal_zone_device_halt(tz, true);
 }
 EXPORT_SYMBOL(thermal_zone_device_critical);
 
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index c4cc3b89ced1..4586c663884e 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -177,7 +177,12 @@ void ctrl_alt_del(void);
 
 extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
-void hw_protection_shutdown(const char *reason, int ms_until_forced);
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown);
+
+static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
+{
+	__hw_protection_shutdown(reason, ms_until_forced, true);
+}
 
 /*
  * Emergency restart, callable from an interrupt handler.
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 395a0ea3c7a8..b236c4c06bb3 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -957,7 +957,7 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
 }
 
 /**
- * hw_protection_shutdown - Trigger an emergency system poweroff
+ * __hw_protection_shutdown - Trigger an emergency system poweroff
  *
  * @reason:		Reason of emergency shutdown to be printed.
  * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
@@ -971,7 +971,7 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
  * if the previous request has given a large timeout for forced shutdown.
  * Can be called from any context.
  */
-void hw_protection_shutdown(const char *reason, int ms_until_forced)
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
 {
 	static atomic_t allow_proceed = ATOMIC_INIT(1);
 
@@ -986,9 +986,9 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 	 * orderly_poweroff failure
 	 */
 	hw_failure_emergency_poweroff(ms_until_forced);
-	orderly_poweroff(true);
+	if (shutdown)
+		orderly_poweroff(true);
 }
-EXPORT_SYMBOL_GPL(hw_protection_shutdown);
 
 static int __init reboot_setup(char *str)
 {
-- 
cgit v1.2.3


From 79fa723ba84c2b1b3124c72df8a3b07b851a5477 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@denx.de>
Date: Wed, 29 Nov 2023 09:43:29 -0300
Subject: reboot: Introduce thermal_zone_device_critical_reboot()

Introduce thermal_zone_device_critical_reboot() to trigger an
emergency reboot.

It is a counterpart of thermal_zone_device_critical() with the
difference that it will force a reboot instead of shutdown.

The motivation for doing this is to allow the thermal subystem
to trigger a reboot when the temperature reaches the critical
temperature.

Signed-off-by: Fabio Estevam <festevam@denx.de>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20231129124330.519423-3-festevam@gmail.com
---
 drivers/thermal/thermal_core.c |  7 +++++++
 drivers/thermal/thermal_core.h |  1 +
 include/linux/reboot.h         |  5 +++++
 kernel/reboot.c                | 28 +++++++++++++++++-----------
 4 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 859f62e9d779..0d761afb7cbc 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -324,6 +324,8 @@ static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdo
 
 	if (shutdown)
 		hw_protection_shutdown(msg, poweroff_delay_ms);
+	else
+		hw_protection_reboot(msg, poweroff_delay_ms);
 }
 
 void thermal_zone_device_critical(struct thermal_zone_device *tz)
@@ -332,6 +334,11 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz)
 }
 EXPORT_SYMBOL(thermal_zone_device_critical);
 
+void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz)
+{
+	thermal_zone_device_halt(tz, false);
+}
+
 static void handle_critical_trips(struct thermal_zone_device *tz,
 				  const struct thermal_trip *trip)
 {
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index fe2917a74054..b5e6743bd157 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -114,6 +114,7 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *, char *);
 int thermal_build_list_of_policies(char *buf);
 void __thermal_zone_device_update(struct thermal_zone_device *tz,
 				  enum thermal_notify_event event);
+void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz);
 
 /* Helpers */
 #define for_each_trip(__tz, __trip)	\
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 4586c663884e..abcdde4df697 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -179,6 +179,11 @@ extern void orderly_poweroff(bool force);
 extern void orderly_reboot(void);
 void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown);
 
+static inline void hw_protection_reboot(const char *reason, int ms_until_forced)
+{
+	__hw_protection_shutdown(reason, ms_until_forced, false);
+}
+
 static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
 {
 	__hw_protection_shutdown(reason, ms_until_forced, true);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index b236c4c06bb3..35d5e0b67993 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -957,19 +957,22 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
 }
 
 /**
- * __hw_protection_shutdown - Trigger an emergency system poweroff
+ * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot
  *
- * @reason:		Reason of emergency shutdown to be printed.
- * @ms_until_forced:	Time to wait for orderly shutdown before tiggering a
- *			forced shudown. Negative value disables the forced
- *			shutdown.
+ * @reason:		Reason of emergency shutdown or reboot to be printed.
+ * @ms_until_forced:	Time to wait for orderly shutdown or reboot before
+ *			triggering it. Negative value disables the forced
+ *			shutdown or reboot.
+ * @shutdown:		If true, indicates that a shutdown will happen
+ *			after the critical tempeature is reached.
+ *			If false, indicates that a reboot will happen
+ *			after the critical tempeature is reached.
  *
- * Initiate an emergency system shutdown in order to protect hardware from
- * further damage. Usage examples include a thermal protection or a voltage or
- * current regulator failures.
- * NOTE: The request is ignored if protection shutdown is already pending even
- * if the previous request has given a large timeout for forced shutdown.
- * Can be called from any context.
+ * Initiate an emergency system shutdown or reboot in order to protect
+ * hardware from further damage. Usage examples include a thermal protection.
+ * NOTE: The request is ignored if protection shutdown or reboot is already
+ * pending even if the previous request has given a large timeout for forced
+ * shutdown/reboot.
  */
 void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
 {
@@ -988,7 +991,10 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut
 	hw_failure_emergency_poweroff(ms_until_forced);
 	if (shutdown)
 		orderly_poweroff(true);
+	else
+		orderly_reboot();
 }
+EXPORT_SYMBOL_GPL(__hw_protection_shutdown);
 
 static int __init reboot_setup(char *str)
 {
-- 
cgit v1.2.3


From 6aa09a5bccd8e224d917afdb4c278fc66aacde4d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 27 Dec 2023 21:37:02 +0100
Subject: async: Split async_schedule_node_domain()

In preparation for subsequent changes, split async_schedule_node_domain()
in two pieces so as to allow the bottom part of it to be called from a
somewhat different code path.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Tested-by: Youngmin Nam <youngmin.nam@samsung.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 kernel/async.c | 56 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index b2c4ba5686ee..cffe6b4cff9f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -145,6 +145,39 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+						   void *data, int node,
+						   struct async_domain *domain,
+						   struct async_entry *entry)
+{
+	async_cookie_t newcookie;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&entry->domain_list);
+	INIT_LIST_HEAD(&entry->global_list);
+	INIT_WORK(&entry->work, async_run_entry_fn);
+	entry->func = func;
+	entry->data = data;
+	entry->domain = domain;
+
+	spin_lock_irqsave(&async_lock, flags);
+
+	/* allocate cookie and queue */
+	newcookie = entry->cookie = next_cookie++;
+
+	list_add_tail(&entry->domain_list, &domain->pending);
+	if (domain->registered)
+		list_add_tail(&entry->global_list, &async_global_pending);
+
+	atomic_inc(&entry_count);
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* schedule for execution */
+	queue_work_node(node, system_unbound_wq, &entry->work);
+
+	return newcookie;
+}
+
 /**
  * async_schedule_node_domain - NUMA specific version of async_schedule_domain
  * @func: function to execute asynchronously
@@ -186,29 +219,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
 		func(data, newcookie);
 		return newcookie;
 	}
-	INIT_LIST_HEAD(&entry->domain_list);
-	INIT_LIST_HEAD(&entry->global_list);
-	INIT_WORK(&entry->work, async_run_entry_fn);
-	entry->func = func;
-	entry->data = data;
-	entry->domain = domain;
-
-	spin_lock_irqsave(&async_lock, flags);
 
-	/* allocate cookie and queue */
-	newcookie = entry->cookie = next_cookie++;
-
-	list_add_tail(&entry->domain_list, &domain->pending);
-	if (domain->registered)
-		list_add_tail(&entry->global_list, &async_global_pending);
-
-	atomic_inc(&entry_count);
-	spin_unlock_irqrestore(&async_lock, flags);
-
-	/* schedule for execution */
-	queue_work_node(node, system_unbound_wq, &entry->work);
-
-	return newcookie;
+	return __async_schedule_node_domain(func, data, node, domain, entry);
 }
 EXPORT_SYMBOL_GPL(async_schedule_node_domain);
 
-- 
cgit v1.2.3


From 7d4b5d7a37bdd63a5a3371b988744b060d5bb86f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 27 Dec 2023 21:38:23 +0100
Subject: async: Introduce async_schedule_dev_nocall()

In preparation for subsequent changes, introduce a specialized variant
of async_schedule_dev() that will not invoke the argument function
synchronously when it cannot be scheduled for asynchronous execution.

The new function, async_schedule_dev_nocall(), will be used for fixing
possible deadlocks in the system-wide power management core code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com> for the series.
Tested-by: Youngmin Nam <youngmin.nam@samsung.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/async.h |  2 ++
 kernel/async.c        | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/async.h b/include/linux/async.h
index cce4ad31e8fc..33c9ff4afb49 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev)
 	return async_schedule_node(func, dev, dev_to_node(dev));
 }
 
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev);
+
 /**
  * async_schedule_dev_domain - A device specific version of async_schedule_domain
  * @func: function to execute asynchronously
diff --git a/kernel/async.c b/kernel/async.c
index cffe6b4cff9f..673bba6bdf3a 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -243,6 +243,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
 }
 EXPORT_SYMBOL_GPL(async_schedule_node);
 
+/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+	struct async_entry *entry;
+
+	entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+	/* Give up if there is no memory or too much work. */
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+		kfree(entry);
+		return false;
+	}
+
+	__async_schedule_node_domain(func, dev, dev_to_node(dev),
+				     &async_dfl_domain, entry);
+	return true;
+}
+
 /**
  * async_synchronize_full - synchronize all asynchronous function calls
  *
-- 
cgit v1.2.3


From 8a021e7fa10576eeb3938328f39bbf98fe7d4715 Mon Sep 17 00:00:00 2001
From: Andrei Matei <andreimatei1@gmail.com>
Date: Thu, 21 Dec 2023 18:22:24 -0500
Subject: bpf: Simplify checking size of helper accesses

This patch simplifies the verification of size arguments associated to
pointer arguments to helpers and kfuncs. Many helpers take a pointer
argument followed by the size of the memory access performed to be
performed through that pointer. Before this patch, the handling of the
size argument in check_mem_size_reg() was confusing and wasteful: if the
size register's lower bound was 0, then the verification was done twice:
once considering the size of the access to be the lower-bound of the
respective argument, and once considering the upper bound (even if the
two are the same). The upper bound checking is a super-set of the
lower-bound checking(*), except: the only point of the lower-bound check
is to handle the case where zero-sized-accesses are explicitly not
allowed and the lower-bound is zero. This static condition is now
checked explicitly, replacing a much more complex, expensive and
confusing verification call to check_helper_mem_access().

Error messages change in this patch. Before, messages about illegal
zero-size accesses depended on the type of the pointer and on other
conditions, and sometimes the message was plain wrong: in some tests
that changed you'll see that the old message was something like "R1 min
value is outside of the allowed memory range", where R1 is the pointer
register; the error was wrongly claiming that the pointer was bad
instead of the size being bad. Other times the information that the size
came for a register with a possible range of values was wrong, and the
error presented the size as a fixed zero. Now the errors refer to the
right register. However, the old error messages did contain useful
information about the pointer register which is now lost; recovering
this information was deemed not important enough.

(*) Besides standing to reason that the checks for a bigger size access
are a super-set of the checks for a smaller size access, I have also
mechanically verified this by reading the code for all types of
pointers. I could convince myself that it's true for all but
PTR_TO_BTF_ID (check_ptr_to_btf_access). There, simply looking
line-by-line does not immediately prove what we want. If anyone has any
qualms, let me know.

Signed-off-by: Andrei Matei <andreimatei1@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20231221232225.568730-2-andreimatei1@gmail.com
---
 kernel/bpf/verifier.c                                          | 10 ++++------
 .../testing/selftests/bpf/progs/verifier_helper_value_access.c |  8 ++++----
 tools/testing/selftests/bpf/progs/verifier_raw_stack.c         |  2 +-
 3 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a376eb609c41..d4e31f61de0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7279,12 +7279,10 @@ static int check_mem_size_reg(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	if (reg->umin_value == 0) {
-		err = check_helper_mem_access(env, regno - 1, 0,
-					      zero_size_allowed,
-					      meta);
-		if (err)
-			return err;
+	if (reg->umin_value == 0 && !zero_size_allowed) {
+		verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n",
+			regno, reg->umin_value, reg->umax_value);
+		return -EACCES;
 	}
 
 	if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c
index 692216c0ad3d..3e8340c2408f 100644
--- a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c
+++ b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c
@@ -91,7 +91,7 @@ l0_%=:	exit;						\
 
 SEC("tracepoint")
 __description("helper access to map: empty range")
-__failure __msg("invalid access to map value, value_size=48 off=0 size=0")
+__failure __msg("R2 invalid zero-sized read")
 __naked void access_to_map_empty_range(void)
 {
 	asm volatile ("					\
@@ -221,7 +221,7 @@ l0_%=:	exit;						\
 
 SEC("tracepoint")
 __description("helper access to adjusted map (via const imm): empty range")
-__failure __msg("invalid access to map value, value_size=48 off=4 size=0")
+__failure __msg("R2 invalid zero-sized read")
 __naked void via_const_imm_empty_range(void)
 {
 	asm volatile ("					\
@@ -386,7 +386,7 @@ l0_%=:	exit;						\
 
 SEC("tracepoint")
 __description("helper access to adjusted map (via const reg): empty range")
-__failure __msg("R1 min value is outside of the allowed memory range")
+__failure __msg("R2 invalid zero-sized read")
 __naked void via_const_reg_empty_range(void)
 {
 	asm volatile ("					\
@@ -556,7 +556,7 @@ l0_%=:	exit;						\
 
 SEC("tracepoint")
 __description("helper access to adjusted map (via variable): empty range")
-__failure __msg("R1 min value is outside of the allowed memory range")
+__failure __msg("R2 invalid zero-sized read")
 __naked void map_via_variable_empty_range(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c
index f67390224a9c..7cc83acac727 100644
--- a/tools/testing/selftests/bpf/progs/verifier_raw_stack.c
+++ b/tools/testing/selftests/bpf/progs/verifier_raw_stack.c
@@ -64,7 +64,7 @@ __naked void load_bytes_negative_len_2(void)
 
 SEC("tc")
 __description("raw_stack: skb_load_bytes, zero len")
-__failure __msg("invalid zero-sized read")
+__failure __msg("R4 invalid zero-sized read: u64=[0,0]")
 __naked void skb_load_bytes_zero_len(void)
 {
 	asm volatile ("					\
-- 
cgit v1.2.3


From 9beda16c257d55213f70adee2f16d7f13a8502e1 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:34 -0800
Subject: bpf: Avoid unnecessary extra percpu memory allocation

Currently, for percpu memory allocation, say if the user
requests allocation size to be 32 bytes, the actually
calculated size will be 40 bytes and it further rounds
to 64 bytes, and eventually 64 bytes are allocated,
wasting 32-byte memory.

Change bpf_mem_alloc() to calculate the cache index
based on the user-provided allocation size so unnecessary
extra memory can be avoided.

Suggested-by: Hou Tao <houtao1@huawei.com>
Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231222031734.1288400-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index aa0fbf000a12..288ec4a967d0 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -833,7 +833,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
 	if (!size)
 		return NULL;
 
-	idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+	if (!ma->percpu)
+		size += LLIST_NODE_SZ;
+	idx = bpf_mem_cache_idx(size);
 	if (idx < 0)
 		return NULL;
 
-- 
cgit v1.2.3


From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:39 -0800
Subject: bpf: Add objcg to bpf_mem_alloc

The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's
are with the same objcg. This patch made such a property explicit.
The next patch will use this property to save and restore objcg
for percpu unit allocator.

Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  1 +
 kernel/bpf/memalloc.c         | 11 ++++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index bb1223b21308..acef8c808599 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -11,6 +11,7 @@ struct bpf_mem_caches;
 struct bpf_mem_alloc {
 	struct bpf_mem_caches __percpu *caches;
 	struct bpf_mem_cache __percpu *cache;
+	struct obj_cgroup *objcg;
 	bool percpu;
 	struct work_struct work;
 };
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 288ec4a967d0..4a21050f0359 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -523,6 +523,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 		if (memcg_bpf_enabled())
 			objcg = get_obj_cgroup_from_current();
 #endif
+		ma->objcg = objcg;
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(pc, cpu);
 			c->unit_size = unit_size;
@@ -542,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 #ifdef CONFIG_MEMCG_KMEM
 	objcg = get_obj_cgroup_from_current();
 #endif
+	ma->objcg = objcg;
 	for_each_possible_cpu(cpu) {
 		cc = per_cpu_ptr(pcc, cpu);
 		for (i = 0; i < NUM_CACHES; i++) {
@@ -691,9 +693,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
 			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
-		/* objcg is the same across cpus */
-		if (c->objcg)
-			obj_cgroup_put(c->objcg);
+		if (ma->objcg)
+			obj_cgroup_put(ma->objcg);
 		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 	if (ma->caches) {
@@ -709,8 +710,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
 		}
-		if (c->objcg)
-			obj_cgroup_put(c->objcg);
+		if (ma->objcg)
+			obj_cgroup_put(ma->objcg);
 		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 }
-- 
cgit v1.2.3


From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:45 -0800
Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator

Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation")
added support for non-fix-size percpu memory allocation.
Such allocation will allocate percpu memory for all buckets on all
cpus and the memory consumption is in the order to quadratic.
For example, let us say, 4 cpus, unit size 16 bytes, so each
cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes.
Then let us say, 8 cpus with the same unit size, each cpu
has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes.
So if the number of cpus doubles, the number of memory consumption
will be 4 times. So for a system with large number of cpus, the
memory consumption goes up quickly with quadratic order.
For example, for 4KB percpu allocation, 128 cpus. The total memory
consumption will 4KB * 128 * 128 = 64MB. Things will become
worse if the number of cpus is bigger (e.g., 512, 1024, etc.)

In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is
done in boot time, so for system with large number of cpus, the initial
percpu memory consumption is very visible. For example, for 128 cpu
system, the total percpu memory allocation will be at least
(16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096)
  * 128 * 128 = ~138MB.
which is pretty big. It will be even bigger for larger number of cpus.

Note that the current prefill also allocates 4 entries if the unit size
is less than 256. So on top of 138MB memory consumption, this will
add more consumption with
3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB.
Next patch will try to reduce this memory consumption.

Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory
at init stage") moved the non-fix-size percpu memory allocation
to bpf verificaiton stage. Once a particular bpf_percpu_obj_new()
is called by bpf program, the memory allocator will try to fill in
the cache with all sizes, causing the same amount of percpu memory
consumption as in the boot stage.

To reduce the initial percpu memory consumption for non-fix-size
percpu memory allocation, instead of filling the cache with all
supported allocation sizes, this patch intends to fill the cache
only for the requested size. As typically users will not use large
percpu data structure, this can save memory significantly.
For example, the allocation size is 64 bytes with 128 cpus.
Then total percpu memory amount will be 64 * 128 * 128 = 1MB,
much less than previous 138MB.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  7 ++++++
 kernel/bpf/memalloc.c         | 57 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c         | 37 +++++++++++++++++-----------
 3 files changed, 86 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index acef8c808599..aaf004d94322 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -22,8 +22,15 @@ struct bpf_mem_alloc {
  * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
  * Alloc and free are done with bpf_mem_{alloc,free}() and the size of
  * the returned object is given by the size argument of bpf_mem_alloc().
+ * If percpu equals true, error will be returned in order to avoid
+ * large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
+ * should be used to do on-demand per-cpu allocation for each size.
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
+/* Initialize a non-fix-size percpu memory allocator */
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
+/* The percpu allocation with a specific unit size. */
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
 /* kmalloc/kfree equivalent: */
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 4a21050f0359..f71da07eb8a0 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,6 +121,8 @@ struct bpf_mem_caches {
 	struct bpf_mem_cache cache[NUM_CACHES];
 };
 
+static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+
 static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 {
 	struct llist_node *entry, *next;
@@ -499,12 +501,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
  */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 {
-	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
 	struct bpf_mem_caches *cc, __percpu *pcc;
 	struct bpf_mem_cache *c, __percpu *pc;
 	struct obj_cgroup *objcg = NULL;
 	int cpu, i, unit_size, percpu_size = 0;
 
+	if (percpu && size == 0)
+		return -EINVAL;
+
 	/* room for llist_node and per-cpu pointer */
 	if (percpu)
 		percpu_size = LLIST_NODE_SZ + sizeof(void *);
@@ -524,6 +528,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			objcg = get_obj_cgroup_from_current();
 #endif
 		ma->objcg = objcg;
+
 		for_each_possible_cpu(cpu) {
 			c = per_cpu_ptr(pc, cpu);
 			c->unit_size = unit_size;
@@ -562,6 +567,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	return 0;
 }
 
+int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
+{
+	struct bpf_mem_caches __percpu *pcc;
+
+	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
+	if (!pcc)
+		return -ENOMEM;
+
+	ma->caches = pcc;
+	ma->objcg = objcg;
+	ma->percpu = true;
+	return 0;
+}
+
+int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
+{
+	struct bpf_mem_caches *cc, __percpu *pcc;
+	int cpu, i, unit_size, percpu_size;
+	struct obj_cgroup *objcg;
+	struct bpf_mem_cache *c;
+
+	i = bpf_mem_cache_idx(size);
+	if (i < 0)
+		return -EINVAL;
+
+	/* room for llist_node and per-cpu pointer */
+	percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
+	unit_size = sizes[i];
+	objcg = ma->objcg;
+	pcc = ma->caches;
+
+	for_each_possible_cpu(cpu) {
+		cc = per_cpu_ptr(pcc, cpu);
+		c = &cc->cache[i];
+		if (cpu == 0 && c->unit_size)
+			break;
+
+		c->unit_size = unit_size;
+		c->objcg = objcg;
+		c->percpu_size = percpu_size;
+		c->tgt = c;
+
+		init_refill_work(c);
+		prefill_mem_cache(c, cpu);
+	}
+
+	return 0;
+}
+
 static void drain_mem_cache(struct bpf_mem_cache *c)
 {
 	bool percpu = !!c->percpu_size;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4e31f61de0e..e9699a2cfe4f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12139,20 +12139,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
 					return -ENOMEM;
 
-				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
-					if (!bpf_global_percpu_ma_set) {
-						mutex_lock(&bpf_percpu_ma_lock);
-						if (!bpf_global_percpu_ma_set) {
-							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-							if (!err)
-								bpf_global_percpu_ma_set = true;
-						}
-						mutex_unlock(&bpf_percpu_ma_lock);
-						if (err)
-							return err;
-					}
-				}
-
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
 					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
 					return -EINVAL;
@@ -12173,6 +12159,29 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 					return -EINVAL;
 				}
 
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (!bpf_global_percpu_ma_set) {
+						mutex_lock(&bpf_percpu_ma_lock);
+						if (!bpf_global_percpu_ma_set) {
+							/* Charge memory allocated with bpf_global_percpu_ma to
+							 * root memcg. The obj_cgroup for root memcg is NULL.
+							 */
+							err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+							if (!err)
+								bpf_global_percpu_ma_set = true;
+						}
+						mutex_unlock(&bpf_percpu_ma_lock);
+						if (err)
+							return err;
+					}
+
+					mutex_lock(&bpf_percpu_ma_lock);
+					err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+					mutex_unlock(&bpf_percpu_ma_lock);
+					if (err)
+						return err;
+				}
+
 				struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
 				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
 					if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
-- 
cgit v1.2.3


From 5b95e638f134e552b5ba2976326c02babe248615 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:50 -0800
Subject: bpf: Refill only one percpu element in memalloc

Typically for percpu map element or data structure, once allocated,
most operations are lookup or in-place update. Deletion are really
rare. Currently, for percpu data strcture, 4 elements will be
refilled if the size is <= 256. Let us just do with one element
for percpu data. For example, for size 256 and 128 cpus, the
potential saving will be 3 * 256 * 128 * 128 = 12MB.

Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231222031750.1289290-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index f71da07eb8a0..a8ee6fb8401c 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -485,11 +485,16 @@ static void init_refill_work(struct bpf_mem_cache *c)
 
 static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 {
-	/* To avoid consuming memory assume that 1st run of bpf
-	 * prog won't be doing more than 4 map_update_elem from
-	 * irq disabled region
+	int cnt = 1;
+
+	/* To avoid consuming memory, for non-percpu allocation, assume that
+	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
+	 * irq disabled region if unit size is less than or equal to 256.
+	 * For all other cases, let us just do one allocation.
 	 */
-	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
+	if (!c->percpu_size && c->unit_size <= 256)
+		cnt = 4;
+	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
 }
 
 /* When size != 0 bpf_mem_cache for each cpu.
-- 
cgit v1.2.3


From 0e2ba9f96f9b82893ba19170ae48d46003f8ef44 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:17:55 -0800
Subject: bpf: Use smaller low/high marks for percpu allocation

Currently, refill low/high marks are set with the assumption
of normal non-percpu memory allocation. For example, for
an allocation size 256, for non-percpu memory allocation,
low mark is 32 and high mark is 96, resulting in the
batch allocation of 48 elements and the allocated memory
will be 48 * 256 = 12KB for this particular cpu.
Assuming an 128-cpu system, the total memory consumption
across all cpus will be 12K * 128 = 1.5MB memory.

This might be okay for non-percpu allocation, but may not be
good for percpu allocation, which will consume 1.5MB * 128 = 192MB
memory in the worst case if every cpu has a chance of memory
allocation.

In practice, percpu allocation is very rare compared to
non-percpu allocation. So let us have smaller low/high marks
which can avoid unnecessary memory consumption.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20231222031755.1289671-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index a8ee6fb8401c..460c8f38fed6 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -464,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
  * consume ~ 11 Kbyte per cpu.
  * Typical case will be between 11K and 116K closer to 11K.
  * bpf progs can and should share bpf_mem_cache when possible.
+ *
+ * Percpu allocation is typically rare. To avoid potential unnecessary large
+ * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
  */
 static void init_refill_work(struct bpf_mem_cache *c)
 {
 	init_irq_work(&c->refill_work, bpf_mem_refill);
-	if (c->unit_size <= 256) {
+	if (c->percpu_size) {
+		c->low_watermark = 1;
+		c->high_watermark = 3;
+	} else if (c->unit_size <= 256) {
 		c->low_watermark = 32;
 		c->high_watermark = 96;
 	} else {
-- 
cgit v1.2.3


From 5c1a37653260ed5d9c8b26fb7fe7b99629612982 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 21 Dec 2023 19:18:01 -0800
Subject: bpf: Limit up to 512 bytes for bpf_global_percpu_ma allocation

For percpu data structure allocation with bpf_global_percpu_ma,
the maximum data size is 4K. But for a system with large
number of cpus, bigger data size (e.g., 2K, 4K) might consume
a lot of memory. For example, the percpu memory consumption
with unit size 2K and 1024 cpus will be 2K * 1K * 1k = 2GB
memory.

We should discourage such usage. Let us limit the maximum data
size to be 512 for bpf_global_percpu_ma allocation.

Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20231222031801.1290841-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9699a2cfe4f..d5f4ff1eb235 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -195,6 +195,8 @@ struct bpf_verifier_stack_elem {
 					  POISON_POINTER_DELTA))
 #define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
 
+#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
+
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -12160,6 +12162,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				}
 
 				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+						verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+							ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+						return -EINVAL;
+					}
+
 					if (!bpf_global_percpu_ma_set) {
 						mutex_lock(&bpf_percpu_ma_lock);
 						if (!bpf_global_percpu_ma_set) {
-- 
cgit v1.2.3


From 86438841e48f6361f0a6a04805b7d7813738761f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 19 Dec 2023 14:41:42 +0100
Subject: dma-debug: make dma_debug_add_bus take a const pointer

The driver core now can handle a const struct bus_type pointer, and the
dma_debug_add_bus() call just passes on the pointer give to it to the
driver core, so make this pointer const as well to allow everyone to use
read-only struct bus_type pointers going forward.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc:  <iommu@lists.linux.dev>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/2023121941-dejected-nugget-681e@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/dma-map-ops.h | 4 ++--
 kernel/dma/debug.c          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index f2fc203fb8a1..e401f824a007 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -443,10 +443,10 @@ static inline void arch_teardown_dma_ops(struct device *dev)
 #endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */
 
 #ifdef CONFIG_DMA_API_DEBUG
-void dma_debug_add_bus(struct bus_type *bus);
+void dma_debug_add_bus(const struct bus_type *bus);
 void debug_dma_dump_mappings(struct device *dev);
 #else
-static inline void dma_debug_add_bus(struct bus_type *bus)
+static inline void dma_debug_add_bus(const struct bus_type *bus)
 {
 }
 static inline void debug_dma_dump_mappings(struct device *dev)
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 3de494375b7b..1a5c86dd87d5 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -876,7 +876,7 @@ static int dma_debug_device_change(struct notifier_block *nb, unsigned long acti
 	return 0;
 }
 
-void dma_debug_add_bus(struct bus_type *bus)
+void dma_debug_add_bus(const struct bus_type *bus)
 {
 	struct notifier_block *nb;
 
-- 
cgit v1.2.3


From 9ddf872b47e3ac8f27dbfc4a4737a976c7588de6 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 4 Jan 2024 08:57:44 -0800
Subject: bpf: Remove unnecessary cpu == 0 check in memalloc

After merging the patch set [1] to reduce memory usage
for bpf_global_percpu_ma, Alexei found a redundant check (cpu == 0)
in function bpf_mem_alloc_percpu_unit_init() ([2]).
Indeed, the check is unnecessary since c->unit_size will
be all NULL or all non-NULL for all cpus before
for_each_possible_cpu() loop.
Removing the check makes code less confusing.

  [1] https://lore.kernel.org/all/20231222031729.1287957-1-yonghong.song@linux.dev/
  [2] https://lore.kernel.org/all/20231222031745.1289082-1-yonghong.song@linux.dev/

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240104165744.702239-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 460c8f38fed6..550f02e2cb13 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -613,7 +613,7 @@ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
 	for_each_possible_cpu(cpu) {
 		cc = per_cpu_ptr(pcc, cpu);
 		c = &cc->cache[i];
-		if (cpu == 0 && c->unit_size)
+		if (c->unit_size)
 			break;
 
 		c->unit_size = unit_size;
-- 
cgit v1.2.3


From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 3 Jan 2024 20:05:44 +0100
Subject: bpf: Relax tracing prog recursive attach rules

Currently, it's not allowed to attach an fentry/fexit prog to another
one fentry/fexit. At the same time it's not uncommon to see a tracing
program with lots of logic in use, and the attachment limitation
prevents usage of fentry/fexit for performance analysis (e.g. with
"bpftool prog profile" command) in this case. An example could be
falcosecurity libs project that uses tp_btf tracing programs.

Following the corresponding discussion [1], the reason for that is to
avoid tracing progs call cycles without introducing more complex
solutions. But currently it seems impossible to load and attach tracing
programs in a way that will form such a cycle. The limitation is coming
from the fact that attach_prog_fd is specified at the prog load (thus
making it impossible to attach to a program loaded after it in this
way), as well as tracing progs not implementing link_detach.

Replace "no same type" requirement with verification that no more than
one level of attachment nesting is allowed. In this way only one
fentry/fexit program could be attached to another fentry/fexit to cover
profiling use case, and still no cycle could be formed. To implement,
add a new field into bpf_prog_aux to track nested attachment for tracing
programs.

[1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/

Acked-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/syscall.c  | 23 ++++++++++++++++++++++-
 kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7671530d6e4e..e30100597d0a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1449,6 +1449,7 @@ struct bpf_prog_aux {
 	bool dev_bound; /* Program is bound to the netdev. */
 	bool offload_requested; /* Program is bound and offloaded to the netdev. */
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	bool attach_tracing_prog; /* true if tracing another tracing program */
 	bool func_proto_unreliable;
 	bool sleepable;
 	bool tail_call_reachable;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1bf9805ee185..d15f9998bc20 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2738,6 +2738,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 			goto free_prog_sec;
 	}
 
+	/*
+	 * Bookkeeping for managing the program attachment chain.
+	 *
+	 * It might be tempting to set attach_tracing_prog flag at the attachment
+	 * time, but this will not prevent from loading bunch of tracing prog
+	 * first, then attach them one to another.
+	 *
+	 * The flag attach_tracing_prog is set for the whole program lifecycle, and
+	 * doesn't have to be cleared in bpf_tracing_link_release, since tracing
+	 * programs cannot change attachment target.
+	 */
+	if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
+	    dst_prog->type == BPF_PROG_TYPE_TRACING) {
+		prog->aux->attach_tracing_prog = true;
+	}
+
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
@@ -3171,7 +3187,12 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	}
 
 	if (tgt_prog_fd) {
-		/* For now we only allow new targets for BPF_PROG_TYPE_EXT */
+		/*
+		 * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
+		 * part would be changed to implement the same for
+		 * BPF_PROG_TYPE_TRACING, do not forget to update the way how
+		 * attach_tracing_prog flag is set.
+		 */
 		if (prog->type != BPF_PROG_TYPE_EXT) {
 			err = -EINVAL;
 			goto out_put_prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d5f4ff1eb235..adbf330d364b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20317,6 +20317,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    struct bpf_attach_target_info *tgt_info)
 {
 	bool prog_extension = prog->type == BPF_PROG_TYPE_EXT;
+	bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING;
 	const char prefix[] = "btf_trace_";
 	int ret = 0, subprog = -1, i;
 	const struct btf_type *t;
@@ -20387,10 +20388,21 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			bpf_log(log, "Can attach to only JITed progs\n");
 			return -EINVAL;
 		}
-		if (tgt_prog->type == prog->type) {
-			/* Cannot fentry/fexit another fentry/fexit program.
-			 * Cannot attach program extension to another extension.
-			 * It's ok to attach fentry/fexit to extension program.
+		if (prog_tracing) {
+			if (aux->attach_tracing_prog) {
+				/*
+				 * Target program is an fentry/fexit which is already attached
+				 * to another tracing program. More levels of nesting
+				 * attachment are not allowed.
+				 */
+				bpf_log(log, "Cannot nest tracing program attach more than once\n");
+				return -EINVAL;
+			}
+		} else if (tgt_prog->type == prog->type) {
+			/*
+			 * To avoid potential call chain cycles, prevent attaching of a
+			 * program extension to another extension. It's ok to attach
+			 * fentry/fexit to extension program.
 			 */
 			bpf_log(log, "Cannot recursively attach\n");
 			return -EINVAL;
@@ -20403,16 +20415,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			 * except fentry/fexit. The reason is the following.
 			 * The fentry/fexit programs are used for performance
 			 * analysis, stats and can be attached to any program
-			 * type except themselves. When extension program is
-			 * replacing XDP function it is necessary to allow
-			 * performance analysis of all functions. Both original
-			 * XDP program and its program extension. Hence
-			 * attaching fentry/fexit to BPF_PROG_TYPE_EXT is
-			 * allowed. If extending of fentry/fexit was allowed it
-			 * would be possible to create long call chain
-			 * fentry->extension->fentry->extension beyond
-			 * reasonable stack size. Hence extending fentry is not
-			 * allowed.
+			 * type. When extension program is replacing XDP function
+			 * it is necessary to allow performance analysis of all
+			 * functions. Both original XDP program and its program
+			 * extension. Hence attaching fentry/fexit to
+			 * BPF_PROG_TYPE_EXT is allowed. If extending of
+			 * fentry/fexit was allowed it would be possible to create
+			 * long call chain fentry->extension->fentry->extension
+			 * beyond reasonable stack size. Hence extending fentry
+			 * is not allowed.
 			 */
 			bpf_log(log, "Cannot extend fentry/fexit\n");
 			return -EINVAL;
-- 
cgit v1.2.3


From 715d82ba636cb3629a6e18a33bb9dbe53f9936ee Mon Sep 17 00:00:00 2001
From: Jiri Olsa <olsajiri@gmail.com>
Date: Wed, 3 Jan 2024 20:05:46 +0100
Subject: bpf: Fix re-attachment branch in bpf_tracing_prog_attach

The following case can cause a crash due to missing attach_btf:

1) load rawtp program
2) load fentry program with rawtp as target_fd
3) create tracing link for fentry program with target_fd = 0
4) repeat 3

In the end we have:

- prog->aux->dst_trampoline == NULL
- tgt_prog == NULL (because we did not provide target_fd to link_create)
- prog->aux->attach_btf == NULL (the program was loaded with attach_prog_fd=X)
- the program was loaded for tgt_prog but we have no way to find out which one

    BUG: kernel NULL pointer dereference, address: 0000000000000058
    Call Trace:
     <TASK>
     ? __die+0x20/0x70
     ? page_fault_oops+0x15b/0x430
     ? fixup_exception+0x22/0x330
     ? exc_page_fault+0x6f/0x170
     ? asm_exc_page_fault+0x22/0x30
     ? bpf_tracing_prog_attach+0x279/0x560
     ? btf_obj_id+0x5/0x10
     bpf_tracing_prog_attach+0x439/0x560
     __sys_bpf+0x1cf4/0x2de0
     __x64_sys_bpf+0x1c/0x30
     do_syscall_64+0x41/0xf0
     entry_SYSCALL_64_after_hwframe+0x6e/0x76

Return -EINVAL in this situation.

Fixes: f3a95075549e0 ("bpf: Allow trampoline re-attach for tracing and lsm programs")
Cc: stable@vger.kernel.org
Signed-off-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Jiri Olsa <olsajiri@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Link: https://lore.kernel.org/r/20240103190559.14750-4-9erthalion6@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d15f9998bc20..a1f18681721c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3237,6 +3237,10 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	 *
 	 * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
 	 *   was detached and is going for re-attachment.
+	 *
+	 * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
+	 *   are NULL, then program was already attached and user did not provide
+	 *   tgt_prog_fd so we have no way to find out or create trampoline
 	 */
 	if (!prog->aux->dst_trampoline && !tgt_prog) {
 		/*
@@ -3250,6 +3254,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 			err = -EINVAL;
 			goto out_unlock;
 		}
+		/* We can allow re-attach only if we have valid attach_btf. */
+		if (!prog->aux->attach_btf) {
+			err = -EINVAL;
+			goto out_unlock;
+		}
 		btf_id = prog->aux->attach_btf_id;
 		key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id);
 	}
-- 
cgit v1.2.3


From 61dd3f246b3adaabff3241c586f2210ac91b05a4 Mon Sep 17 00:00:00 2001
From: Kinsey Ho <kinseyho@google.com>
Date: Wed, 27 Dec 2023 14:12:02 +0000
Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU

Add CONFIG_LRU_GEN_WALKS_MMU such that if disabled, the code that
walks page tables to promote pages into the youngest generation will
not be built.

Also improves code readability by adding two helper functions
get_mm_state() and get_next_mm().

Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@google.com
Signed-off-by: Kinsey Ho <kinseyho@google.com>
Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.vnet.ibm.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |   2 +-
 include/linux/mm_types.h   |  12 ++-
 include/linux/mmzone.h     |   2 +
 kernel/fork.c              |   2 +-
 mm/Kconfig                 |   4 +
 mm/vmscan.c                | 192 +++++++++++++++++++++++++++++----------------
 6 files changed, 139 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5de775e6cdd9..20ff87f8e001 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -330,7 +330,7 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a66534c78c4d..552fa2d11c57 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -958,7 +958,7 @@ struct mm_struct {
 		 */
 		unsigned long ksm_zero_pages;
 #endif /* CONFIG_KSM */
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 		struct {
 			/* this mm_struct is on lru_gen_mm_list */
 			struct list_head list;
@@ -973,7 +973,7 @@ struct mm_struct {
 			struct mem_cgroup *memcg;
 #endif
 		} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
 	} __randomize_layout;
 
 	/*
@@ -1011,6 +1011,10 @@ struct lru_gen_mm_list {
 	spinlock_t lock;
 };
 
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
 void lru_gen_add_mm(struct mm_struct *mm);
 void lru_gen_del_mm(struct mm_struct *mm);
 #ifdef CONFIG_MEMCG
@@ -1036,7 +1040,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 	WRITE_ONCE(mm->lru_gen.bitmap, -1);
 }
 
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
 
 static inline void lru_gen_add_mm(struct mm_struct *mm)
 {
@@ -1060,7 +1064,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 {
 }
 
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
 
 struct vma_iterator {
 	struct ma_state mas;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2efd3be484fd..bc3f63ec4291 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -640,9 +640,11 @@ struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
 	struct lru_gen_folio		lrugen;
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
 	/* to concurrently iterate lru_gen_mm_list */
 	struct lru_gen_mm_state		mm_state;
 #endif
+#endif /* CONFIG_LRU_GEN */
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 93924392a5c3..56cf276432c8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2946,7 +2946,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 		get_task_struct(p);
 	}
 
-	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+	if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
 		/* lock the task to synchronize with memcg migration */
 		task_lock(p);
 		lru_gen_add_mm(p->mm);
diff --git a/mm/Kconfig b/mm/Kconfig
index b072664b889a..79d563d8f9e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1274,6 +1274,10 @@ config LRU_GEN_STATS
 	  from evicted generations for debugging purpose.
 
 	  This option has a per-memcg and per-node memory overhead.
+
+config LRU_GEN_WALKS_MMU
+	def_bool y
+	depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG
 # }
 
 config ARCH_SUPPORTS_PER_VMA_LOCK
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b4ca3563bcf4..aa7ea09ffb4c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2671,13 +2671,14 @@ static void get_item_key(void *item, int *key)
 	key[1] = hash >> BLOOM_FILTER_SHIFT;
 }
 
-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+			      void *item)
 {
 	int key[2];
 	unsigned long *filter;
 	int gen = filter_gen_from_seq(seq);
 
-	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+	filter = READ_ONCE(mm_state->filters[gen]);
 	if (!filter)
 		return true;
 
@@ -2686,13 +2687,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it
 	return test_bit(key[0], filter) && test_bit(key[1], filter);
 }
 
-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+				void *item)
 {
 	int key[2];
 	unsigned long *filter;
 	int gen = filter_gen_from_seq(seq);
 
-	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+	filter = READ_ONCE(mm_state->filters[gen]);
 	if (!filter)
 		return;
 
@@ -2704,12 +2706,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *
 		set_bit(key[1], filter);
 }
 
-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq)
 {
 	unsigned long *filter;
 	int gen = filter_gen_from_seq(seq);
 
-	filter = lruvec->mm_state.filters[gen];
+	filter = mm_state->filters[gen];
 	if (filter) {
 		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
 		return;
@@ -2717,13 +2719,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
 
 	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
 			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
-	WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+	WRITE_ONCE(mm_state->filters[gen], filter);
 }
 
 /******************************************************************************
  *                          mm_struct list
  ******************************************************************************/
 
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
 {
 	static struct lru_gen_mm_list mm_list = {
@@ -2740,6 +2744,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
 	return &mm_list;
 }
 
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+	return &lruvec->mm_state;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+	int key;
+	struct mm_struct *mm;
+	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
+
+	mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+	key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+		return NULL;
+
+	clear_bit(key, &mm->lru_gen.bitmap);
+
+	return mmget_not_zero(mm) ? mm : NULL;
+}
+
 void lru_gen_add_mm(struct mm_struct *mm)
 {
 	int nid;
@@ -2755,10 +2782,11 @@ void lru_gen_add_mm(struct mm_struct *mm)
 
 	for_each_node_state(nid, N_MEMORY) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 		/* the first addition since the last iteration */
-		if (lruvec->mm_state.tail == &mm_list->fifo)
-			lruvec->mm_state.tail = &mm->lru_gen.list;
+		if (mm_state->tail == &mm_list->fifo)
+			mm_state->tail = &mm->lru_gen.list;
 	}
 
 	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
@@ -2784,14 +2812,15 @@ void lru_gen_del_mm(struct mm_struct *mm)
 
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 		/* where the current iteration continues after */
-		if (lruvec->mm_state.head == &mm->lru_gen.list)
-			lruvec->mm_state.head = lruvec->mm_state.head->prev;
+		if (mm_state->head == &mm->lru_gen.list)
+			mm_state->head = mm_state->head->prev;
 
 		/* where the last iteration ended before */
-		if (lruvec->mm_state.tail == &mm->lru_gen.list)
-			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+		if (mm_state->tail == &mm->lru_gen.list)
+			mm_state->tail = mm_state->tail->next;
 	}
 
 	list_del_init(&mm->lru_gen.list);
@@ -2834,10 +2863,30 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
 }
 #endif
 
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+	return NULL;
+}
+
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+	return NULL;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+	return NULL;
+}
+
+#endif
+
 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
 {
 	int i;
 	int hist;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
 
@@ -2845,44 +2894,20 @@ static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 		hist = lru_hist_from_seq(walk->max_seq);
 
 		for (i = 0; i < NR_MM_STATS; i++) {
-			WRITE_ONCE(lruvec->mm_state.stats[hist][i],
-				   lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+			WRITE_ONCE(mm_state->stats[hist][i],
+				   mm_state->stats[hist][i] + walk->mm_stats[i]);
 			walk->mm_stats[i] = 0;
 		}
 	}
 
 	if (NR_HIST_GENS > 1 && last) {
-		hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+		hist = lru_hist_from_seq(mm_state->seq + 1);
 
 		for (i = 0; i < NR_MM_STATS; i++)
-			WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+			WRITE_ONCE(mm_state->stats[hist][i], 0);
 	}
 }
 
-static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
-{
-	int type;
-	unsigned long size = 0;
-	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
-	int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
-
-	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
-		return true;
-
-	clear_bit(key, &mm->lru_gen.bitmap);
-
-	for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
-		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
-			       get_mm_counter(mm, MM_ANONPAGES) +
-			       get_mm_counter(mm, MM_SHMEMPAGES);
-	}
-
-	if (size < MIN_LRU_BATCH)
-		return true;
-
-	return !mmget_not_zero(mm);
-}
-
 static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 			    struct mm_struct **iter)
 {
@@ -2891,7 +2916,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 	struct mm_struct *mm = NULL;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
-	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	/*
 	 * mm_state->seq is incremented after each iteration of mm_list. There
@@ -2929,11 +2954,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
 			mm_state->tail = mm_state->head->next;
 			walk->force_scan = true;
 		}
-
-		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
-		if (should_skip_mm(mm, walk))
-			mm = NULL;
-	} while (!mm);
+	} while (!(mm = get_next_mm(walk)));
 done:
 	if (*iter || last)
 		reset_mm_stats(lruvec, walk, last);
@@ -2941,7 +2962,7 @@ done:
 	spin_unlock(&mm_list->lock);
 
 	if (mm && first)
-		reset_bloom_filter(lruvec, walk->max_seq + 1);
+		reset_bloom_filter(mm_state, walk->max_seq + 1);
 
 	if (*iter)
 		mmput_async(*iter);
@@ -2956,7 +2977,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
 	bool success = false;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
-	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	spin_lock(&mm_list->lock);
 
@@ -3469,6 +3490,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
 	unsigned long first = -1;
 	struct lru_gen_mm_walk *walk = args->private;
+	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
 
 	VM_WARN_ON_ONCE(pud_leaf(*pud));
 
@@ -3520,7 +3542,7 @@ restart:
 			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
 		}
 
-		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+		if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
 			continue;
 
 		walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -3531,7 +3553,7 @@ restart:
 		walk->mm_stats[MM_NONLEAF_ADDED]++;
 
 		/* carry over to the next generation */
-		update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+		update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
 	}
 
 	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -3738,16 +3760,25 @@ next:
 	return success;
 }
 
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+			bool can_swap, bool force_scan)
 {
+	bool success;
 	int prev, next;
 	int type, zone;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 restart:
+	if (max_seq < READ_ONCE(lrugen->max_seq))
+		return false;
+
 	spin_lock_irq(&lruvec->lru_lock);
 
 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
 
+	success = max_seq == lrugen->max_seq;
+	if (!success)
+		goto unlock;
+
 	for (type = ANON_AND_FILE - 1; type >= 0; type--) {
 		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
 			continue;
@@ -3791,8 +3822,10 @@ restart:
 	WRITE_ONCE(lrugen->timestamps[next], jiffies);
 	/* make sure preceding modifications appear */
 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-
+unlock:
 	spin_unlock_irq(&lruvec->lru_lock);
+
+	return success;
 }
 
 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
@@ -3802,14 +3835,16 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	struct lru_gen_mm_walk *walk;
 	struct mm_struct *mm = NULL;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
 
+	if (!mm_state)
+		return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+
 	/* see the comment in iterate_mm_list() */
-	if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
-		success = false;
-		goto done;
-	}
+	if (max_seq <= READ_ONCE(mm_state->seq))
+		return false;
 
 	/*
 	 * If the hardware doesn't automatically set the accessed bit, fallback
@@ -3839,8 +3874,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 			walk_mm(lruvec, mm, walk);
 	} while (mm);
 done:
-	if (success)
-		inc_max_seq(lruvec, can_swap, force_scan);
+	if (success) {
+		success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+		WARN_ON_ONCE(!success);
+	}
 
 	return success;
 }
@@ -3964,6 +4001,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	struct mem_cgroup *memcg = folio_memcg(folio);
 	struct pglist_data *pgdat = folio_pgdat(folio);
 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
 	int old_gen, new_gen = lru_gen_from_seq(max_seq);
 
@@ -4042,8 +4080,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	mem_cgroup_unlock_pages();
 
 	/* feedback from rmap walkers to page table walkers */
-	if (suitable_to_scan(i, young))
-		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+	if (mm_state && suitable_to_scan(i, young))
+		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
 }
 
 /******************************************************************************
@@ -5219,6 +5257,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 	int type, tier;
 	int hist = lru_hist_from_seq(seq);
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
 		seq_printf(m, "            %10d", tier);
@@ -5244,6 +5283,9 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 		seq_putc(m, '\n');
 	}
 
+	if (!mm_state)
+		return;
+
 	seq_puts(m, "                      ");
 	for (i = 0; i < NR_MM_STATS; i++) {
 		const char *s = "      ";
@@ -5251,10 +5293,10 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 
 		if (seq == max_seq && NR_HIST_GENS == 1) {
 			s = "LOYNFA";
-			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+			n = READ_ONCE(mm_state->stats[hist][i]);
 		} else if (seq != max_seq && NR_HIST_GENS > 1) {
 			s = "loynfa";
-			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+			n = READ_ONCE(mm_state->stats[hist][i]);
 		}
 
 		seq_printf(m, " %10lu%c", n, s[i]);
@@ -5523,6 +5565,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	int i;
 	int gen, type, zone;
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
@@ -5533,7 +5576,8 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
 
-	lruvec->mm_state.seq = MIN_NR_GENS;
+	if (mm_state)
+		mm_state->seq = MIN_NR_GENS;
 }
 
 #ifdef CONFIG_MEMCG
@@ -5552,28 +5596,38 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
-	INIT_LIST_HEAD(&memcg->mm_list.fifo);
-	spin_lock_init(&memcg->mm_list.lock);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+	if (!mm_list)
+		return;
+
+	INIT_LIST_HEAD(&mm_list->fifo);
+	spin_lock_init(&mm_list->lock);
 }
 
 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 {
 	int i;
 	int nid;
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 
-	VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+	VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
 
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
 
 		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
 					   sizeof(lruvec->lrugen.nr_pages)));
 
 		lruvec->lrugen.list.next = LIST_POISON1;
 
+		if (!mm_state)
+			continue;
+
 		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
-			bitmap_free(lruvec->mm_state.filters[i]);
-			lruvec->mm_state.filters[i] = NULL;
+			bitmap_free(mm_state->filters[i]);
+			mm_state->filters[i] = NULL;
 		}
 	}
 }
-- 
cgit v1.2.3


From 6dff315972640bfe542e2d044933751afd8e6c4a Mon Sep 17 00:00:00 2001
From: Yuntao Wang <ytcoode@gmail.com>
Date: Tue, 2 Jan 2024 22:49:05 +0800
Subject: crash_core: fix and simplify the logic of crash_exclude_mem_range()

The purpose of crash_exclude_mem_range() is to remove all memory ranges
that overlap with [mstart-mend].  However, the current logic only removes
the first overlapping memory range.

Commit a2e9a95d2190 ("kexec: Improve & fix crash_exclude_mem_range() to
handle overlapping ranges") attempted to address this issue, but it did
not fix all error cases.

Let's fix and simplify the logic of crash_exclude_mem_range().

Link: https://lkml.kernel.org/r/20240102144905.110047-4-ytcoode@gmail.com
Signed-off-by: Yuntao Wang <ytcoode@gmail.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 80 +++++++++++++++++++----------------------------------
 1 file changed, 29 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6f074e112c1e..62e0227d390e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -566,9 +566,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
 int crash_exclude_mem_range(struct crash_mem *mem,
 			    unsigned long long mstart, unsigned long long mend)
 {
-	int i, j;
+	int i;
 	unsigned long long start, end, p_start, p_end;
-	struct range temp_range = {0, 0};
 
 	for (i = 0; i < mem->nr_ranges; i++) {
 		start = mem->ranges[i].start;
@@ -576,72 +575,51 @@ int crash_exclude_mem_range(struct crash_mem *mem,
 		p_start = mstart;
 		p_end = mend;
 
-		if (mstart > end || mend < start)
+		if (p_start > end)
 			continue;
 
+		/*
+		 * Because the memory ranges in mem->ranges are stored in
+		 * ascending order, when we detect `p_end < start`, we can
+		 * immediately exit the for loop, as the subsequent memory
+		 * ranges will definitely be outside the range we are looking
+		 * for.
+		 */
+		if (p_end < start)
+			break;
+
 		/* Truncate any area outside of range */
-		if (mstart < start)
+		if (p_start < start)
 			p_start = start;
-		if (mend > end)
+		if (p_end > end)
 			p_end = end;
 
 		/* Found completely overlapping range */
 		if (p_start == start && p_end == end) {
-			mem->ranges[i].start = 0;
-			mem->ranges[i].end = 0;
-			if (i < mem->nr_ranges - 1) {
-				/* Shift rest of the ranges to left */
-				for (j = i; j < mem->nr_ranges - 1; j++) {
-					mem->ranges[j].start =
-						mem->ranges[j+1].start;
-					mem->ranges[j].end =
-							mem->ranges[j+1].end;
-				}
-
-				/*
-				 * Continue to check if there are another overlapping ranges
-				 * from the current position because of shifting the above
-				 * mem ranges.
-				 */
-				i--;
-				mem->nr_ranges--;
-				continue;
-			}
+			memmove(&mem->ranges[i], &mem->ranges[i + 1],
+				(mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+			i--;
 			mem->nr_ranges--;
-			return 0;
-		}
-
-		if (p_start > start && p_end < end) {
+		} else if (p_start > start && p_end < end) {
 			/* Split original range */
+			if (mem->nr_ranges >= mem->max_nr_ranges)
+				return -ENOMEM;
+
+			memmove(&mem->ranges[i + 2], &mem->ranges[i + 1],
+				(mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+
 			mem->ranges[i].end = p_start - 1;
-			temp_range.start = p_end + 1;
-			temp_range.end = end;
+			mem->ranges[i + 1].start = p_end + 1;
+			mem->ranges[i + 1].end = end;
+
+			i++;
+			mem->nr_ranges++;
 		} else if (p_start != start)
 			mem->ranges[i].end = p_start - 1;
 		else
 			mem->ranges[i].start = p_end + 1;
-		break;
-	}
-
-	/* If a split happened, add the split to array */
-	if (!temp_range.end)
-		return 0;
-
-	/* Split happened */
-	if (i == mem->max_nr_ranges - 1)
-		return -ENOMEM;
-
-	/* Location where new range should go */
-	j = i + 1;
-	if (j < mem->nr_ranges) {
-		/* Move over all ranges one slot towards the end */
-		for (i = mem->nr_ranges - 1; i >= j; i--)
-			mem->ranges[i + 1] = mem->ranges[i];
 	}
 
-	mem->ranges[j].start = temp_range.start;
-	mem->ranges[j].end = temp_range.end;
-	mem->nr_ranges++;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4f1991a92cfe89096b2d1f5583a2e093bdd55c37 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Sun, 7 Jan 2024 20:32:58 -0500
Subject: tracing histograms: Simplify parse_actions() function

The parse_actions() function uses 'len = str_has_prefix()' to test which
action is in the string being parsed. But then it goes and repeats the
logic for each different action. This logic can be simplified and
duplicate code can be removed as 'len' contains the length of the found
prefix which should be used for all actions.

Link: https://lore.kernel.org/all/20240107112044.6702cb66@gandalf.local.home/
Link: https://lore.kernel.org/linux-trace-kernel/20240107203258.37e26d2b@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andy Shevchenko <andy@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 49 ++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5ecf3c8bde20..6ece1308d36a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data)
 	int len;
 
 	for (i = 0; i < hist_data->attrs->n_actions; i++) {
+		enum handler_id hid = 0;
+		char *action_str;
+
 		str = hist_data->attrs->action_str[i];
 
-		if ((len = str_has_prefix(str, "onmatch("))) {
-			char *action_str = str + len;
+		if ((len = str_has_prefix(str, "onmatch(")))
+			hid = HANDLER_ONMATCH;
+		else if ((len = str_has_prefix(str, "onmax(")))
+			hid = HANDLER_ONMAX;
+		else if ((len = str_has_prefix(str, "onchange(")))
+			hid = HANDLER_ONCHANGE;
 
-			data = onmatch_parse(tr, action_str);
-			if (IS_ERR(data)) {
-				ret = PTR_ERR(data);
-				break;
-			}
-		} else if ((len = str_has_prefix(str, "onmax("))) {
-			char *action_str = str + len;
+		action_str = str + len;
 
-			data = track_data_parse(hist_data, action_str,
-						HANDLER_ONMAX);
-			if (IS_ERR(data)) {
-				ret = PTR_ERR(data);
-				break;
-			}
-		} else if ((len = str_has_prefix(str, "onchange("))) {
-			char *action_str = str + len;
+		switch (hid) {
+		case HANDLER_ONMATCH:
+			data = onmatch_parse(tr, action_str);
+			break;
+		case HANDLER_ONMAX:
+		case HANDLER_ONCHANGE:
+			data = track_data_parse(hist_data, action_str, hid);
+			break;
+		default:
+			data = ERR_PTR(-EINVAL);
+			break;
+		}
 
-			data = track_data_parse(hist_data, action_str,
-						HANDLER_ONCHANGE);
-			if (IS_ERR(data)) {
-				ret = PTR_ERR(data);
-				break;
-			}
-		} else {
-			ret = -EINVAL;
+		if (IS_ERR(data)) {
+			ret = PTR_ERR(data);
 			break;
 		}
 
-- 
cgit v1.2.3


From fd37721803c6e73619108f76ad2e12a9aa5fafaf Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Dec 2023 17:47:03 +0300
Subject: mm, treewide: introduce NR_PAGE_ORDERS

NR_PAGE_ORDERS defines the number of page orders supported by the page
allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total.

NR_PAGE_ORDERS assists in defining arrays of page orders and allows for
more natural iteration over them.

[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning]
  Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box
Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst |  6 +++---
 arch/arm64/kvm/hyp/include/nvhe/gfp.h          |  2 +-
 arch/sparc/kernel/traps_64.c                   |  2 +-
 drivers/gpu/drm/ttm/tests/ttm_device_test.c    |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c                 | 20 ++++++++++----------
 include/drm/ttm/ttm_pool.h                     |  2 +-
 include/linux/mmzone.h                         |  6 ++++--
 kernel/crash_core.c                            |  2 +-
 lib/test_meminit.c                             |  2 +-
 mm/compaction.c                                |  2 +-
 mm/kmsan/init.c                                |  2 +-
 mm/page_alloc.c                                | 13 ++++++-------
 mm/page_reporting.c                            |  2 +-
 mm/show_mem.c                                  |  8 ++++----
 mm/vmstat.c                                    | 12 ++++++------
 15 files changed, 42 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 78e4d2e7ba14..3f8769e46b07 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -172,7 +172,7 @@ variables.
 Offset of the free_list's member. This value is used to compute the number
 of free pages.
 
-Each zone has a free_area structure array called free_area[MAX_ORDER + 1].
+Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS].
 The free_list represents a linked list of free page blocks.
 
 (list_head, next|prev)
@@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific
 information. Makedumpfile gets the start address of the vmalloc region
 from this.
 
-(zone.free_area, MAX_ORDER + 1)
--------------------------------
+(zone.free_area, NR_PAGE_ORDERS)
+--------------------------------
 
 Free areas descriptor. User-space tools use this value to iterate the
 free_area ranges. MAX_ORDER is used by the zone buddy allocator.
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index fe5472a184a3..97c527ef53c2 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -16,7 +16,7 @@ struct hyp_pool {
 	 * API at EL2.
 	 */
 	hyp_spinlock_t lock;
-	struct list_head free_area[MAX_ORDER + 1];
+	struct list_head free_area[NR_PAGE_ORDERS];
 	phys_addr_t range_start;
 	phys_addr_t range_end;
 	unsigned short max_order;
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index 08ffd17d5ec3..523a6e5ee925 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void)
 
 	/* Now allocate error trap reporting scoreboard. */
 	sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info));
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		if ((PAGE_SIZE << order) >= sz)
 			break;
 	}
diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c
index b1b423b68cdf..19eaff22e6ae 100644
--- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c
+++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c
@@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test)
 
 	if (params->pools_init_expected) {
 		for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) {
-			for (int j = 0; j <= MAX_ORDER; ++j) {
+			for (int j = 0; j < NR_PAGE_ORDERS; ++j) {
 				pt = pool->caching[i].orders[j];
 				KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool);
 				KUNIT_EXPECT_EQ(test, pt.caching, i);
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index fe610a3cace0..d183bb97c526 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644);
 
 static atomic_long_t allocated_pages;
 
-static struct ttm_pool_type global_write_combined[MAX_ORDER + 1];
-static struct ttm_pool_type global_uncached[MAX_ORDER + 1];
+static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS];
+static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
 
-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1];
-static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1];
+static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS];
+static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS];
 
 static spinlock_t shrinker_lock;
 static struct list_head shrinker_list;
@@ -568,7 +568,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
 
 	if (use_dma_alloc || nid != NUMA_NO_NODE) {
 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
-			for (j = 0; j <= MAX_ORDER; ++j)
+			for (j = 0; j < NR_PAGE_ORDERS; ++j)
 				ttm_pool_type_init(&pool->caching[i].orders[j],
 						   pool, i, j);
 	}
@@ -601,7 +601,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
 
 	if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) {
 		for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i)
-			for (j = 0; j <= MAX_ORDER; ++j)
+			for (j = 0; j < NR_PAGE_ORDERS; ++j)
 				ttm_pool_type_fini(&pool->caching[i].orders[j]);
 	}
 
@@ -656,7 +656,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m)
 	unsigned int i;
 
 	seq_puts(m, "\t ");
-	for (i = 0; i <= MAX_ORDER; ++i)
+	for (i = 0; i < NR_PAGE_ORDERS; ++i)
 		seq_printf(m, " ---%2u---", i);
 	seq_puts(m, "\n");
 }
@@ -667,7 +667,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt,
 {
 	unsigned int i;
 
-	for (i = 0; i <= MAX_ORDER; ++i)
+	for (i = 0; i < NR_PAGE_ORDERS; ++i)
 		seq_printf(m, " %8u", ttm_pool_type_count(&pt[i]));
 	seq_puts(m, "\n");
 }
@@ -776,7 +776,7 @@ int ttm_pool_mgr_init(unsigned long num_pages)
 	spin_lock_init(&shrinker_lock);
 	INIT_LIST_HEAD(&shrinker_list);
 
-	for (i = 0; i <= MAX_ORDER; ++i) {
+	for (i = 0; i < NR_PAGE_ORDERS; ++i) {
 		ttm_pool_type_init(&global_write_combined[i], NULL,
 				   ttm_write_combined, i);
 		ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i);
@@ -816,7 +816,7 @@ void ttm_pool_mgr_fini(void)
 {
 	unsigned int i;
 
-	for (i = 0; i <= MAX_ORDER; ++i) {
+	for (i = 0; i < NR_PAGE_ORDERS; ++i) {
 		ttm_pool_type_fini(&global_write_combined[i]);
 		ttm_pool_type_fini(&global_uncached[i]);
 
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index 30a347e5aa11..4490d43c63e3 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -74,7 +74,7 @@ struct ttm_pool {
 	bool use_dma32;
 
 	struct {
-		struct ttm_pool_type orders[MAX_ORDER + 1];
+		struct ttm_pool_type orders[NR_PAGE_ORDERS];
 	} caching[TTM_NUM_CACHING_TYPES];
 };
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c18c53353b50..1ea7636dfb76 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -35,6 +35,8 @@
 
 #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
 
+#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
  * costly to service.  That is between allocation orders which should
@@ -96,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt)
 }
 
 #define for_each_migratetype_order(order, type) \
-	for (order = 0; order <= MAX_ORDER; order++) \
+	for (order = 0; order < NR_PAGE_ORDERS; order++) \
 		for (type = 0; type < MIGRATE_TYPES; type++)
 
 extern int page_group_by_mobility_disabled;
@@ -933,7 +935,7 @@ struct zone {
 	CACHELINE_PADDING(_pad1_);
 
 	/* free areas of different sizes */
-	struct free_area	free_area[MAX_ORDER + 1];
+	struct free_area	free_area[NR_PAGE_ORDERS];
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
 	/* Pages to be accepted. All pages on the list are MAX_ORDER */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index d4313b53837e..56cf4ad7abbb 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -802,7 +802,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(list_head, prev);
 	VMCOREINFO_OFFSET(vmap_area, va_start);
 	VMCOREINFO_OFFSET(vmap_area, list);
-	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
+	VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
 	log_buf_vmcoreinfo_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/lib/test_meminit.c b/lib/test_meminit.c
index 0ae35223d773..0dc173849a54 100644
--- a/lib/test_meminit.c
+++ b/lib/test_meminit.c
@@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures)
 	int failures = 0, num_tests = 0;
 	int i;
 
-	for (i = 0; i <= MAX_ORDER; i++)
+	for (i = 0; i < NR_PAGE_ORDERS; i++)
 		num_tests += do_alloc_pages_order(i, &failures);
 
 	REPORT_FAILURES_IN_FN();
diff --git a/mm/compaction.c b/mm/compaction.c
index de15a2ef0af5..24f8eb4d6260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2229,7 +2229,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 
 	/* Direct compactor: Is a suitable page free? */
 	ret = COMPACT_NO_SUITABLE_PAGE;
-	for (order = cc->order; order <= MAX_ORDER; order++) {
+	for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
 		struct free_area *area = &cc->zone->free_area[order];
 		bool can_steal;
 
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index ffedf4dbc49d..103e2e88ea03 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void)
 struct metadata_page_pair {
 	struct page *shadow, *origin;
 };
-static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata;
+static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata;
 
 /*
  * Eager metadata allocation. When the memblock allocator is freeing pages to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5526797b7f96..ccecf6158ae4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1560,7 +1560,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	struct page *page;
 
 	/* Find a page of the appropriate size in the preferred list */
-	for (current_order = order; current_order <= MAX_ORDER; ++current_order) {
+	for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) {
 		area = &(zone->free_area[current_order]);
 		page = get_page_from_free_area(area, migratetype);
 		if (!page)
@@ -1934,7 +1934,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order <= MAX_ORDER; order++) {
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			struct free_area *area = &(zone->free_area[order]);
 
 			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
@@ -2044,8 +2044,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	return false;
 
 find_smallest:
-	for (current_order = order; current_order <= MAX_ORDER;
-							current_order++) {
+	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
 				start_migratetype, false, &can_steal);
@@ -3000,7 +2999,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 		return true;
 
 	/* For a high-order request, check at least one suitable page is free */
-	for (o = order; o <= MAX_ORDER; o++) {
+	for (o = order; o < NR_PAGE_ORDERS; o++) {
 		struct free_area *area = &z->free_area[o];
 		int mt;
 
@@ -6628,7 +6627,7 @@ bool is_free_buddy_page(struct page *page)
 	unsigned long pfn = page_to_pfn(page);
 	unsigned int order;
 
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 
 		if (PageBuddy(page_head) &&
@@ -6683,7 +6682,7 @@ bool take_page_off_buddy(struct page *page)
 	bool ret = false;
 
 	spin_lock_irqsave(&zone->lock, flags);
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		struct page *page_head = page - (pfn & ((1 << order) - 1));
 		int page_order = buddy_order(page_head);
 
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index b021f482a4cb..66369cc5279b 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 		return err;
 
 	/* Process each free list starting from lowest order/mt */
-	for (order = page_reporting_order; order <= MAX_ORDER; order++) {
+	for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) {
 		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 			/* We do not pull pages from the isolate free list */
 			if (is_migrate_isolate(mt))
diff --git a/mm/show_mem.c b/mm/show_mem.c
index ba0808d6917f..8dcfafbd283c 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -352,8 +352,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 
 	for_each_populated_zone(zone) {
 		unsigned int order;
-		unsigned long nr[MAX_ORDER + 1], flags, total = 0;
-		unsigned char types[MAX_ORDER + 1];
+		unsigned long nr[NR_PAGE_ORDERS], flags, total = 0;
+		unsigned char types[NR_PAGE_ORDERS];
 
 		if (zone_idx(zone) > max_zone_idx)
 			continue;
@@ -363,7 +363,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 		printk(KERN_CONT "%s: ", zone->name);
 
 		spin_lock_irqsave(&zone->lock, flags);
-		for (order = 0; order <= MAX_ORDER; order++) {
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			struct free_area *area = &zone->free_area[order];
 			int type;
 
@@ -377,7 +377,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
-		for (order = 0; order <= MAX_ORDER; order++) {
+		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			printk(KERN_CONT "%lu*%lukB ",
 			       nr[order], K(1UL) << order);
 			if (nr[order])
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1437ca2f28c5..03ead31c46a0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1059,7 +1059,7 @@ static void fill_contig_page_info(struct zone *zone,
 	info->free_blocks_total = 0;
 	info->free_blocks_suitable = 0;
 
-	for (order = 0; order <= MAX_ORDER; order++) {
+	for (order = 0; order < NR_PAGE_ORDERS; order++) {
 		unsigned long blocks;
 
 		/*
@@ -1476,7 +1476,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 	int order;
 
 	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-	for (order = 0; order <= MAX_ORDER; ++order)
+	for (order = 0; order < NR_PAGE_ORDERS; ++order)
 		/*
 		 * Access to nr_free is lockless as nr_free is used only for
 		 * printing purposes. Use data_race to avoid KCSAN warning.
@@ -1505,7 +1505,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
 					pgdat->node_id,
 					zone->name,
 					migratetype_names[mtype]);
-		for (order = 0; order <= MAX_ORDER; ++order) {
+		for (order = 0; order < NR_PAGE_ORDERS; ++order) {
 			unsigned long freecount = 0;
 			struct free_area *area;
 			struct list_head *curr;
@@ -1545,7 +1545,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
 
 	/* Print header */
 	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
-	for (order = 0; order <= MAX_ORDER; ++order)
+	for (order = 0; order < NR_PAGE_ORDERS; ++order)
 		seq_printf(m, "%6d ", order);
 	seq_putc(m, '\n');
 
@@ -2181,7 +2181,7 @@ static void unusable_show_print(struct seq_file *m,
 	seq_printf(m, "Node %d, zone %8s ",
 				pgdat->node_id,
 				zone->name);
-	for (order = 0; order <= MAX_ORDER; ++order) {
+	for (order = 0; order < NR_PAGE_ORDERS; ++order) {
 		fill_contig_page_info(zone, order, &info);
 		index = unusable_free_index(order, &info);
 		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
@@ -2233,7 +2233,7 @@ static void extfrag_show_print(struct seq_file *m,
 	seq_printf(m, "Node %d, zone %8s ",
 				pgdat->node_id,
 				zone->name);
-	for (order = 0; order <= MAX_ORDER; ++order) {
+	for (order = 0; order < NR_PAGE_ORDERS; ++order) {
 		fill_contig_page_info(zone, order, &info);
 		index = __fragmentation_index(order, &info);
 		seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
-- 
cgit v1.2.3


From 5e0a760b44417f7cadd79de2204d6247109558a0 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 28 Dec 2023 17:47:04 +0300
Subject: mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER

commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") has
changed the definition of MAX_ORDER to be inclusive.  This has caused
issues with code that was not yet upstream and depended on the previous
definition.

To draw attention to the altered meaning of the define, rename MAX_ORDER
to MAX_PAGE_ORDER.

Link: https://lkml.kernel.org/r/20231228144704.14033-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst  |  2 +-
 Documentation/admin-guide/kernel-parameters.txt | 24 ++++++++++++------------
 Documentation/networking/packet_mmap.rst        | 14 +++++++-------
 arch/arm/Kconfig                                |  2 +-
 arch/arm64/Kconfig                              | 20 ++++++++++----------
 arch/arm64/include/asm/sparsemem.h              |  2 +-
 arch/arm64/kvm/hyp/nvhe/page_alloc.c            |  3 ++-
 arch/arm64/mm/hugetlbpage.c                     |  2 +-
 arch/m68k/Kconfig.cpu                           |  2 +-
 arch/nios2/Kconfig                              |  2 +-
 arch/powerpc/Kconfig                            |  2 +-
 arch/powerpc/mm/book3s64/iommu_api.c            |  2 +-
 arch/powerpc/mm/hugetlbpage.c                   |  2 +-
 arch/powerpc/platforms/powernv/pci-ioda.c       |  2 +-
 arch/sh/mm/Kconfig                              |  2 +-
 arch/sparc/Kconfig                              |  2 +-
 arch/sparc/kernel/pci_sun4v.c                   |  2 +-
 arch/sparc/mm/tsb.c                             |  4 ++--
 arch/um/kernel/um_arch.c                        |  4 ++--
 arch/xtensa/Kconfig                             |  2 +-
 drivers/accel/qaic/qaic_data.c                  |  2 +-
 drivers/base/regmap/regmap-debugfs.c            |  8 ++++----
 drivers/block/floppy.c                          |  2 +-
 drivers/crypto/ccp/sev-dev.c                    |  2 +-
 drivers/crypto/hisilicon/sgl.c                  |  6 +++---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c    |  2 +-
 drivers/gpu/drm/i915/gem/selftests/huge_pages.c |  2 +-
 drivers/gpu/drm/ttm/tests/ttm_pool_test.c       |  8 ++++----
 drivers/gpu/drm/ttm/ttm_pool.c                  |  2 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h     |  2 +-
 drivers/iommu/dma-iommu.c                       |  2 +-
 drivers/irqchip/irq-gic-v3-its.c                |  4 ++--
 drivers/md/dm-bufio.c                           |  2 +-
 drivers/md/dm-crypt.c                           |  2 +-
 drivers/md/dm-flakey.c                          |  2 +-
 drivers/misc/genwqe/card_dev.c                  |  2 +-
 drivers/misc/genwqe/card_utils.c                |  4 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c |  2 +-
 drivers/net/ethernet/ibm/ibmvnic.h              |  4 ++--
 drivers/video/fbdev/hyperv_fb.c                 |  6 +++---
 drivers/video/fbdev/vermilion/vermilion.c       |  2 +-
 drivers/virtio/virtio_balloon.c                 |  2 +-
 drivers/virtio/virtio_mem.c                     |  8 ++++----
 fs/ramfs/file-nommu.c                           |  2 +-
 include/linux/hugetlb.h                         |  2 +-
 include/linux/mmzone.h                          | 14 +++++++-------
 include/linux/pageblock-flags.h                 |  4 ++--
 include/linux/slab.h                            |  4 ++--
 kernel/dma/pool.c                               |  6 +++---
 kernel/dma/swiotlb.c                            |  4 ++--
 kernel/events/ring_buffer.c                     | 10 +++++-----
 mm/Kconfig                                      |  6 +++---
 mm/compaction.c                                 |  4 ++--
 mm/debug_page_alloc.c                           |  2 +-
 mm/debug_vm_pgtable.c                           |  4 ++--
 mm/huge_memory.c                                |  2 +-
 mm/hugetlb.c                                    |  4 ++--
 mm/internal.h                                   |  2 +-
 mm/kmsan/init.c                                 |  6 +++---
 mm/memblock.c                                   |  7 ++++---
 mm/memory_hotplug.c                             |  9 +++++----
 mm/mm_init.c                                    | 22 +++++++++++-----------
 mm/page_alloc.c                                 | 24 ++++++++++++------------
 mm/page_isolation.c                             | 17 +++++++++--------
 mm/page_owner.c                                 |  6 +++---
 mm/page_reporting.c                             |  4 ++--
 mm/shuffle.h                                    |  2 +-
 mm/slab.c                                       |  2 +-
 mm/slub.c                                       |  4 ++--
 mm/vmscan.c                                     |  2 +-
 mm/vmstat.c                                     |  2 +-
 net/smc/smc_ib.c                                |  2 +-
 security/integrity/ima/ima_crypto.c             |  2 +-
 tools/perf/Documentation/perf-intel-pt.txt      |  2 +-
 tools/testing/memblock/linux/mmzone.h           |  6 +++---
 tools/testing/selftests/mm/thuge-gen.c          |  3 ++-
 76 files changed, 186 insertions(+), 181 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 3f8769e46b07..bced9e4b6e08 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -193,7 +193,7 @@ from this.
 --------------------------------
 
 Free areas descriptor. User-space tools use this value to iterate the
-free_area ranges. MAX_ORDER is used by the zone buddy allocator.
+free_area ranges. NR_PAGE_ORDERS is used by the zone buddy allocator.
 
 prb
 ---
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..8a01b8112f0b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -970,17 +970,17 @@
 			buddy allocator. Bigger value increase the probability
 			of catching random memory corruption, but reduce the
 			amount of memory for normal system use. The maximum
-			possible value is MAX_ORDER/2.  Setting this parameter
-			to 1 or 2 should be enough to identify most random
-			memory corruption problems caused by bugs in kernel or
-			driver code when a CPU writes to (or reads from) a
-			random memory location. Note that there exists a class
-			of memory corruptions problems caused by buggy H/W or
-			F/W or by drivers badly programming DMA (basically when
-			memory is written at bus level and the CPU MMU is
-			bypassed) which are not detectable by
-			CONFIG_DEBUG_PAGEALLOC, hence this option will not help
-			tracking down these problems.
+			possible value is MAX_PAGE_ORDER/2.  Setting this
+			parameter to 1 or 2 should be enough to identify most
+			random memory corruption problems caused by bugs in
+			kernel or driver code when a CPU writes to (or reads
+			from) a random memory location. Note that there exists
+			a class of memory corruptions problems caused by buggy
+			H/W or F/W or by drivers badly programming DMA
+			(basically when memory is written at bus level and the
+			CPU MMU is bypassed) which are not detectable by
+			CONFIG_DEBUG_PAGEALLOC, hence this option will not
+			help tracking down these problems.
 
 	debug_pagealloc=
 			[KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
@@ -4136,7 +4136,7 @@
 			[KNL] Minimal page reporting order
 			Format: <integer>
 			Adjust the minimal page reporting order. The page
-			reporting is disabled when it exceeds MAX_ORDER.
+			reporting is disabled when it exceeds MAX_PAGE_ORDER.
 
 	panic=		[KNL] Kernel behaviour on panic: delay <timeout>
 			timeout > 0: seconds before rebooting
diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst
index 30a3be3c48f3..dca15d15feaf 100644
--- a/Documentation/networking/packet_mmap.rst
+++ b/Documentation/networking/packet_mmap.rst
@@ -263,20 +263,20 @@ the name indicates, this function allocates pages of memory, and the second
 argument is "order" or a power of two number of pages, that is
 (for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes,
 order=2 ==> 16384 bytes, etc. The maximum size of a
-region allocated by __get_free_pages is determined by the MAX_ORDER macro. More
-precisely the limit can be calculated as::
+region allocated by __get_free_pages is determined by the MAX_PAGE_ORDER macro.
+More precisely the limit can be calculated as::
 
-   PAGE_SIZE << MAX_ORDER
+   PAGE_SIZE << MAX_PAGE_ORDER
 
    In a i386 architecture PAGE_SIZE is 4096 bytes
-   In a 2.4/i386 kernel MAX_ORDER is 10
-   In a 2.6/i386 kernel MAX_ORDER is 11
+   In a 2.4/i386 kernel MAX_PAGE_ORDER is 10
+   In a 2.6/i386 kernel MAX_PAGE_ORDER is 11
 
 So get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel
 respectively, with an i386 architecture.
 
 User space programs can include /usr/include/sys/user.h and
-/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_ORDER declarations.
+/usr/include/linux/mmzone.h to get PAGE_SIZE MAX_PAGE_ORDER declarations.
 
 The pagesize can also be determined dynamically with the getpagesize (2)
 system call.
@@ -324,7 +324,7 @@ Definitions:
 		(see /proc/slabinfo)
 <pointer size>  depends on the architecture -- ``sizeof(void *)``
 <page size>     depends on the architecture -- PAGE_SIZE or getpagesize (2)
-<max-order>     is the value defined with MAX_ORDER
+<max-order>     is the value defined with MAX_PAGE_ORDER
 <frame size>    it's an upper bound of frame's capture size (more on this later)
 ==============  ================================================================
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f8567e95f98b..b2ab8db63c4b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1362,7 +1362,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 12d611f3da5d..442539fd06fe 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1520,15 +1520,15 @@ config XEN
 
 # include/linux/mmzone.h requires the following to be true:
 #
-#   MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
+#   MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
 #
-# so the maximum value of MAX_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT:
+# so the maximum value of MAX_PAGE_ORDER is SECTION_SIZE_BITS - PAGE_SHIFT:
 #
-#     | SECTION_SIZE_BITS |  PAGE_SHIFT  |  max MAX_ORDER  |  default MAX_ORDER |
-# ----+-------------------+--------------+-----------------+--------------------+
-# 4K  |       27          |      12      |       15        |         10         |
-# 16K |       27          |      14      |       13        |         11         |
-# 64K |       29          |      16      |       13        |         13         |
+#     | SECTION_SIZE_BITS |  PAGE_SHIFT  |  max MAX_PAGE_ORDER  |  default MAX_PAGE_ORDER |
+# ----+-------------------+--------------+----------------------+-------------------------+
+# 4K  |       27          |      12      |       15             |         10              |
+# 16K |       27          |      14      |       13             |         11              |
+# 64K |       29          |      16      |       13             |         13              |
 config ARCH_FORCE_MAX_ORDER
 	int
 	default "13" if ARM64_64K_PAGES
@@ -1536,16 +1536,16 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
 	  large blocks of physically contiguous memory is required.
 
 	  The maximal size of allocation cannot exceed the size of the
-	  section, so the value of MAX_ORDER should satisfy
+	  section, so the value of MAX_PAGE_ORDER should satisfy
 
-	    MAX_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
+	    MAX_PAGE_ORDER + PAGE_SHIFT <= SECTION_SIZE_BITS
 
 	  Don't change if unsure.
 
diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h
index 5f5437621029..8a8acc220371 100644
--- a/arch/arm64/include/asm/sparsemem.h
+++ b/arch/arm64/include/asm/sparsemem.h
@@ -10,7 +10,7 @@
 /*
  * Section size must be at least 512MB for 64K base
  * page size config. Otherwise it will be less than
- * MAX_ORDER and the build process will fail.
+ * MAX_PAGE_ORDER and the build process will fail.
  */
 #ifdef CONFIG_ARM64_64K_PAGES
 #define SECTION_SIZE_BITS 29
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index b1e392186a0f..e691290d3765 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -228,7 +228,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
 	int i;
 
 	hyp_spin_lock_init(&pool->lock);
-	pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT));
+	pool->max_order = min(MAX_PAGE_ORDER,
+			      get_order(nr_pages << PAGE_SHIFT));
 	for (i = 0; i <= pool->max_order; i++)
 		INIT_LIST_HEAD(&pool->free_area[i]);
 	pool->range_start = phys;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f5aae342632c..8116ac599f80 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -51,7 +51,7 @@ void __init arm64_hugetlb_cma_reserve(void)
 	 * page allocator. Just warn if there is any change
 	 * breaking this assumption.
 	 */
-	WARN_ON(order <= MAX_ORDER);
+	WARN_ON(order <= MAX_PAGE_ORDER);
 	hugetlb_cma_reserve(order);
 }
 #endif /* CONFIG_CMA */
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index ad69b466a08b..9dcf245c9cbf 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -402,7 +402,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index d54464021a61..58d9565dc2c7 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -50,7 +50,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f11a62809f2..52d7e3fad553 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -915,7 +915,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index d19fb1f3007d..c0e8d597e4cb 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -97,7 +97,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	}
 
 	mmap_read_lock(mm);
-	chunk = (1UL << (PAGE_SHIFT + MAX_ORDER)) /
+	chunk = (1UL << (PAGE_SHIFT + MAX_PAGE_ORDER)) /
 			sizeof(struct vm_area_struct *);
 	chunk = min(chunk, entries);
 	for (entry = 0; entry < entries; entry += chunk) {
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f7c683b672c1..0a540b37aab6 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -615,7 +615,7 @@ void __init gigantic_hugetlb_cma_reserve(void)
 		order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
 
 	if (order) {
-		VM_WARN_ON(order <= MAX_ORDER);
+		VM_WARN_ON(order <= MAX_PAGE_ORDER);
 		hugetlb_cma_reserve(order);
 	}
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 28fac4770073..23f5b5093ec1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1389,7 +1389,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	 * DMA window can be larger than available memory, which will
 	 * cause errors later.
 	 */
-	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER);
+	const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER);
 
 	/*
 	 * We create the default window as big as we can. The constraint is
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 511c17aede4a..455311d9a5e9 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -26,7 +26,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE:_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 49849790e66d..204c43cb3d43 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -277,7 +277,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "12"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index c80b0a21d709..083e5f05a7f0 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -194,7 +194,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 
 	size = IO_PAGE_ALIGN(size);
 	order = get_order(size);
-	if (unlikely(order > MAX_ORDER))
+	if (unlikely(order > MAX_PAGE_ORDER))
 		return NULL;
 
 	npages = size >> IO_PAGE_SHIFT;
diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c
index 5e2931a18409..6acd8a4c1e2a 100644
--- a/arch/sparc/mm/tsb.c
+++ b/arch/sparc/mm/tsb.c
@@ -402,8 +402,8 @@ void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss)
 	unsigned long new_rss_limit;
 	gfp_t gfp_flags;
 
-	if (max_tsb_size > PAGE_SIZE << MAX_ORDER)
-		max_tsb_size = PAGE_SIZE << MAX_ORDER;
+	if (max_tsb_size > PAGE_SIZE << MAX_PAGE_ORDER)
+		max_tsb_size = PAGE_SIZE << MAX_PAGE_ORDER;
 
 	new_cache_index = 0;
 	for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) {
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index b1bfed0c8528..7a9820797eae 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -373,10 +373,10 @@ int __init linux_main(int argc, char **argv)
 	max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC;
 
 	/*
-	 * Zones have to begin on a 1 << MAX_ORDER page boundary,
+	 * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary,
 	 * so this makes sure that's true for highmem
 	 */
-	max_physmem &= ~((1 << (PAGE_SHIFT + MAX_ORDER)) - 1);
+	max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1);
 	if (physmem_size + iomem_size > max_physmem) {
 		highmem = physmem_size + iomem_size - max_physmem;
 		physmem_size -= highmem;
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 7d792077e5fd..e031eaf36c99 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -793,7 +793,7 @@ config ARCH_FORCE_MAX_ORDER
 	default "10"
 	help
 	  The kernel page allocator limits the size of maximal physically
-	  contiguous allocations. The limit is called MAX_ORDER and it
+	  contiguous allocations. The limit is called MAX_PAGE_ORDER and it
 	  defines the maximal power of two of number of pages that can be
 	  allocated as a single contiguous block. This option allows
 	  overriding the default setting when ability to allocate very
diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c
index 4a8e43a7a6a4..aaeb2c9c071a 100644
--- a/drivers/accel/qaic/qaic_data.c
+++ b/drivers/accel/qaic/qaic_data.c
@@ -451,7 +451,7 @@ static int create_sgt(struct qaic_device *qdev, struct sg_table **sgt_out, u64 s
 		 * later
 		 */
 		buf_extra = (PAGE_SIZE - size % PAGE_SIZE) % PAGE_SIZE;
-		max_order = min(MAX_ORDER - 1, get_order(size));
+		max_order = min(MAX_PAGE_ORDER - 1, get_order(size));
 	} else {
 		/* allocate a single page for book keeping */
 		nr_pages = 1;
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index bdd80b73c3e6..fb84cda92a75 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -226,8 +226,8 @@ static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from,
 	if (*ppos < 0 || !count)
 		return -EINVAL;
 
-	if (count > (PAGE_SIZE << MAX_ORDER))
-		count = PAGE_SIZE << MAX_ORDER;
+	if (count > (PAGE_SIZE << MAX_PAGE_ORDER))
+		count = PAGE_SIZE << MAX_PAGE_ORDER;
 
 	buf = kmalloc(count, GFP_KERNEL);
 	if (!buf)
@@ -373,8 +373,8 @@ static ssize_t regmap_reg_ranges_read_file(struct file *file,
 	if (*ppos < 0 || !count)
 		return -EINVAL;
 
-	if (count > (PAGE_SIZE << MAX_ORDER))
-		count = PAGE_SIZE << MAX_ORDER;
+	if (count > (PAGE_SIZE << MAX_PAGE_ORDER))
+		count = PAGE_SIZE << MAX_PAGE_ORDER;
 
 	buf = kmalloc(count, GFP_KERNEL);
 	if (!buf)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 11114a5d9e5c..d0e41d52d6a9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3079,7 +3079,7 @@ static void raw_cmd_free(struct floppy_raw_cmd **ptr)
 	}
 }
 
-#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT)
+#define MAX_LEN (1UL << MAX_PAGE_ORDER << PAGE_SHIFT)
 
 static int raw_cmd_copyin(int cmd, void __user *param,
 				 struct floppy_raw_cmd **rcmd)
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index fcaccd0b5a65..e4d3f45242f6 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -906,7 +906,7 @@ static int sev_ioctl_do_get_id2(struct sev_issue_cmd *argp)
 		/*
 		 * The length of the ID shouldn't be assumed by software since
 		 * it may change in the future.  The allocation size is limited
-		 * to 1 << (PAGE_SHIFT + MAX_ORDER) by the page allocator.
+		 * to 1 << (PAGE_SHIFT + MAX_PAGE_ORDER) by the page allocator.
 		 * If the allocation fails, simply return ENOMEM rather than
 		 * warning in the kernel log.
 		 */
diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index 3df7a256e919..5c1012d7ffa9 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -70,11 +70,11 @@ struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
 			 HISI_ACC_SGL_ALIGN_SIZE);
 
 	/*
-	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_ORDER,
+	 * the pool may allocate a block of memory of size PAGE_SIZE * 2^MAX_PAGE_ORDER,
 	 * block size may exceed 2^31 on ia64, so the max of block size is 2^31
 	 */
-	block_size = 1 << (PAGE_SHIFT + MAX_ORDER < 32 ?
-			   PAGE_SHIFT + MAX_ORDER : 31);
+	block_size = 1 << (PAGE_SHIFT + MAX_PAGE_ORDER < 32 ?
+			   PAGE_SHIFT + MAX_PAGE_ORDER : 31);
 	sgl_num_per_block = block_size / sgl_size;
 	block_num = count / sgl_num_per_block;
 	remain_sgl = count % sgl_num_per_block;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index 6bc26b4b06b8..ea7561ae6e13 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -36,7 +36,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj)
 	struct sg_table *st;
 	struct scatterlist *sg;
 	unsigned int npages; /* restricted by sg_alloc_table */
-	int max_order = MAX_ORDER;
+	int max_order = MAX_PAGE_ORDER;
 	unsigned int max_segment;
 	gfp_t gfp;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
index 6b9f6cf50bf6..84c50c4c4af7 100644
--- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -115,7 +115,7 @@ static int get_huge_pages(struct drm_i915_gem_object *obj)
 		do {
 			struct page *page;
 
-			GEM_BUG_ON(order > MAX_ORDER);
+			GEM_BUG_ON(order > MAX_PAGE_ORDER);
 			page = alloc_pages(GFP | __GFP_ZERO, order);
 			if (!page)
 				goto err;
diff --git a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
index 2d9cae8cd984..cceaa18d4e46 100644
--- a/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
+++ b/drivers/gpu/drm/ttm/tests/ttm_pool_test.c
@@ -109,7 +109,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = {
 	},
 	{
 		.description = "Above the allocation limit",
-		.order = MAX_ORDER + 1,
+		.order = MAX_PAGE_ORDER + 1,
 	},
 	{
 		.description = "One page, with coherent DMA mappings enabled",
@@ -118,7 +118,7 @@ static const struct ttm_pool_test_case ttm_pool_basic_cases[] = {
 	},
 	{
 		.description = "Above the allocation limit, with coherent DMA mappings enabled",
-		.order = MAX_ORDER + 1,
+		.order = MAX_PAGE_ORDER + 1,
 		.use_dma_alloc = true,
 	},
 };
@@ -165,7 +165,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
 	fst_page = tt->pages[0];
 	last_page = tt->pages[tt->num_pages - 1];
 
-	if (params->order <= MAX_ORDER) {
+	if (params->order <= MAX_PAGE_ORDER) {
 		if (params->use_dma_alloc) {
 			KUNIT_ASSERT_NOT_NULL(test, (void *)fst_page->private);
 			KUNIT_ASSERT_NOT_NULL(test, (void *)last_page->private);
@@ -182,7 +182,7 @@ static void ttm_pool_alloc_basic(struct kunit *test)
 			 * order 0 blocks
 			 */
 			KUNIT_ASSERT_EQ(test, fst_page->private,
-					min_t(unsigned int, MAX_ORDER,
+					min_t(unsigned int, MAX_PAGE_ORDER,
 					      params->order));
 			KUNIT_ASSERT_EQ(test, last_page->private, 0);
 		}
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index d183bb97c526..b62f420a9f96 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -447,7 +447,7 @@ int ttm_pool_alloc(struct ttm_pool *pool, struct ttm_tt *tt,
 	else
 		gfp_flags |= GFP_HIGHUSER;
 
-	for (order = min_t(unsigned int, MAX_ORDER, __fls(num_pages));
+	for (order = min_t(unsigned int, MAX_PAGE_ORDER, __fls(num_pages));
 	     num_pages;
 	     order = min_t(unsigned int, order, __fls(num_pages))) {
 		struct ttm_pool_type *pt;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 961205ba86d2..925ac6a47bce 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -188,7 +188,7 @@
 #ifdef CONFIG_CMA_ALIGNMENT
 #define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + CONFIG_CMA_ALIGNMENT)
 #else
-#define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + MAX_ORDER)
+#define Q_MAX_SZ_SHIFT			(PAGE_SHIFT + MAX_PAGE_ORDER)
 #endif
 
 /*
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 85163a83df2f..e59f50e11ea8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -884,7 +884,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev,
 	struct page **pages;
 	unsigned int i = 0, nid = dev_to_node(dev);
 
-	order_mask &= GENMASK(MAX_ORDER, 0);
+	order_mask &= GENMASK(MAX_PAGE_ORDER, 0);
 	if (!order_mask)
 		return NULL;
 
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 9a7a74239eab..d097001c1e3e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2465,8 +2465,8 @@ static bool its_parse_indirect_baser(struct its_node *its,
 	 * feature is not supported by hardware.
 	 */
 	new_order = max_t(u32, get_order(esz << ids), new_order);
-	if (new_order > MAX_ORDER) {
-		new_order = MAX_ORDER;
+	if (new_order > MAX_PAGE_ORDER) {
+		new_order = MAX_PAGE_ORDER;
 		ids = ilog2(PAGE_ORDER_TO_SIZE(new_order) / (int)esz);
 		pr_warn("ITS@%pa: %s Table too large, reduce ids %llu->%u\n",
 			&its->phys_base, its_base_type_string[type],
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index f03d7dba270c..13c65b7e1ed6 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1170,7 +1170,7 @@ static void __cache_size_refresh(void)
  * If the allocation may fail we use __get_free_pages. Memory fragmentation
  * won't have a fatal effect here, but it just causes flushes of some other
  * buffers and more I/O will be performed. Don't use __get_free_pages if it
- * always fails (i.e. order > MAX_ORDER).
+ * always fails (i.e. order > MAX_PAGE_ORDER).
  *
  * If the allocation shouldn't fail we use __vmalloc. This is only for the
  * initial reserve allocation, so there's no risk of wasting all vmalloc
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2ae8560b6a14..855b482cbff1 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size)
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
 	unsigned int remaining_size;
-	unsigned int order = MAX_ORDER;
+	unsigned int order = MAX_PAGE_ORDER;
 
 retry:
 	if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM))
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index f57fb821528d..7916ed9f10e8 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -434,7 +434,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
 
 	remaining_size = size;
 
-	order = MAX_ORDER;
+	order = MAX_PAGE_ORDER;
 	while (remaining_size) {
 		struct page *pages;
 		unsigned size_to_add, to_copy;
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 55fc5b80e649..4441aca2280a 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -443,7 +443,7 @@ static int genwqe_mmap(struct file *filp, struct vm_area_struct *vma)
 	if (vsize == 0)
 		return -EINVAL;
 
-	if (get_order(vsize) > MAX_ORDER)
+	if (get_order(vsize) > MAX_PAGE_ORDER)
 		return -ENOMEM;
 
 	dma_map = kzalloc(sizeof(struct dma_mapping), GFP_KERNEL);
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 1c798d6b2dfb..a2c4a9b4f871 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -210,7 +210,7 @@ u32 genwqe_crc32(u8 *buff, size_t len, u32 init)
 void *__genwqe_alloc_consistent(struct genwqe_dev *cd, size_t size,
 			       dma_addr_t *dma_handle)
 {
-	if (get_order(size) > MAX_ORDER)
+	if (get_order(size) > MAX_PAGE_ORDER)
 		return NULL;
 
 	return dma_alloc_coherent(&cd->pci_dev->dev, size, dma_handle,
@@ -308,7 +308,7 @@ int genwqe_alloc_sync_sgl(struct genwqe_dev *cd, struct genwqe_sgl *sgl,
 	sgl->write = write;
 	sgl->sgl_size = genwqe_sgl_size(sgl->nr_pages);
 
-	if (get_order(sgl->sgl_size) > MAX_ORDER) {
+	if (get_order(sgl->sgl_size) > MAX_PAGE_ORDER) {
 		dev_err(&pci_dev->dev,
 			"[%s] err: too much memory requested!\n", __func__);
 		return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index b618797a7e8d..f1695c889d3a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1041,7 +1041,7 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 		return;
 
 	order = get_order(alloc_size);
-	if (order > MAX_ORDER) {
+	if (order > MAX_PAGE_ORDER) {
 		if (net_ratelimit())
 			dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n");
 		return;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 4e18b4cefa97..94ac36b1408b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -48,7 +48,7 @@
  * of 4096 jumbo frames (MTU=9000) we will need about 9K*4K = 36MB plus
  * some padding.
  *
- * But the size of a single DMA region is limited by MAX_ORDER in the
+ * But the size of a single DMA region is limited by MAX_PAGE_ORDER in the
  * kernel (about 16MB currently).  To support say 4K Jumbo frames, we
  * use a set of LTBs (struct ltb_set) per pool.
  *
@@ -75,7 +75,7 @@
  * pool for the 4MB. Thus the 16 Rx and Tx queues require 32 * 5 = 160
  * plus 16 for the TSO pools for a total of 176 LTB mappings per VNIC.
  */
-#define IBMVNIC_ONE_LTB_MAX	((u32)((1 << MAX_ORDER) * PAGE_SIZE))
+#define IBMVNIC_ONE_LTB_MAX	((u32)((1 << MAX_PAGE_ORDER) * PAGE_SIZE))
 #define IBMVNIC_ONE_LTB_SIZE	min((u32)(8 << 20), IBMVNIC_ONE_LTB_MAX)
 #define IBMVNIC_LTB_SET_SIZE	(38 << 20)
 
diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c
index a80939fe2ee6..6a29d2594b91 100644
--- a/drivers/video/fbdev/hyperv_fb.c
+++ b/drivers/video/fbdev/hyperv_fb.c
@@ -927,8 +927,8 @@ static phys_addr_t hvfb_get_phymem(struct hv_device *hdev,
 	if (request_size == 0)
 		return -1;
 
-	if (order <= MAX_ORDER) {
-		/* Call alloc_pages if the size is less than 2^MAX_ORDER */
+	if (order <= MAX_PAGE_ORDER) {
+		/* Call alloc_pages if the size is less than 2^MAX_PAGE_ORDER */
 		page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
 		if (!page)
 			return -1;
@@ -958,7 +958,7 @@ static void hvfb_release_phymem(struct hv_device *hdev,
 {
 	unsigned int order = get_order(size);
 
-	if (order <= MAX_ORDER)
+	if (order <= MAX_PAGE_ORDER)
 		__free_pages(pfn_to_page(paddr >> PAGE_SHIFT), order);
 	else
 		dma_free_coherent(&hdev->device,
diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c
index 840ead69654b..a32e5b2924c9 100644
--- a/drivers/video/fbdev/vermilion/vermilion.c
+++ b/drivers/video/fbdev/vermilion/vermilion.c
@@ -197,7 +197,7 @@ static int vmlfb_alloc_vram(struct vml_info *vinfo,
 		va = &vinfo->vram[i];
 		order = 0;
 
-		while (requested > (PAGE_SIZE << order) && order <= MAX_ORDER)
+		while (requested > (PAGE_SIZE << order) && order <= MAX_PAGE_ORDER)
 			order++;
 
 		err = vmlfb_alloc_vram_area(va, order, 0);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1fe93e93f5bc..59cdc0292dce 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -33,7 +33,7 @@
 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \
 					     __GFP_NOMEMALLOC)
 /* The order of free page blocks to report to host */
-#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER
+#define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_PAGE_ORDER
 /* The size of a free page block in bytes */
 #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \
 	(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index fa5226c198cc..8e3223294442 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1154,13 +1154,13 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
  */
 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
 {
-	unsigned long order = MAX_ORDER;
+	unsigned long order = MAX_PAGE_ORDER;
 	unsigned long i;
 
 	/*
 	 * We might get called for ranges that don't cover properly aligned
-	 * MAX_ORDER pages; however, we can only online properly aligned
-	 * pages with an order of MAX_ORDER at maximum.
+	 * MAX_PAGE_ORDER pages; however, we can only online properly aligned
+	 * pages with an order of MAX_PAGE_ORDER at maximum.
 	 */
 	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
 		order--;
@@ -1280,7 +1280,7 @@ static void virtio_mem_online_page(struct virtio_mem *vm,
 	bool do_online;
 
 	/*
-	 * We can get called with any order up to MAX_ORDER. If our subblock
+	 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock
 	 * size is smaller than that and we have a mixture of plugged and
 	 * unplugged subblocks within such a page, we have to process in
 	 * smaller granularity. In that case we'll adjust the order exactly once
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index efb1b4c1a0a4..7a6d980e614d 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 
 	/* make various checks */
 	order = get_order(newsize);
-	if (unlikely(order > MAX_ORDER))
+	if (unlikely(order > MAX_PAGE_ORDER))
 		return -EFBIG;
 
 	ret = inode_newsize_ok(inode, newsize);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 236ec7b63c54..c1ee640d87b1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
 
 static inline bool hstate_is_gigantic(struct hstate *h)
 {
-	return huge_page_order(h) > MAX_ORDER;
+	return huge_page_order(h) > MAX_PAGE_ORDER;
 }
 
 static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1ea7636dfb76..4ed33b127821 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -27,15 +27,15 @@
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 10
+#define MAX_PAGE_ORDER 10
 #else
-#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
+#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
 #endif
-#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)
 
 #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
 
-#define NR_PAGE_ORDERS (MAX_ORDER + 1)
+#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
 
 /*
  * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
@@ -938,7 +938,7 @@ struct zone {
 	struct free_area	free_area[NR_PAGE_ORDERS];
 
 #ifdef CONFIG_UNACCEPTED_MEMORY
-	/* Pages to be accepted. All pages on the list are MAX_ORDER */
+	/* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
 	struct list_head	unaccepted_pages;
 #endif
 
@@ -1748,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
 #define SECTION_BLOCKFLAGS_BITS \
 	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
 
-#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
-#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
 #endif
 
 static inline unsigned long pfn_to_section_nr(unsigned long pfn)
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e83c4c095041..3f2409b968ec 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
+#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #else /* CONFIG_HUGETLB_PAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order		MAX_ORDER
+#define pageblock_order		MAX_PAGE_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/include/linux/slab.h b/include/linux/slab.h
index d6d6ffeeb9a2..d63823e518c0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -308,7 +308,7 @@ static inline unsigned int arch_slab_minalign(void)
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_PAGE_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
@@ -316,7 +316,7 @@ static inline unsigned int arch_slab_minalign(void)
 
 #ifdef CONFIG_SLUB
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_PAGE_ORDER + PAGE_SHIFT)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index b481c48a31a6..d10613eb0f63 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
 	void *addr;
 	int ret = -ENOMEM;
 
-	/* Cannot allocate larger than MAX_ORDER */
-	order = min(get_order(pool_size), MAX_ORDER);
+	/* Cannot allocate larger than MAX_PAGE_ORDER */
+	order = min(get_order(pool_size), MAX_PAGE_ORDER);
 
 	do {
 		pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
 
 	/*
 	 * If coherent_pool was not used on the command line, default the pool
-	 * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
+	 * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
 	 */
 	if (!atomic_pool_size) {
 		unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 33d942615be5..176078bf2215 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -686,8 +686,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
 	size_t pool_size;
 	size_t tlb_size;
 
-	if (nslabs > SLABS_PER_PAGE << MAX_ORDER) {
-		nslabs = SLABS_PER_PAGE << MAX_ORDER;
+	if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+		nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
 		nareas = limit_nareas(nareas, nslabs);
 	}
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8d82c2f07d0..60ed43d1c29e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
 {
 	struct page *page;
 
-	if (order > MAX_ORDER)
-		order = MAX_ORDER;
+	if (order > MAX_PAGE_ORDER)
+		order = MAX_PAGE_ORDER;
 
 	do {
 		page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -702,9 +702,9 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 
 	/*
 	 * kcalloc_node() is unable to allocate buffer if the size is larger
-	 * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+	 * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
 	 */
-	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+	if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
 		return -ENOMEM;
 	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
 				     node);
@@ -821,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 	size = sizeof(struct perf_buffer);
 	size += nr_pages * sizeof(void *);
 
-	if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
+	if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
 		goto fail;
 
 	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/mm/Kconfig b/mm/Kconfig
index 79d563d8f9e0..cb9d470f0bf7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -381,7 +381,7 @@ config SHUFFLE_PAGE_ALLOCATOR
 	  the presence of a memory-side-cache. There are also incidental
 	  security benefits as it reduces the predictability of page
 	  allocations to compliment SLAB_FREELIST_RANDOM, but the
-	  default granularity of shuffling on the MAX_ORDER i.e, 10th
+	  default granularity of shuffling on the MAX_PAGE_ORDER i.e, 10th
 	  order of pages is selected based on cache utilization benefits
 	  on x86.
 
@@ -713,8 +713,8 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 	  HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
 	  on a platform.
 
-	  Note that the pageblock_order cannot exceed MAX_ORDER and will be
-	  clamped down to MAX_ORDER.
+	  Note that the pageblock_order cannot exceed MAX_PAGE_ORDER and will be
+	  clamped down to MAX_PAGE_ORDER.
 
 config CONTIG_ALLOC
 	def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
diff --git a/mm/compaction.c b/mm/compaction.c
index 24f8eb4d6260..27ada42924d5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -999,7 +999,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			 * a valid page order. Consider only values in the
 			 * valid order range to prevent low_pfn overflow.
 			 */
-			if (freepage_order > 0 && freepage_order <= MAX_ORDER) {
+			if (freepage_order > 0 && freepage_order <= MAX_PAGE_ORDER) {
 				low_pfn += (1UL << freepage_order) - 1;
 				nr_scanned += (1UL << freepage_order) - 1;
 			}
@@ -1017,7 +1017,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (PageCompound(page) && !cc->alloc_contig) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order <= MAX_ORDER)) {
+			if (likely(order <= MAX_PAGE_ORDER)) {
 				low_pfn += (1UL << order) - 1;
 				nr_scanned += (1UL << order) - 1;
 			}
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index f9d145730fd1..6755f0c9d4a3 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -22,7 +22,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
 {
 	unsigned long res;
 
-	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
+	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_PAGE_ORDER / 2) {
 		pr_err("Bad debug_guardpage_minorder value\n");
 		return 0;
 	}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index e651500e597a..5662e29fe253 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1091,7 +1091,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	struct page *page = NULL;
 
 #ifdef CONFIG_CONTIG_ALLOC
-	if (order > MAX_ORDER) {
+	if (order > MAX_PAGE_ORDER) {
 		page = alloc_contig_pages((1 << order), GFP_KERNEL,
 					  first_online_node, NULL);
 		if (page) {
@@ -1101,7 +1101,7 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
 	}
 #endif
 
-	if (order <= MAX_ORDER)
+	if (order <= MAX_PAGE_ORDER)
 		page = alloc_pages(GFP_KERNEL, order);
 
 	return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a588e29d287..b9a7a57691d7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -682,7 +682,7 @@ static int __init hugepage_init(void)
 	/*
 	 * hugepages can't be allocated by the buddy allocator
 	 */
-	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER);
+	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);
 	/*
 	 * we use page->mapping and page->index in second tail page
 	 * as list_head: assuming THP order >= 2
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 378e460a6ab4..0d262784ce60 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3410,7 +3410,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
 
 /*
  * Put bootmem huge pages into the standard lists after mem_map is up.
- * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
  */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -4790,7 +4790,7 @@ static int __init default_hugepagesz_setup(char *s)
 	 * The number of default huge pages (for this size) could have been
 	 * specified as the first hugetlb parameter: hugepages=X.  If so,
 	 * then default_hstate_max_huge_pages is set.  If the default huge
-	 * page size is gigantic (> MAX_ORDER), then the pages must be
+	 * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be
 	 * allocated here from bootmem allocator.
 	 */
 	if (default_hstate_max_huge_pages) {
diff --git a/mm/internal.h b/mm/internal.h
index ac40c3d00336..f309a010d50f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -335,7 +335,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
  * satisfies the following equation:
  *     P = B & ~(1 << O)
  *
- * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
+ * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
  */
 static inline unsigned long
 __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
index 103e2e88ea03..3ac3b8921d36 100644
--- a/mm/kmsan/init.c
+++ b/mm/kmsan/init.c
@@ -141,7 +141,7 @@ struct smallstack {
 
 static struct smallstack collect = {
 	.index = 0,
-	.order = MAX_ORDER,
+	.order = MAX_PAGE_ORDER,
 };
 
 static void smallstack_push(struct smallstack *stack, struct page *pages)
@@ -211,8 +211,8 @@ static void kmsan_memblock_discard(void)
 	 *    order=N-1,
 	 *  - repeat.
 	 */
-	collect.order = MAX_ORDER;
-	for (int i = MAX_ORDER; i >= 0; i--) {
+	collect.order = MAX_PAGE_ORDER;
+	for (int i = MAX_PAGE_ORDER; i >= 0; i--) {
 		if (held_back[i].shadow)
 			smallstack_push(&collect, held_back[i].shadow);
 		if (held_back[i].origin)
diff --git a/mm/memblock.c b/mm/memblock.c
index 4a62f7774b65..8c194d8afeec 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2113,12 +2113,13 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 		 * Free the pages in the largest chunks alignment allows.
 		 *
 		 * __ffs() behaviour is undefined for 0. start == 0 is
-		 * MAX_ORDER-aligned, set order to MAX_ORDER for the case.
+		 * MAX_PAGE_ORDER-aligned, set order to MAX_PAGE_ORDER for
+		 * the case.
 		 */
 		if (start)
-			order = min_t(int, MAX_ORDER, __ffs(start));
+			order = min_t(int, MAX_PAGE_ORDER, __ffs(start));
 		else
-			order = MAX_ORDER;
+			order = MAX_PAGE_ORDER;
 
 		while (start + (1UL << order) > end)
 			order--;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 926e1cfb10e9..b3c0ff52bb72 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -645,7 +645,7 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	unsigned long pfn;
 
 	/*
-	 * Online the pages in MAX_ORDER aligned chunks. The callback might
+	 * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might
 	 * decide to not expose all pages to the buddy (e.g., expose them
 	 * later). We account all pages as being online and belonging to this
 	 * zone ("present").
@@ -660,12 +660,13 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 		 * Free to online pages in the largest chunks alignment allows.
 		 *
 		 * __ffs() behaviour is undefined for 0. start == 0 is
-		 * MAX_ORDER-aligned, Set order to MAX_ORDER for the case.
+		 * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for
+		 * the case.
 		 */
 		if (pfn)
-			order = min_t(int, MAX_ORDER, __ffs(pfn));
+			order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn));
 		else
-			order = MAX_ORDER;
+			order = MAX_PAGE_ORDER;
 
 		(*online_page_callback)(pfn_to_page(pfn), order);
 		pfn += (1UL << order);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2830eef2b16c..89dc29f1e6c6 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1455,7 +1455,7 @@ static inline void setup_usemap(struct zone *zone) {}
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
 void __init set_pageblock_order(void)
 {
-	unsigned int order = MAX_ORDER;
+	unsigned int order = MAX_PAGE_ORDER;
 
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
@@ -1638,7 +1638,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
 	offset = pgdat->node_start_pfn - start;
 	/*
-	 * The zone's endpoints aren't required to be MAX_ORDER
+		 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
 	 * aligned but the node_mem_map endpoints must be in order
 	 * for the buddy allocator to function correctly.
 	 */
@@ -1964,11 +1964,11 @@ static void __init deferred_free_range(unsigned long pfn,
 	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
 		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
 			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
-		__free_pages_core(page, MAX_ORDER);
+		__free_pages_core(page, MAX_PAGE_ORDER);
 		return;
 	}
 
-	/* Accept chunks smaller than MAX_ORDER upfront */
+	/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
 	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
 
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
@@ -1991,8 +1991,8 @@ static inline void __init pgdat_init_report_one_done(void)
 /*
  * Returns true if page needs to be initialized or freed to buddy allocator.
  *
- * We check if a current MAX_ORDER block is valid by only checking the validity
- * of the head pfn.
+ * We check if a current MAX_PAGE_ORDER block is valid by only checking the
+ * validity of the head pfn.
  */
 static inline bool __init deferred_pfn_valid(unsigned long pfn)
 {
@@ -2149,8 +2149,8 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
 
 	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so that we
-	 * can avoid introducing any issues with the buddy allocator.
+	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
+	 * we can avoid introducing any issues with the buddy allocator.
 	 */
 	while (spfn < end_pfn) {
 		deferred_init_maxorder(&i, zone, &spfn, &epfn);
@@ -2291,7 +2291,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
 	}
 
 	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so
+	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
 	 * that we can avoid introducing any issues with the buddy
 	 * allocator.
 	 */
@@ -2509,7 +2509,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 			else
 				table = memblock_alloc_raw(size,
 							   SMP_CACHE_BYTES);
-		} else if (get_order(size) > MAX_ORDER || hashdist) {
+		} else if (get_order(size) > MAX_PAGE_ORDER || hashdist) {
 			table = vmalloc_huge(size, gfp_flags);
 			virt = true;
 			if (table)
@@ -2756,7 +2756,7 @@ void __init mm_core_init(void)
 
 	/*
 	 * page_ext requires contiguous pages,
-	 * bigger than MAX_ORDER unless SPARSEMEM.
+	 * bigger than MAX_PAGE_ORDER unless SPARSEMEM.
 	 */
 	page_ext_init_flatmem();
 	mem_debugging_and_hardening_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ccecf6158ae4..a01baf0454f8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -727,7 +727,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
 	unsigned long higher_page_pfn;
 	struct page *higher_page;
 
-	if (order >= MAX_ORDER - 1)
+	if (order >= MAX_PAGE_ORDER - 1)
 		return false;
 
 	higher_page_pfn = buddy_pfn & pfn;
@@ -782,7 +782,7 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
-	while (order < MAX_ORDER) {
+	while (order < MAX_PAGE_ORDER) {
 		if (compaction_capture(capc, page, order, migratetype)) {
 			__mod_zone_freepage_state(zone, -(1 << order),
 								migratetype);
@@ -1297,7 +1297,7 @@ void __free_pages_core(struct page *page, unsigned int order)
 	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
 
 	if (page_contains_unaccepted(page, order)) {
-		if (order == MAX_ORDER && __free_unaccepted(page))
+		if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
 			return;
 
 		accept_page(page, order);
@@ -1327,7 +1327,7 @@ void __free_pages_core(struct page *page, unsigned int order)
  *
  * Note: the function may return non-NULL struct page even for a page block
  * which contains a memory hole (i.e. there is no physical memory for a subset
- * of the pfn range). For example, if the pageblock order is MAX_ORDER, which
+ * of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, which
  * will fall into 2 sub-sections, and the end pfn of the pageblock may be hole
  * even though the start pfn is online and valid. This should be safe most of
  * the time because struct pages are still initialized via init_unavailable_range()
@@ -2018,7 +2018,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 	 * approximates finding the pageblock with the most free pages, which
 	 * would be too costly to do exactly.
 	 */
-	for (current_order = MAX_ORDER; current_order >= min_order;
+	for (current_order = MAX_PAGE_ORDER; current_order >= min_order;
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2056,7 +2056,7 @@ find_smallest:
 	 * This should not happen - we already found a suitable fallback
 	 * when looking for the largest page.
 	 */
-	VM_BUG_ON(current_order > MAX_ORDER);
+	VM_BUG_ON(current_order > MAX_PAGE_ORDER);
 
 do_steal:
 	page = get_page_from_free_area(area, fallback_mt);
@@ -4533,7 +4533,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 	 * There are several places where we assume that the order value is sane
 	 * so bail out early if the request is out of bound.
 	 */
-	if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp))
+	if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
 		return NULL;
 
 	gfp &= gfp_allowed_mask;
@@ -4815,7 +4815,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
  * minimum number of pages to satisfy the request.  alloc_pages() can only
  * allocate memory in power-of-two pages.
  *
- * This function is also limited by MAX_ORDER.
+ * This function is also limited by MAX_PAGE_ORDER.
  *
  * Memory allocated by this function must be released by free_pages_exact().
  *
@@ -6373,7 +6373,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	order = 0;
 	outer_start = start;
 	while (!PageBuddy(pfn_to_page(outer_start))) {
-		if (++order > MAX_ORDER) {
+		if (++order > MAX_PAGE_ORDER) {
 			outer_start = start;
 			break;
 		}
@@ -6635,7 +6635,7 @@ bool is_free_buddy_page(struct page *page)
 			break;
 	}
 
-	return order <= MAX_ORDER;
+	return order <= MAX_PAGE_ORDER;
 }
 EXPORT_SYMBOL(is_free_buddy_page);
 
@@ -6807,9 +6807,9 @@ static bool try_to_accept_memory_one(struct zone *zone)
 	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-	accept_page(page, MAX_ORDER);
+	accept_page(page, MAX_PAGE_ORDER);
 
-	__free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL);
+	__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
 
 	if (last)
 		static_branch_dec(&zones_with_unaccepted_pages);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index bcf99ba747a0..cd0ea3668253 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -226,7 +226,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
 	 */
 	if (PageBuddy(page)) {
 		order = buddy_order(page);
-		if (order >= pageblock_order && order < MAX_ORDER) {
+		if (order >= pageblock_order && order < MAX_PAGE_ORDER) {
 			buddy = find_buddy_page_pfn(page, page_to_pfn(page),
 						    order, NULL);
 			if (buddy && !is_migrate_isolate_page(buddy)) {
@@ -290,11 +290,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  *			isolate_single_pageblock()
  * @migratetype:	migrate type to set in error recovery.
  *
- * Free and in-use pages can be as big as MAX_ORDER and contain more than one
+ * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one
  * pageblock. When not all pageblocks within a page are isolated at the same
  * time, free page accounting can go wrong. For example, in the case of
- * MAX_ORDER = pageblock_order + 1, a MAX_ORDER page has two pagelbocks.
- * [         MAX_ORDER           ]
+ * MAX_PAGE_ORDER = pageblock_order + 1, a MAX_PAGE_ORDER page has two
+ * pagelbocks.
+ * [      MAX_PAGE_ORDER         ]
  * [  pageblock0  |  pageblock1  ]
  * When either pageblock is isolated, if it is a free page, the page is not
  * split into separate migratetype lists, which is supposed to; if it is an
@@ -451,7 +452,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				 * the free page to the right migratetype list.
 				 *
 				 * head_pfn is not used here as a hugetlb page order
-				 * can be bigger than MAX_ORDER, but after it is
+				 * can be bigger than MAX_PAGE_ORDER, but after it is
 				 * freed, the free page order is not. Use pfn within
 				 * the range to find the head of the free page.
 				 */
@@ -459,7 +460,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				outer_pfn = pfn;
 				while (!PageBuddy(pfn_to_page(outer_pfn))) {
 					/* stop if we cannot find the free page */
-					if (++order > MAX_ORDER)
+					if (++order > MAX_PAGE_ORDER)
 						goto failed;
 					outer_pfn &= ~0UL << order;
 				}
@@ -660,8 +661,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 	int ret;
 
 	/*
-	 * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
-	 * are not aligned to pageblock_nr_pages.
+	 * Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
+	 * pages are not aligned to pageblock_nr_pages.
 	 * Then we just check migratetype first.
 	 */
 	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 040dbf26a986..5634e5d890f8 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -320,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 				unsigned long freepage_order;
 
 				freepage_order = buddy_order_unsafe(page);
-				if (freepage_order <= MAX_ORDER)
+				if (freepage_order <= MAX_PAGE_ORDER)
 					pfn += (1UL << freepage_order) - 1;
 				continue;
 			}
@@ -555,7 +555,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 		if (PageBuddy(page)) {
 			unsigned long freepage_order = buddy_order_unsafe(page);
 
-			if (freepage_order <= MAX_ORDER)
+			if (freepage_order <= MAX_PAGE_ORDER)
 				pfn += (1UL << freepage_order) - 1;
 			continue;
 		}
@@ -663,7 +663,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 			if (PageBuddy(page)) {
 				unsigned long order = buddy_order_unsafe(page);
 
-				if (order > 0 && order <= MAX_ORDER)
+				if (order > 0 && order <= MAX_PAGE_ORDER)
 					pfn += (1UL << order) - 1;
 				continue;
 			}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 66369cc5279b..e4c428e61d8c 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -20,7 +20,7 @@ static int page_order_update_notify(const char *val, const struct kernel_param *
 	 * If param is set beyond this limit, order is set to default
 	 * pageblock_order value
 	 */
-	return  param_set_uint_minmax(val, kp, 0, MAX_ORDER);
+	return  param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER);
 }
 
 static const struct kernel_param_ops page_reporting_param_ops = {
@@ -370,7 +370,7 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
 	 */
 
 	if (page_reporting_order == -1) {
-		if (prdev->order > 0 && prdev->order <= MAX_ORDER)
+		if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
 			page_reporting_order = prdev->order;
 		else
 			page_reporting_order = pageblock_order;
diff --git a/mm/shuffle.h b/mm/shuffle.h
index a6bdf54f96f1..61bbcddeeee6 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,7 +4,7 @@
 #define _MM_SHUFFLE_H
 #include <linux/jump_label.h>
 
-#define SHUFFLE_ORDER MAX_ORDER
+#define SHUFFLE_ORDER MAX_PAGE_ORDER
 
 #ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
 DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
diff --git a/mm/slab.c b/mm/slab.c
index 773c79e153f3..073cae923d56 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -465,7 +465,7 @@ static int __init slab_max_order_setup(char *str)
 {
 	get_option(&str, &slab_max_order);
 	slab_max_order = slab_max_order < 0 ? 0 :
-				min(slab_max_order, MAX_ORDER);
+				min(slab_max_order, MAX_PAGE_ORDER);
 	slab_max_order_set = true;
 
 	return 1;
diff --git a/mm/slub.c b/mm/slub.c
index a5420be89c8c..ba162e661e2e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4194,7 +4194,7 @@ static inline int calculate_order(unsigned int size)
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
 	order = get_order(size);
-	if (order <= MAX_ORDER)
+	if (order <= MAX_PAGE_ORDER)
 		return order;
 	return -ENOSYS;
 }
@@ -4722,7 +4722,7 @@ __setup("slub_min_order=", setup_slub_min_order);
 static int __init setup_slub_max_order(char *str)
 {
 	get_option(&str, (int *)&slub_max_order);
-	slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER);
+	slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
 
 	if (slub_min_order > slub_max_order)
 		slub_min_order = slub_max_order;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 600ed3cbf7cb..68f0abbb8e59 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6415,7 +6415,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
 	 * Confirm they are large enough for max values.
 	 */
-	BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
+	BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
 	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
 	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 03ead31c46a0..db79935e4a54 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1092,7 +1092,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
 {
 	unsigned long requested = 1UL << order;
 
-	if (WARN_ON_ONCE(order > MAX_ORDER))
+	if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
 		return 0;
 
 	if (!info->free_blocks_total)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 89981dbe46c9..97704a9e84c7 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -844,7 +844,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 		goto out;
 	/* the calculated number of cq entries fits to mlx5 cq allocation */
 	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
-	smc_order = MAX_ORDER - cqe_size_order;
+	smc_order = MAX_PAGE_ORDER - cqe_size_order;
 	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
 		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
 	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 51ad29940f05..f3738b2c8bcd 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -38,7 +38,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp)
 
 	size = memparse(val, NULL);
 	order = get_order(size);
-	if (order > MAX_ORDER)
+	if (order > MAX_PAGE_ORDER)
 		return -EINVAL;
 	ima_maxorder = order;
 	ima_bufsize = PAGE_SIZE << order;
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index 4c90cc176f81..2109690b0d5f 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -683,7 +683,7 @@ Buffer handling
 ~~~~~~~~~~~~~~~
 
 There may be buffer limitations (i.e. single ToPa entry) which means that actual
-buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER).  In order to
+buffer sizes are limited to powers of 2 up to 4MiB (MAX_PAGE_ORDER).  In order to
 provide other sizes, and in particular an arbitrarily large size, multiple
 buffers are logically concatenated.  However an interrupt must be used to switch
 between buffers.  That has two potential problems:
diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h
index 134f8eab0768..71546e15bdd3 100644
--- a/tools/testing/memblock/linux/mmzone.h
+++ b/tools/testing/memblock/linux/mmzone.h
@@ -17,10 +17,10 @@ enum zone_type {
 };
 
 #define MAX_NR_ZONES __MAX_NR_ZONES
-#define MAX_ORDER 10
-#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+#define MAX_PAGE_ORDER 10
+#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)
 
-#define pageblock_order		MAX_ORDER
+#define pageblock_order		MAX_PAGE_ORDER
 #define pageblock_nr_pages	BIT(pageblock_order)
 #define pageblock_align(pfn)	ALIGN((pfn), pageblock_nr_pages)
 #define pageblock_start_pfn(pfn)	ALIGN_DOWN((pfn), pageblock_nr_pages)
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
index 16ed4dfa7359..622987f12c89 100644
--- a/tools/testing/selftests/mm/thuge-gen.c
+++ b/tools/testing/selftests/mm/thuge-gen.c
@@ -3,7 +3,8 @@
 
    Before running this huge pages for each huge page size must have been
    reserved.
-   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+   For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must
+   be used.
    Also shmmax must be increased.
    And you need to run as root to work around some weird permissions in shm.
    And nothing using huge pages should run in parallel.
-- 
cgit v1.2.3


From 3dc2f209208dddc73ffd1d817081711343de3242 Mon Sep 17 00:00:00 2001
From: ZhangPeng <zhangpeng362@huawei.com>
Date: Tue, 9 Jan 2024 10:45:47 +0800
Subject: swiotlb: check alloc_size before the allocation of a new memory pool

The allocation request for swiotlb contiguous memory greater than
128*2KB cannot be fulfilled because it exceeds the maximum contiguous
memory limit. If the swiotlb memory we allocate is larger than 128*2KB,
swiotlb_find_slots() will still schedule the allocation of a new memory
pool, which will increase memory overhead.

Fix it by adding a check with alloc_size no more than 128*2KB before
scheduling the allocation of a new memory pool in swiotlb_find_slots().

Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
Reviewed-by: Petr Tesarik <petr.tesarik1@huawei-partners.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e20b856255ef..96f832d828fd 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1136,6 +1136,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
 	int cpu, i;
 	int index;
 
+	if (alloc_size > IO_TLB_SEGSIZE * IO_TLB_SIZE)
+		return -1;
+
 	cpu = raw_smp_processor_id();
 	for (i = 0; i < default_nareas; ++i) {
 		index = swiotlb_search_area(dev, cpu, i, orig_addr, alloc_size,
-- 
cgit v1.2.3


From 25742aeb135c4a44e92fb347e037adaa145b9484 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 20 Dec 2023 08:10:28 -0500
Subject: ring-buffer: Remove stale comment from ring_buffer_size()

It's been 11 years since the ring_buffer_size() function was updated to
use the nr_pages from the buffer->buffers[cpu] structure instead of using
the buffer->nr_pages that no longer exists.

The comment in the code is more of what a change log should have and is
pretty much useless for development. It's saying how things worked back in
2012 that bares no purpose on today's code. Remove it.

Link: https://lore.kernel.org/linux-trace-kernel/84d3b41a72bd43dbb9d44921ef535c92@AcuMS.aculab.com/
Link: https://lore.kernel.org/linux-trace-kernel/20231220081028.7cd7e8e2@gandalf.local.home

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reported-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/ring_buffer.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 173d2595ce2d..7887d61d5b56 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5122,12 +5122,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
  */
 unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 {
-	/*
-	 * Earlier, this method returned
-	 *	buffer->subbuf_size * buffer->nr_pages
-	 * Since the nr_pages field is now removed, we have converted this to
-	 * return the per cpu buffer value.
-	 */
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return 0;
 
-- 
cgit v1.2.3


From 4a693ce65b186fddc1a73621bd6f941e6e3eca21 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 29 Dec 2023 16:02:13 +0800
Subject: kdump: defer the insertion of crashkernel resources

In /proc/iomem, sub-regions should be inserted after their parent,
otherwise the insertion of parent resource fails.  But after generic
crashkernel reservation applied, in both RISC-V and ARM64 (LoongArch will
also use generic reservation later on), crashkernel resources are inserted
before their parent, which causes the parent disappear in /proc/iomem.  So
we defer the insertion of crashkernel resources to an early_initcall().

1, Without 'crashkernel' parameter:

 100d0100-100d01ff : LOON0001:00
   100d0100-100d01ff : LOON0001:00 LOON0001:00
 100e0000-100e0bff : LOON0002:00
   100e0000-100e0bff : LOON0002:00 LOON0002:00
 1fe001e0-1fe001e7 : serial
 90400000-fa17ffff : System RAM
   f6220000-f622ffff : Reserved
   f9ee0000-f9ee3fff : Reserved
   fa120000-fa17ffff : Reserved
 fa190000-fe0bffff : System RAM
   fa190000-fa1bffff : Reserved
 fe4e0000-47fffffff : System RAM
   43c000000-441ffffff : Reserved
   47ff98000-47ffa3fff : Reserved
   47ffa4000-47ffa7fff : Reserved
   47ffa8000-47ffabfff : Reserved
   47ffac000-47ffaffff : Reserved
   47ffb0000-47ffb3fff : Reserved

2, With 'crashkernel' parameter, before this patch:

 100d0100-100d01ff : LOON0001:00
   100d0100-100d01ff : LOON0001:00 LOON0001:00
 100e0000-100e0bff : LOON0002:00
   100e0000-100e0bff : LOON0002:00 LOON0002:00
 1fe001e0-1fe001e7 : serial
 e6200000-f61fffff : Crash kernel
 fa190000-fe0bffff : System RAM
   fa190000-fa1bffff : Reserved
 fe4e0000-47fffffff : System RAM
   43c000000-441ffffff : Reserved
   47ff98000-47ffa3fff : Reserved
   47ffa4000-47ffa7fff : Reserved
   47ffa8000-47ffabfff : Reserved
   47ffac000-47ffaffff : Reserved
   47ffb0000-47ffb3fff : Reserved

3, With 'crashkernel' parameter, after this patch:

 100d0100-100d01ff : LOON0001:00
   100d0100-100d01ff : LOON0001:00 LOON0001:00
 100e0000-100e0bff : LOON0002:00
   100e0000-100e0bff : LOON0002:00 LOON0002:00
 1fe001e0-1fe001e7 : serial
 90400000-fa17ffff : System RAM
   e6200000-f61fffff : Crash kernel
   f6220000-f622ffff : Reserved
   f9ee0000-f9ee3fff : Reserved
   fa120000-fa17ffff : Reserved
 fa190000-fe0bffff : System RAM
   fa190000-fa1bffff : Reserved
 fe4e0000-47fffffff : System RAM
   43c000000-441ffffff : Reserved
   47ff98000-47ffa3fff : Reserved
   47ffa4000-47ffa7fff : Reserved
   47ffa8000-47ffabfff : Reserved
   47ffac000-47ffaffff : Reserved
   47ffb0000-47ffb3fff : Reserved

Link: https://lkml.kernel.org/r/20231229080213.2622204-1-chenhuacai@loongson.cn
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Fixes: 0ab97169aa05 ("crash_core: add generic function to do reservation")
Cc: Baoquan He <bhe@redhat.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: <stable@vger.kernel.org>	[6.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index d48315667752..4e2cac71b84f 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -376,7 +376,6 @@ static int __init reserve_crashkernel_low(unsigned long long low_size)
 
 	crashk_low_res.start = low_base;
 	crashk_low_res.end   = low_base + low_size - 1;
-	insert_resource(&iomem_resource, &crashk_low_res);
 #endif
 	return 0;
 }
@@ -458,8 +457,19 @@ retry:
 
 	crashk_res.start = crash_base;
 	crashk_res.end = crash_base + crash_size - 1;
-	insert_resource(&iomem_resource, &crashk_res);
 }
+
+static __init int insert_crashkernel_resources(void)
+{
+	if (crashk_res.start < crashk_res.end)
+		insert_resource(&iomem_resource, &crashk_res);
+
+	if (crashk_low_res.start < crashk_low_res.end)
+		insert_resource(&iomem_resource, &crashk_low_res);
+
+	return 0;
+}
+early_initcall(insert_crashkernel_resources);
 #endif
 
 int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
-- 
cgit v1.2.3


From 7bb943806ff61e83ae4cceef8906b7fe52453e8a Mon Sep 17 00:00:00 2001
From: James Gowans <jgowans@amazon.com>
Date: Wed, 13 Dec 2023 08:40:04 +0200
Subject: kexec: do syscore_shutdown() in kernel_kexec

syscore_shutdown() runs driver and module callbacks to get the system into
a state where it can be correctly shut down.  In commit 6f389a8f1dd2 ("PM
/ reboot: call syscore_shutdown() after disable_nonboot_cpus()")
syscore_shutdown() was removed from kernel_restart_prepare() and hence got
(incorrectly?) removed from the kexec flow.  This was innocuous until
commit 6735150b6997 ("KVM: Use syscore_ops instead of reboot_notifier to
hook restart/shutdown") changed the way that KVM registered its shutdown
callbacks, switching from reboot notifiers to syscore_ops.shutdown.  As
syscore_shutdown() is missing from kexec, KVM's shutdown hook is not run
and virtualisation is left enabled on the boot CPU which results in triple
faults when switching to the new kernel on Intel x86 VT-x with VMXE
enabled.

Fix this by adding syscore_shutdown() to the kexec sequence.  In terms of
where to add it, it is being added after migrating the kexec task to the
boot CPU, but before APs are shut down.  It is not totally clear if this
is the best place: in commit 6f389a8f1dd2 ("PM / reboot: call
syscore_shutdown() after disable_nonboot_cpus()") it is stated that
"syscore_ops operations should be carried with one CPU on-line and
interrupts disabled." APs are only offlined later in machine_shutdown(),
so this syscore_shutdown() is being run while APs are still online.  This
seems to be the correct place as it matches where syscore_shutdown() is
run in the reboot and halt flows - they also run it before APs are shut
down.  The assumption is that the commit message in commit 6f389a8f1dd2
("PM / reboot: call syscore_shutdown() after disable_nonboot_cpus()") is
no longer valid.

KVM has been discussed here as it is what broke loudly by not having
syscore_shutdown() in kexec, but this change impacts more than just KVM;
all drivers/modules which register a syscore_ops.shutdown callback will
now be invoked in the kexec flow.  Looking at some of them like x86 MCE it
is probably more correct to also shut these down during kexec.
Maintainers of all drivers which use syscore_ops.shutdown are added on CC
for visibility.  They are:

arch/powerpc/platforms/cell/spu_base.c  .shutdown = spu_shutdown,
arch/x86/kernel/cpu/mce/core.c	        .shutdown = mce_syscore_shutdown,
arch/x86/kernel/i8259.c                 .shutdown = i8259A_shutdown,
drivers/irqchip/irq-i8259.c	        .shutdown = i8259A_shutdown,
drivers/irqchip/irq-sun6i-r.c	        .shutdown = sun6i_r_intc_shutdown,
drivers/leds/trigger/ledtrig-cpu.c	.shutdown = ledtrig_cpu_syscore_shutdown,
drivers/power/reset/sc27xx-poweroff.c	.shutdown = sc27xx_poweroff_shutdown,
kernel/irq/generic-chip.c	        .shutdown = irq_gc_shutdown,
virt/kvm/kvm_main.c	                .shutdown = kvm_shutdown,

This has been tested by doing a kexec on x86_64 and aarch64.

Link: https://lkml.kernel.org/r/20231213064004.2419447-1-jgowans@amazon.com
Fixes: 6735150b6997 ("KVM: Use syscore_ops instead of reboot_notifier to hook restart/shutdown")
Signed-off-by: James Gowans <jgowans@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Chen-Yu Tsai <wens@csie.org>
Cc: Jernej Skrabec <jernej.skrabec@gmail.com>
Cc: Samuel Holland <samuel@sholland.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Orson Zhai <orsonzhai@gmail.com>
Cc: Alexander Graf <graf@amazon.de>
Cc: Jan H. Schoenherr <jschoenh@amazon.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/kexec_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index a08031b57a61..d08fc7b5db97 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1257,6 +1257,7 @@ int kernel_kexec(void)
 		kexec_in_progress = true;
 		kernel_restart_prepare("kexec reboot");
 		migrate_to_reboot_cpu();
+		syscore_shutdown();
 
 		/*
 		 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
-- 
cgit v1.2.3


From 4e87ff59cebb53d3ce1333245e64ab7d51ebf118 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 9 Jan 2024 21:35:21 -0800
Subject: kernel/crash_core.c: make __crash_hotplug_lock static

sparse warnings:
kernel/crash_core.c:749:1: sparse: sparse: symbol '__crash_hotplug_lock' was not declared. Should it be static?

Fixes: e2a8f20dd8e9 ("Crash: add lock to serialize crash hotplug handling")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401080654.IjjU5oK7-lkp@intel.com/
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/crash_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 4e2cac71b84f..75cd6a736d03 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -877,7 +877,7 @@ subsys_initcall(crash_notes_memory_init);
  * regions are online. So mutex lock  __crash_hotplug_lock is used to
  * serialize the crash hotplug handling specifically.
  */
-DEFINE_MUTEX(__crash_hotplug_lock);
+static DEFINE_MUTEX(__crash_hotplug_lock);
 #define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock)
 #define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock)
 
-- 
cgit v1.2.3


From 7c65aa3cc072cee76f577262fbe381a111a98774 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 13 Jan 2024 20:46:42 -0800
Subject: dma-debug: fix kernel-doc warnings

Update the kernel-doc comments to catch up with the code changes and
fix the kernel-doc warnings:

debug.c:83: warning: Excess struct member 'stacktrace' description in 'dma_debug_entry'
debug.c:83: warning: Function parameter or struct member 'stack_len' not described in 'dma_debug_entry'
debug.c:83: warning: Function parameter or struct member 'stack_entries' not described in 'dma_debug_entry'

Fixes: 746017ed8d4d ("dma/debug: Simplify stracktrace retrieval")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: iommu@lists.linux.dev
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 3de494375b7b..08b7f84b76f9 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -62,7 +62,8 @@ enum map_err_types {
  * @pfn: page frame of the start address
  * @offset: offset of mapping relative to pfn
  * @map_err_type: track whether dma_mapping_error() was checked
- * @stacktrace: support backtraces when a violation is detected
+ * @stack_len: number of backtrace entries in @stack_entries
+ * @stack_entries: stack of backtrace history
  */
 struct dma_debug_entry {
 	struct list_head list;
-- 
cgit v1.2.3


From e37617c8e53a1f7fcba6d5e1041f4fd8a2425c27 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Sun, 14 Jan 2024 19:36:00 +0100
Subject: sched/fair: Fix frequency selection for non-invariant case

Linus reported a ~50% performance regression on single-threaded
workloads on his AMD Ryzen system, and bisected it to:

  9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation")

When frequency invariance is not enabled, get_capacity_ref_freq(policy)
is supposed to return the current frequency and the performance margin
applied by map_util_perf(), enabling the utilization to go above the
maximum compute capacity and to select a higher frequency than the current one.

After the changes in 9c0b4bb7f630, the performance margin was applied
earlier in the path to take into account utilization clampings and
we couldn't get a utilization higher than the maximum compute capacity,
and the CPU remained 'stuck' at lower frequencies.

To fix this, we must use a frequency above the current frequency to
get a chance to select a higher OPP when the current one becomes fully used.
Apply the same margin and return a frequency 25% higher than the current
one in order to switch to the next OPP before we fully use the CPU
at the current one.

[ mingo: Clarified the changelog. ]

Fixes: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Bisected-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Wyes Karny <wkarny@gmail.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Wyes Karny <wkarny@gmail.com>
Link: https://lore.kernel.org/r/20240114183600.135316-1-vincent.guittot@linaro.org
---
 kernel/sched/cpufreq_schedutil.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 95c3c097083e..eece6244f9d2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -133,7 +133,11 @@ unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
 	if (arch_scale_freq_invariant())
 		return policy->cpuinfo.max_freq;
 
-	return policy->cur;
+	/*
+	 * Apply a 25% margin so that we select a higher frequency than
+	 * the current one before the CPU is fully busy:
+	 */
+	return policy->cur + (policy->cur >> 2);
 }
 
 /**
-- 
cgit v1.2.3


From 22c7fa171a02d310e3a3f6ed46a698ca8a0060ed Mon Sep 17 00:00:00 2001
From: Hao Sun <sunhao.th@gmail.com>
Date: Mon, 15 Jan 2024 09:20:27 +0100
Subject: bpf: Reject variable offset alu on PTR_TO_FLOW_KEYS

For PTR_TO_FLOW_KEYS, check_flow_keys_access() only uses fixed off
for validation. However, variable offset ptr alu is not prohibited
for this ptr kind. So the variable offset is not checked.

The following prog is accepted:

  func#0 @0
  0: R1=ctx() R10=fp0
  0: (bf) r6 = r1                       ; R1=ctx() R6_w=ctx()
  1: (79) r7 = *(u64 *)(r6 +144)        ; R6_w=ctx() R7_w=flow_keys()
  2: (b7) r8 = 1024                     ; R8_w=1024
  3: (37) r8 /= 1                       ; R8_w=scalar()
  4: (57) r8 &= 1024                    ; R8_w=scalar(smin=smin32=0,
  smax=umax=smax32=umax32=1024,var_off=(0x0; 0x400))
  5: (0f) r7 += r8
  mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1
  mark_precise: frame0: regs=r8 stack= before 4: (57) r8 &= 1024
  mark_precise: frame0: regs=r8 stack= before 3: (37) r8 /= 1
  mark_precise: frame0: regs=r8 stack= before 2: (b7) r8 = 1024
  6: R7_w=flow_keys(smin=smin32=0,smax=umax=smax32=umax32=1024,var_off
  =(0x0; 0x400)) R8_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1024,
  var_off=(0x0; 0x400))
  6: (79) r0 = *(u64 *)(r7 +0)          ; R0_w=scalar()
  7: (95) exit

This prog loads flow_keys to r7, and adds the variable offset r8
to r7, and finally causes out-of-bounds access:

  BUG: unable to handle page fault for address: ffffc90014c80038
  [...]
  Call Trace:
   <TASK>
   bpf_dispatcher_nop_func include/linux/bpf.h:1231 [inline]
   __bpf_prog_run include/linux/filter.h:651 [inline]
   bpf_prog_run include/linux/filter.h:658 [inline]
   bpf_prog_run_pin_on_cpu include/linux/filter.h:675 [inline]
   bpf_flow_dissect+0x15f/0x350 net/core/flow_dissector.c:991
   bpf_prog_test_run_flow_dissector+0x39d/0x620 net/bpf/test_run.c:1359
   bpf_prog_test_run kernel/bpf/syscall.c:4107 [inline]
   __sys_bpf+0xf8f/0x4560 kernel/bpf/syscall.c:5475
   __do_sys_bpf kernel/bpf/syscall.c:5561 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:5559 [inline]
   __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:5559
   do_syscall_x64 arch/x86/entry/common.c:52 [inline]
   do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:83
   entry_SYSCALL_64_after_hwframe+0x63/0x6b

Fix this by rejecting ptr alu with variable offset on flow_keys.
Applying the patch rejects the program with "R7 pointer arithmetic
on flow_keys prohibited".

Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
Signed-off-by: Hao Sun <sunhao.th@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20240115082028.9992-1-sunhao.th@gmail.com
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index adbf330d364b..65f598694d55 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12826,6 +12826,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	}
 
 	switch (base_type(ptr_reg->type)) {
+	case PTR_TO_FLOW_KEYS:
+		if (known)
+			break;
+		fallthrough;
 	case CONST_PTR_TO_MAP:
 		/* smin_val represents the known value */
 		if (known && smin_val == 0 && opcode == BPF_ADD)
-- 
cgit v1.2.3


From 4f41d30cd6dc865c3cbc1a852372321eba6d4e4c Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 25 Nov 2023 13:05:04 +0100
Subject: kdb: Fix a potential buffer overflow in kdb_local()

When appending "[defcmd]" to 'kdb_prompt_str', the size of the string
already in the buffer should be taken into account.

An option could be to switch from strncat() to strlcat() which does the
correct test to avoid such an overflow.

However, this actually looks as dead code, because 'defcmd_in_progress'
can't be true here.
See a more detailed explanation at [1].

[1]: https://lore.kernel.org/all/CAD=FV=WSh7wKN7Yp-3wWiDgX4E3isQ8uh0LCzTmd1v9Cg9j+nQ@mail.gmail.com/

Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
---
 kernel/debug/kdb/kdb_main.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 6b213c8252d6..d05066cb40b2 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1348,8 +1348,6 @@ do_full_getstr:
 		/* PROMPT can only be set if we have MEM_READ permission. */
 		snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
 			 raw_smp_processor_id());
-		if (defcmd_in_progress)
-			strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
 
 		/*
 		 * Fetch command from keyboard
-- 
cgit v1.2.3


From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 17 Jan 2024 19:31:40 -0800
Subject: bpf: extract bpf_ctx_convert_map logic and make it more reusable

Refactor btf_get_prog_ctx_type() a bit to allow reuse of
bpf_ctx_convert_map logic in more than one places. Simplify interface by
returning btf_type instead of btf_member (field reference in BTF).

To do the above we need to touch and start untangling
btf_translate_to_vmlinux() implementation. We do the bare minimum to
not regress anything for btf_translate_to_vmlinux(), but its
implementation is very questionable for what it claims to be doing.
Mapping kfunc argument types to kernel corresponding types conceptually
is quite different from recognizing program context types. Fixing this
is out of scope for this change though.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h |  2 +-
 kernel/bpf/btf.c    | 71 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 46 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 59d404e22814..cf5c6ff48981 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id);
 int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt,
 				struct module *owner);
 struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id);
-const struct btf_member *
+const struct btf_type *
 btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		      const struct btf_type *t, enum bpf_prog_type prog_type,
 		      int arg);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 51e8b4bee0c8..10ac9efc662d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5615,21 +5615,46 @@ static u8 bpf_ctx_convert_map[] = {
 #undef BPF_MAP_TYPE
 #undef BPF_LINK_TYPE
 
-const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
-		      const struct btf_type *t, enum bpf_prog_type prog_type,
-		      int arg)
+static const struct btf_type *find_canonical_prog_ctx_type(enum bpf_prog_type prog_type)
 {
 	const struct btf_type *conv_struct;
-	const struct btf_type *ctx_struct;
 	const struct btf_member *ctx_type;
-	const char *tname, *ctx_tname;
 
 	conv_struct = bpf_ctx_convert.t;
-	if (!conv_struct) {
-		bpf_log(log, "btf_vmlinux is malformed\n");
+	if (!conv_struct)
 		return NULL;
-	}
+	/* prog_type is valid bpf program type. No need for bounds check. */
+	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+	/* ctx_type is a pointer to prog_ctx_type in vmlinux.
+	 * Like 'struct __sk_buff'
+	 */
+	return btf_type_by_id(btf_vmlinux, ctx_type->type);
+}
+
+static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
+{
+	const struct btf_type *conv_struct;
+	const struct btf_member *ctx_type;
+
+	conv_struct = bpf_ctx_convert.t;
+	if (!conv_struct)
+		return -EFAULT;
+	/* prog_type is valid bpf program type. No need for bounds check. */
+	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+	/* ctx_type is a pointer to prog_ctx_type in vmlinux.
+	 * Like 'struct sk_buff'
+	 */
+	return ctx_type->type;
+}
+
+const struct btf_type *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+		      const struct btf_type *t, enum bpf_prog_type prog_type,
+		      int arg)
+{
+	const struct btf_type *ctx_type;
+	const char *tname, *ctx_tname;
+
 	t = btf_type_by_id(btf, t->type);
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
@@ -5646,17 +5671,15 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
 		bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
 		return NULL;
 	}
-	/* prog_type is valid bpf program type. No need for bounds check. */
-	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
-	/* ctx_struct is a pointer to prog_ctx_type in vmlinux.
-	 * Like 'struct __sk_buff'
-	 */
-	ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
-	if (!ctx_struct)
+
+	ctx_type = find_canonical_prog_ctx_type(prog_type);
+	if (!ctx_type) {
+		bpf_log(log, "btf_vmlinux is malformed\n");
 		/* should not happen */
 		return NULL;
+	}
 again:
-	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
 	if (!ctx_tname) {
 		/* should not happen */
 		bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
@@ -5677,10 +5700,10 @@ again:
 		/* bpf_user_pt_regs_t is a typedef, so resolve it to
 		 * underlying struct and check name again
 		 */
-		if (!btf_type_is_modifier(ctx_struct))
+		if (!btf_type_is_modifier(ctx_type))
 			return NULL;
-		while (btf_type_is_modifier(ctx_struct))
-			ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type);
+		while (btf_type_is_modifier(ctx_type))
+			ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
 		goto again;
 	}
 	return ctx_type;
@@ -5692,13 +5715,9 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				     enum bpf_prog_type prog_type,
 				     int arg)
 {
-	const struct btf_member *prog_ctx_type, *kern_ctx_type;
-
-	prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
-	if (!prog_ctx_type)
+	if (!btf_get_prog_ctx_type(log, btf, t, prog_type, arg))
 		return -ENOENT;
-	kern_ctx_type = prog_ctx_type + 1;
-	return kern_ctx_type->type;
+	return find_kern_ctx_type_id(prog_type);
 }
 
 int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
-- 
cgit v1.2.3


From 0ba971511d16603599f947459e59b435cc465b0d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 17 Jan 2024 19:31:41 -0800
Subject: bpf: enforce types for __arg_ctx-tagged arguments in global subprogs

Add enforcement of expected types for context arguments tagged with
arg:ctx (__arg_ctx) tag.

First, any program type will accept generic `void *` context type when
combined with __arg_ctx tag.

Besides accepting "canonical" struct names and `void *`, for a bunch of
program types for which program context is actually a named struct, we
allows a bunch of pragmatic exceptions to match real-world and expected
usage:

  - for both kprobes and perf_event we allow `bpf_user_pt_regs_t *` as
    canonical context argument type, where `bpf_user_pt_regs_t` is a
    *typedef*, not a struct;
  - for kprobes, we also always accept `struct pt_regs *`, as that's what
    actually is passed as a context to any kprobe program;
  - for perf_event, we resolve typedefs (unless it's `bpf_user_pt_regs_t`)
    down to actual struct type and accept `struct pt_regs *`, or
    `struct user_pt_regs *`, or `struct user_regs_struct *`, depending
    on the actual struct type kernel architecture points `bpf_user_pt_regs_t`
    typedef to; otherwise, canonical `struct bpf_perf_event_data *` is
    expected;
  - for raw_tp/raw_tp.w programs, `u64/long *` are accepted, as that's
    what's expected with BPF_PROG() usage; otherwise, canonical
    `struct bpf_raw_tracepoint_args *` is expected;
  - tp_btf supports both `struct bpf_raw_tracepoint_args *` and `u64 *`
    formats, both are coded as expections as tp_btf is actually a TRACING
    program type, which has no canonical context type;
  - iterator programs accept `struct bpf_iter__xxx *` structs, currently
    with no further iterator-type specific enforcement;
  - fentry/fexit/fmod_ret/lsm/struct_ops all accept `u64 *`;
  - classic tracepoint programs, as well as syscall and freplace
    programs allow any user-provided type.

In all other cases kernel will enforce exact match of struct name to
expected canonical type. And if user-provided type doesn't match that
expectation, verifier will emit helpful message with expected type name.

Note a bit unnatural way the check is done after processing all the
arguments. This is done to avoid conflict between bpf and bpf-next
trees. Once trees converge, a small follow up patch will place a simple
btf_validate_prog_ctx_type() check into a proper ARG_PTR_TO_CTX branch
(which bpf-next tree patch refactored already), removing duplicated
arg:ctx detection logic.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240118033143.3384355-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 10ac9efc662d..596471189176 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5709,6 +5709,149 @@ again:
 	return ctx_type;
 }
 
+/* forward declarations for arch-specific underlying types of
+ * bpf_user_pt_regs_t; this avoids the need for arch-specific #ifdef
+ * compilation guards below for BPF_PROG_TYPE_PERF_EVENT checks, but still
+ * works correctly with __builtin_types_compatible_p() on respective
+ * architectures
+ */
+struct user_regs_struct;
+struct user_pt_regs;
+
+static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
+				      const struct btf_type *t, int arg,
+				      enum bpf_prog_type prog_type,
+				      enum bpf_attach_type attach_type)
+{
+	const struct btf_type *ctx_type;
+	const char *tname, *ctx_tname;
+
+	if (!btf_is_ptr(t)) {
+		bpf_log(log, "arg#%d type isn't a pointer\n", arg);
+		return -EINVAL;
+	}
+	t = btf_type_by_id(btf, t->type);
+
+	/* KPROBE and PERF_EVENT programs allow bpf_user_pt_regs_t typedef */
+	if (prog_type == BPF_PROG_TYPE_KPROBE || prog_type == BPF_PROG_TYPE_PERF_EVENT) {
+		while (btf_type_is_modifier(t) && !btf_type_is_typedef(t))
+			t = btf_type_by_id(btf, t->type);
+
+		if (btf_type_is_typedef(t)) {
+			tname = btf_name_by_offset(btf, t->name_off);
+			if (tname && strcmp(tname, "bpf_user_pt_regs_t") == 0)
+				return 0;
+		}
+	}
+
+	/* all other program types don't use typedefs for context type */
+	while (btf_type_is_modifier(t))
+		t = btf_type_by_id(btf, t->type);
+
+	/* `void *ctx __arg_ctx` is always valid */
+	if (btf_type_is_void(t))
+		return 0;
+
+	tname = btf_name_by_offset(btf, t->name_off);
+	if (str_is_empty(tname)) {
+		bpf_log(log, "arg#%d type doesn't have a name\n", arg);
+		return -EINVAL;
+	}
+
+	/* special cases */
+	switch (prog_type) {
+	case BPF_PROG_TYPE_KPROBE:
+		if (__btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+			return 0;
+		break;
+	case BPF_PROG_TYPE_PERF_EVENT:
+		if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) &&
+		    __btf_type_is_struct(t) && strcmp(tname, "pt_regs") == 0)
+			return 0;
+		if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) &&
+		    __btf_type_is_struct(t) && strcmp(tname, "user_pt_regs") == 0)
+			return 0;
+		if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) &&
+		    __btf_type_is_struct(t) && strcmp(tname, "user_regs_struct") == 0)
+			return 0;
+		break;
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+		/* allow u64* as ctx */
+		if (btf_is_int(t) && t->size == 8)
+			return 0;
+		break;
+	case BPF_PROG_TYPE_TRACING:
+		switch (attach_type) {
+		case BPF_TRACE_RAW_TP:
+			/* tp_btf program is TRACING, so need special case here */
+			if (__btf_type_is_struct(t) &&
+			    strcmp(tname, "bpf_raw_tracepoint_args") == 0)
+				return 0;
+			/* allow u64* as ctx */
+			if (btf_is_int(t) && t->size == 8)
+				return 0;
+			break;
+		case BPF_TRACE_ITER:
+			/* allow struct bpf_iter__xxx types only */
+			if (__btf_type_is_struct(t) &&
+			    strncmp(tname, "bpf_iter__", sizeof("bpf_iter__") - 1) == 0)
+				return 0;
+			break;
+		case BPF_TRACE_FENTRY:
+		case BPF_TRACE_FEXIT:
+		case BPF_MODIFY_RETURN:
+			/* allow u64* as ctx */
+			if (btf_is_int(t) && t->size == 8)
+				return 0;
+			break;
+		default:
+			break;
+		}
+		break;
+	case BPF_PROG_TYPE_LSM:
+	case BPF_PROG_TYPE_STRUCT_OPS:
+		/* allow u64* as ctx */
+		if (btf_is_int(t) && t->size == 8)
+			return 0;
+		break;
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_SYSCALL:
+	case BPF_PROG_TYPE_EXT:
+		return 0; /* anything goes */
+	default:
+		break;
+	}
+
+	ctx_type = find_canonical_prog_ctx_type(prog_type);
+	if (!ctx_type) {
+		/* should not happen */
+		bpf_log(log, "btf_vmlinux is malformed\n");
+		return -EINVAL;
+	}
+
+	/* resolve typedefs and check that underlying structs are matching as well */
+	while (btf_type_is_modifier(ctx_type))
+		ctx_type = btf_type_by_id(btf_vmlinux, ctx_type->type);
+
+	/* if program type doesn't have distinctly named struct type for
+	 * context, then __arg_ctx argument can only be `void *`, which we
+	 * already checked above
+	 */
+	if (!__btf_type_is_struct(ctx_type)) {
+		bpf_log(log, "arg#%d should be void pointer\n", arg);
+		return -EINVAL;
+	}
+
+	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_type->name_off);
+	if (!__btf_type_is_struct(t) || strcmp(ctx_tname, tname) != 0) {
+		bpf_log(log, "arg#%d should be `struct %s *`\n", arg, ctx_tname);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
 				     struct btf *btf,
 				     const struct btf_type *t,
@@ -6953,6 +7096,23 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 		return -EINVAL;
 	}
 
+	for (i = 0; i < nargs; i++) {
+		const char *tag;
+
+		if (sub->args[i].arg_type != ARG_PTR_TO_CTX)
+			continue;
+
+		/* check if arg has "arg:ctx" tag */
+		t = btf_type_by_id(btf, args[i].type);
+		tag = btf_find_decl_tag_value(btf, fn_t, i, "arg:");
+		if (IS_ERR_OR_NULL(tag) || strcmp(tag, "ctx") != 0)
+			continue;
+
+		if (btf_validate_prog_ctx_type(log, btf, t, i, prog_type,
+					       prog->expected_attach_type))
+			return -EINVAL;
+	}
+
 	sub->arg_cnt = nargs;
 	sub->args_cached = true;
 
-- 
cgit v1.2.3


From e626cb02ee8399fd42c415e542d031d185783903 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 18 Jan 2024 12:54:51 +0100
Subject: futex: Prevent the reuse of stale pi_state

Jiri Slaby reported a futex state inconsistency resulting in -EINVAL during
a lock operation for a PI futex. It requires that the a lock process is
interrupted by a timeout or signal:

  T1 Owns the futex in user space.

  T2 Tries to acquire the futex in kernel (futex_lock_pi()). Allocates a
     pi_state and attaches itself to it.

  T2 Times out and removes its rt_waiter from the rt_mutex. Drops the
     rtmutex lock and tries to acquire the hash bucket lock to remove
     the futex_q. The lock is contended and T2 schedules out.

  T1 Unlocks the futex (futex_unlock_pi()). Finds a futex_q but no
     rt_waiter. Unlocks the futex (do_uncontended) and makes it available
     to user space.

  T3 Acquires the futex in user space.

  T4 Tries to acquire the futex in kernel (futex_lock_pi()). Finds the
     existing futex_q of T2 and tries to attach itself to the existing
     pi_state.  This (attach_to_pi_state()) fails with -EINVAL because uval
     contains the TID of T3 but pi_state points to T1.

It's incorrect to unlock the futex and make it available for user space to
acquire as long as there is still an existing state attached to it in the
kernel.

T1 cannot hand over the futex to T2 because T2 already gave up and started
to clean up and is blocked on the hash bucket lock, so T2's futex_q with
the pi_state pointing to T1 is still queued.

T2 observes the futex_q, but ignores it as there is no waiter on the
corresponding rt_mutex and takes the uncontended path which allows the
subsequent caller of futex_lock_pi() (T4) to observe that stale state.

To prevent this the unlock path must dequeue all futex_q entries which
point to the same pi_state when there is no waiter on the rt mutex. This
requires obviously to make the dequeue conditional in the locking path to
prevent a double dequeue. With that it's guaranteed that user space cannot
observe an uncontended futex which has kernel state attached.

Fixes: fbeb558b0dd0d ("futex/pi: Fix recursive rt_mutex waiter state")
Reported-by: Jiri Slaby <jirislaby@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20240118115451.0TkD_ZhB@linutronix.de
Closes: https://lore.kernel.org/all/4611bcf2-44d0-4c34-9b84-17406f881003@kernel.org
---
 kernel/futex/core.c | 15 ++++++++++++---
 kernel/futex/pi.c   | 11 ++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index e0e853412c15..1e78ef24321e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -627,12 +627,21 @@ retry:
 }
 
 /*
- * PI futexes can not be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
+ * PI futexes can not be requeued and must remove themselves from the hash
+ * bucket. The hash bucket lock (i.e. lock_ptr) is held.
  */
 void futex_unqueue_pi(struct futex_q *q)
 {
-	__futex_unqueue(q);
+	/*
+	 * If the lock was not acquired (due to timeout or signal) then the
+	 * rt_waiter is removed before futex_q is. If this is observed by
+	 * an unlocker after dropping the rtmutex wait lock and before
+	 * acquiring the hash bucket lock, then the unlocker dequeues the
+	 * futex_q from the hash bucket list to guarantee consistent state
+	 * vs. userspace. Therefore the dequeue here must be conditional.
+	 */
+	if (!plist_node_empty(&q->list))
+		__futex_unqueue(q);
 
 	BUG_ON(!q->pi_state);
 	put_pi_state(q->pi_state);
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index 90e5197f4e56..5722467f2737 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1135,6 +1135,7 @@ retry:
 
 	hb = futex_hash(&key);
 	spin_lock(&hb->lock);
+retry_hb:
 
 	/*
 	 * Check waiters first. We do not trust user space values at
@@ -1177,12 +1178,17 @@ retry:
 		/*
 		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
 		 * waiters even though futex thinks there are, then the waiter
-		 * is leaving and the uncontended path is safe to take.
+		 * is leaving. The entry needs to be removed from the list so a
+		 * new futex_lock_pi() is not using this stale PI-state while
+		 * the futex is available in user space again.
+		 * There can be more than one task on its way out so it needs
+		 * to retry.
 		 */
 		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
 		if (!rt_waiter) {
+			__futex_unqueue(top_waiter);
 			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-			goto do_uncontended;
+			goto retry_hb;
 		}
 
 		get_pi_state(pi_state);
@@ -1217,7 +1223,6 @@ retry:
 		return ret;
 	}
 
-do_uncontended:
 	/*
 	 * We have no kernel internal state, i.e. no waiters in the
 	 * kernel. Waiters which are about to queue themselves are stuck
-- 
cgit v1.2.3


From 71fee48fb772ac4f6cfa63dbebc5629de8b4cc09 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 15 Jan 2024 17:35:55 +0100
Subject: tick-sched: Fix idle and iowait sleeptime accounting vs CPU hotplug

When offlining and onlining CPUs the overall reported idle and iowait
times as reported by /proc/stat jump backward and forward:

cpu  132 0 176 225249 47 6 6 21 0 0
cpu0 80 0 115 112575 33 3 4 18 0 0
cpu1 52 0 60 112673 13 3 1 2 0 0

cpu  133 0 177 226681 47 6 6 21 0 0
cpu0 80 0 116 113387 33 3 4 18 0 0

cpu  133 0 178 114431 33 6 6 21 0 0 <---- jump backward
cpu0 80 0 116 114247 33 3 4 18 0 0
cpu1 52 0 61 183 0 3 1 2 0 0        <---- idle + iowait start with 0

cpu  133 0 178 228956 47 6 6 21 0 0 <---- jump forward
cpu0 81 0 117 114929 33 3 4 18 0 0

Reason for this is that get_idle_time() in fs/proc/stat.c has different
sources for both values depending on if a CPU is online or offline:

- if a CPU is online the values may be taken from its per cpu
  tick_cpu_sched structure

- if a CPU is offline the values are taken from its per cpu cpustat
  structure

The problem is that the per cpu tick_cpu_sched structure is set to zero on
CPU offline. See tick_cancel_sched_timer() in kernel/time/tick-sched.c.

Therefore when a CPU is brought offline and online afterwards both its idle
and iowait sleeptime will be zero, causing a jump backward in total system
idle and iowait sleeptime. In a similar way if a CPU is then brought
offline again the total idle and iowait sleeptimes will jump forward.

It looks like this behavior was introduced with commit 4b0c0f294f60
("tick: Cleanup NOHZ per cpu data on cpu down").

This was only noticed now on s390, since we switched to generic idle time
reporting with commit be76ea614460 ("s390/idle: remove arch_cpu_idle_time()
and corresponding code").

Fix this by preserving the values of idle_sleeptime and iowait_sleeptime
members of the per-cpu tick_sched structure on CPU hotplug.

Fixes: 4b0c0f294f60 ("tick: Cleanup NOHZ per cpu data on cpu down")
Reported-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240115163555.1004144-1-hca@linux.ibm.com
---
 kernel/time/tick-sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a17d26002831..d2501673028d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1576,13 +1576,18 @@ void tick_setup_sched_timer(void)
 void tick_cancel_sched_timer(int cpu)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t idle_sleeptime, iowait_sleeptime;
 
 # ifdef CONFIG_HIGH_RES_TIMERS
 	if (ts->sched_timer.base)
 		hrtimer_cancel(&ts->sched_timer);
 # endif
 
+	idle_sleeptime = ts->idle_sleeptime;
+	iowait_sleeptime = ts->iowait_sleeptime;
 	memset(ts, 0, sizeof(*ts));
+	ts->idle_sleeptime = idle_sleeptime;
+	ts->iowait_sleeptime = iowait_sleeptime;
 }
 #endif
 
-- 
cgit v1.2.3


From 2b44760609e9eaafc9d234a6883d042fc21132a7 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Mon, 22 Jan 2024 16:09:28 +0100
Subject: tracing: Ensure visibility when inserting an element into tracing_map

Running the following two commands in parallel on a multi-processor
AArch64 machine can sporadically produce an unexpected warning about
duplicate histogram entries:

 $ while true; do
     echo hist:key=id.syscall:val=hitcount > \
       /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
     cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
     sleep 0.001
   done
 $ stress-ng --sysbadaddr $(nproc)

The warning looks as follows:

[ 2911.172474] ------------[ cut here ]------------
[ 2911.173111] Duplicates detected: 1
[ 2911.173574] WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408
[ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E)
[ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1
[ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G            E      6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01
[ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018
[ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
[ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408
[ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408
[ 2911.185310] sp : ffff8000a1513900
[ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001
[ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008
[ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180
[ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff
[ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8
[ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731
[ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c
[ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8
[ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000
[ 2911.193404] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0003ff254480
[ 2911.194259] Call trace:
[ 2911.194626]  tracing_map_sort_entries+0x3e0/0x408
[ 2911.195220]  hist_show+0x124/0x800
[ 2911.195692]  seq_read_iter+0x1d4/0x4e8
[ 2911.196193]  seq_read+0xe8/0x138
[ 2911.196638]  vfs_read+0xc8/0x300
[ 2911.197078]  ksys_read+0x70/0x108
[ 2911.197534]  __arm64_sys_read+0x24/0x38
[ 2911.198046]  invoke_syscall+0x78/0x108
[ 2911.198553]  el0_svc_common.constprop.0+0xd0/0xf8
[ 2911.199157]  do_el0_svc+0x28/0x40
[ 2911.199613]  el0_svc+0x40/0x178
[ 2911.200048]  el0t_64_sync_handler+0x13c/0x158
[ 2911.200621]  el0t_64_sync+0x1a8/0x1b0
[ 2911.201115] ---[ end trace 0000000000000000 ]---

The problem appears to be caused by CPU reordering of writes issued from
__tracing_map_insert().

The check for the presence of an element with a given key in this
function is:

 val = READ_ONCE(entry->val);
 if (val && keys_match(key, val->key, map->key_size)) ...

The write of a new entry is:

 elt = get_free_elt(map);
 memcpy(elt->key, key, map->key_size);
 entry->val = elt;

The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;"
stores may become visible in the reversed order on another CPU. This
second CPU might then incorrectly determine that a new key doesn't match
an already present val->key and subsequently insert a new element,
resulting in a duplicate.

Fix the problem by adding a write barrier between
"memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for
good measure, also use WRITE_ONCE(entry->val, elt) for publishing the
element. The sequence pairs with the mentioned "READ_ONCE(entry->val);"
and the "val->key" check which has an address dependency.

The barrier is placed on a path executed when adding an element for
a new key. Subsequent updates targeting the same key remain unaffected.

From the user's perspective, the issue was introduced by commit
c193707dde77 ("tracing: Remove code which merges duplicates"), which
followed commit cbf4100efb8f ("tracing: Add support to detect and avoid
duplicates"). The previous code operated differently; it inherently
expected potential races which result in duplicates but merged them
later when they occurred.

Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com

Fixes: c193707dde77 ("tracing: Remove code which merges duplicates")
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index c774e560f2f9..a4dcf0f24352 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 				}
 
 				memcpy(elt->key, key, map->key_size);
-				entry->val = elt;
+				/*
+				 * Ensure the initialization is visible and
+				 * publish the elt.
+				 */
+				smp_wmb();
+				WRITE_ONCE(entry->val, elt);
 				atomic64_inc(&map->hits);
 
 				return entry->val;
-- 
cgit v1.2.3


From b184c8c2889ceef0a137c7d0567ef9fe3d92276e Mon Sep 17 00:00:00 2001
From: Dawei Li <dawei.li@shingroup.cn>
Date: Mon, 22 Jan 2024 16:57:15 +0800
Subject: genirq: Initialize resend_node hlist for all interrupt descriptors

For a CONFIG_SPARSE_IRQ=n kernel, early_irq_init() is supposed to
initialize all interrupt descriptors.

It does except for irq_desc::resend_node, which ia only initialized for the
first descriptor.

Use the indexed decriptor and not the base pointer to address that.

Fixes: bc06a9e08742 ("genirq: Use hlist for managing resend handlers")
Signed-off-by: Dawei Li <dawei.li@shingroup.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240122085716.2999875-5-dawei.li@shingroup.cn
---
 kernel/irq/irqdesc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 27ca1c866f29..371eb1711d34 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -600,7 +600,7 @@ int __init early_irq_init(void)
 		mutex_init(&desc[i].request_mutex);
 		init_waitqueue_head(&desc[i].wait_for_threads);
 		desc_set_defaults(i, &desc[i], node, NULL, NULL);
-		irq_resend_init(desc);
+		irq_resend_init(&desc[i]);
 	}
 	return arch_early_irq_init();
 }
-- 
cgit v1.2.3


From e787644caf7628ad3269c1fbd321c3255cf51710 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Tue, 19 Dec 2023 00:19:15 +0100
Subject: rcu: Defer RCU kthreads wakeup when CPU is dying

When the CPU goes idle for the last time during the CPU down hotplug
process, RCU reports a final quiescent state for the current CPU. If
this quiescent state propagates up to the top, some tasks may then be
woken up to complete the grace period: the main grace period kthread
and/or the expedited main workqueue (or kworker).

If those kthreads have a SCHED_FIFO policy, the wake up can indirectly
arm the RT bandwith timer to the local offline CPU. Since this happens
after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the
timer gets ignored. Therefore if the RCU kthreads are waiting for RT
bandwidth to be available, they may never be actually scheduled.

This triggers TREE03 rcutorture hangs:

	 rcu: INFO: rcu_preempt self-detected stall on CPU
	 rcu:     4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved)
	 rcu:     (t=21035 jiffies g=938281 q=40787 ncpus=6)
	 rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0
	 rcu:     Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior.
	 rcu: RCU grace-period kthread stack dump:
	 task:rcu_preempt     state:R  running task     stack:14896 pid:14    tgid:14    ppid:2      flags:0x00004000
	 Call Trace:
	  <TASK>
	  __schedule+0x2eb/0xa80
	  schedule+0x1f/0x90
	  schedule_timeout+0x163/0x270
	  ? __pfx_process_timeout+0x10/0x10
	  rcu_gp_fqs_loop+0x37c/0x5b0
	  ? __pfx_rcu_gp_kthread+0x10/0x10
	  rcu_gp_kthread+0x17c/0x200
	  kthread+0xde/0x110
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork+0x2b/0x40
	  ? __pfx_kthread+0x10/0x10
	  ret_from_fork_asm+0x1b/0x30
	  </TASK>

The situation can't be solved with just unpinning the timer. The hrtimer
infrastructure and the nohz heuristics involved in finding the best
remote target for an unpinned timer would then also need to handle
enqueues from an offline CPU in the most horrendous way.

So fix this on the RCU side instead and defer the wake up to an online
CPU if it's too late for the local one.

Reported-by: Paul E. McKenney <paulmck@kernel.org>
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
---
 kernel/rcu/tree.c     | 34 +++++++++++++++++++++++++++++++++-
 kernel/rcu/tree_exp.h |  3 +--
 2 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1ae851777806..b2bccfd37c38 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
 	return needmore;
 }
 
+static void swake_up_one_online_ipi(void *arg)
+{
+	struct swait_queue_head *wqh = arg;
+
+	swake_up_one(wqh);
+}
+
+static void swake_up_one_online(struct swait_queue_head *wqh)
+{
+	int cpu = get_cpu();
+
+	/*
+	 * If called from rcutree_report_cpu_starting(), wake up
+	 * is dangerous that late in the CPU-down hotplug process. The
+	 * scheduler might queue an ignored hrtimer. Defer the wake up
+	 * to an online CPU instead.
+	 */
+	if (unlikely(cpu_is_offline(cpu))) {
+		int target;
+
+		target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
+					 cpu_online_mask);
+
+		smp_call_function_single(target, swake_up_one_online_ipi,
+					 wqh, 0);
+		put_cpu();
+	} else {
+		put_cpu();
+		swake_up_one(wqh);
+	}
+}
+
 /*
  * Awaken the grace-period kthread.  Don't do a self-awaken (unless in an
  * interrupt or softirq handler, in which case we just might immediately
@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void)
 		return;
 	WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
 	WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
-	swake_up_one(&rcu_state.gp_wq);
+	swake_up_one_online(&rcu_state.gp_wq);
 }
 
 /*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6d7cea5d591f..2ac440bc7e10 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
 	return ret;
 }
 
-
 /*
  * Report the exit from RCU read-side critical section for the last task
  * that queued itself during or before the current expedited preemptible-RCU
@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 			if (wake) {
 				smp_mb(); /* EGP done before wake_up(). */
-				swake_up_one(&rcu_state.expedited_wq);
+				swake_up_one_online(&rcu_state.expedited_wq);
 			}
 			break;
 		}
-- 
cgit v1.2.3


From 90383cc07895183c75a0db2460301c2ffd912359 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 24 Jan 2024 11:15:33 -0800
Subject: exec: Distinguish in_execve from in_exec

Just to help distinguish the fs->in_exec flag from the current->in_execve
flag, add comments in check_unsafe_exec() and copy_fs() for more
context. Also note that in_execve is only used by TOMOYO now.

Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 fs/exec.c             | 1 +
 include/linux/sched.h | 2 +-
 kernel/fork.c         | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 39d773021fff..d179abb78a1c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1633,6 +1633,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	}
 	rcu_read_unlock();
 
+	/* "users" and "in_exec" locked for copy_fs() */
 	if (p->fs->users > n_fs)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cdb8ea53c365..ffe8f618ab86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,7 +920,7 @@ struct task_struct {
 	unsigned			sched_rt_mutex:1;
 #endif
 
-	/* Bit to tell LSMs we're in execve(): */
+	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
 #ifndef TIF_RESTORE_SIGMASK
diff --git a/kernel/fork.c b/kernel/fork.c
index 47ff3b35352e..0d944e92a43f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1748,6 +1748,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
 		spin_lock(&fs->lock);
+		/* "users" and "in_exec" locked for check_unsafe_exec() */
 		if (fs->in_exec) {
 			spin_unlock(&fs->lock);
 			return -EAGAIN;
-- 
cgit v1.2.3


From 644649553508b9bacf0fc7a5bdc4f9e0165576a5 Mon Sep 17 00:00:00 2001
From: Jiri Wiesner <jwiesner@suse.de>
Date: Mon, 22 Jan 2024 18:23:50 +0100
Subject: clocksource: Skip watchdog check for large watchdog intervals

There have been reports of the watchdog marking clocksources unstable on
machines with 8 NUMA nodes:

  clocksource: timekeeping watchdog on CPU373:
  Marking clocksource 'tsc' as unstable because the skew is too large:
  clocksource:   'hpet' wd_nsec: 14523447520
  clocksource:   'tsc'  cs_nsec: 14524115132

The measured clocksource skew - the absolute difference between cs_nsec
and wd_nsec - was 668 microseconds:

  cs_nsec - wd_nsec = 14524115132 - 14523447520 = 667612

The kernel used 200 microseconds for the uncertainty_margin of both the
clocksource and watchdog, resulting in a threshold of 400 microseconds (the
md variable). Both the cs_nsec and the wd_nsec value indicate that the
readout interval was circa 14.5 seconds.  The observed behaviour is that
watchdog checks failed for large readout intervals on 8 NUMA node
machines. This indicates that the size of the skew was directly proportinal
to the length of the readout interval on those machines. The measured
clocksource skew, 668 microseconds, was evaluated against a threshold (the
md variable) that is suited for readout intervals of roughly
WATCHDOG_INTERVAL, i.e. HZ >> 1, which is 0.5 second.

The intention of 2e27e793e280 ("clocksource: Reduce clocksource-skew
threshold") was to tighten the threshold for evaluating skew and set the
lower bound for the uncertainty_margin of clocksources to twice
WATCHDOG_MAX_SKEW. Later in c37e85c135ce ("clocksource: Loosen clocksource
watchdog constraints"), the WATCHDOG_MAX_SKEW constant was increased to
125 microseconds to fit the limit of NTP, which is able to use a
clocksource that suffers from up to 500 microseconds of skew per second.
Both the TSC and the HPET use default uncertainty_margin. When the
readout interval gets stretched the default uncertainty_margin is no
longer a suitable lower bound for evaluating skew - it imposes a limit
that is far stricter than the skew with which NTP can deal.

The root causes of the skew being directly proportinal to the length of
the readout interval are:

  * the inaccuracy of the shift/mult pairs of clocksources and the watchdog
  * the conversion to nanoseconds is imprecise for large readout intervals

Prevent this by skipping the current watchdog check if the readout
interval exceeds 2 * WATCHDOG_INTERVAL. Considering the maximum readout
interval of 2 * WATCHDOG_INTERVAL, the current default uncertainty margin
(of the TSC and HPET) corresponds to a limit on clocksource skew of 250
ppm (microseconds of skew per second).  To keep the limit imposed by NTP
(500 microseconds of skew per second) for all possible readout intervals,
the margins would have to be scaled so that the threshold value is
proportional to the length of the actual readout interval.

As for why the readout interval may get stretched: Since the watchdog is
executed in softirq context the expiration of the watchdog timer can get
severely delayed on account of a ksoftirqd thread not getting to run in a
timely manner. Surely, a system with such belated softirq execution is not
working well and the scheduling issue should be looked into but the
clocksource watchdog should be able to deal with it accordingly.

Fixes: 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold")
Suggested-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Jiri Wiesner <jwiesner@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: Feng Tang <feng.tang@intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240122172350.GA740@incl
---
 kernel/time/clocksource.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c108ed8a9804..3052b1f1168e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -99,6 +99,7 @@ static u64 suspend_start;
  * Interval: 0.5sec.
  */
 #define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
 
 /*
  * Threshold: 0.0312s, when doubled: 0.0625s.
@@ -134,6 +135,7 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
 static atomic_t watchdog_reset_pending;
+static int64_t watchdog_max_interval;
 
 static inline void clocksource_watchdog_lock(unsigned long *flags)
 {
@@ -399,8 +401,8 @@ static inline void clocksource_reset_watchdog(void)
 static void clocksource_watchdog(struct timer_list *unused)
 {
 	u64 csnow, wdnow, cslast, wdlast, delta;
+	int64_t wd_nsec, cs_nsec, interval;
 	int next_cpu, reset_pending;
-	int64_t wd_nsec, cs_nsec;
 	struct clocksource *cs;
 	enum wd_read_status read_ret;
 	unsigned long extra_wait = 0;
@@ -470,6 +472,27 @@ static void clocksource_watchdog(struct timer_list *unused)
 		if (atomic_read(&watchdog_reset_pending))
 			continue;
 
+		/*
+		 * The processing of timer softirqs can get delayed (usually
+		 * on account of ksoftirqd not getting to run in a timely
+		 * manner), which causes the watchdog interval to stretch.
+		 * Skew detection may fail for longer watchdog intervals
+		 * on account of fixed margins being used.
+		 * Some clocksources, e.g. acpi_pm, cannot tolerate
+		 * watchdog intervals longer than a few seconds.
+		 */
+		interval = max(cs_nsec, wd_nsec);
+		if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
+			if (system_state > SYSTEM_SCHEDULING &&
+			    interval > 2 * watchdog_max_interval) {
+				watchdog_max_interval = interval;
+				pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
+					cs_nsec, wd_nsec);
+			}
+			watchdog_timer.expires = jiffies;
+			continue;
+		}
+
 		/* Check the deviation from the watchdog clocksource. */
 		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
 		if (abs(cs_nsec - wd_nsec) > md) {
-- 
cgit v1.2.3


From 9a574ea9069be30b835a3da772c039993c43369b Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Mon, 22 Jan 2024 15:35:34 -0800
Subject: tick/sched: Preserve number of idle sleeps across CPU hotplug events

Commit 71fee48f ("tick-sched: Fix idle and iowait sleeptime accounting vs
CPU hotplug") preserved total idle sleep time and iowait sleeptime across
CPU hotplug events.

Similar reasoning applies to the number of idle calls and idle sleeps to
get the proper average of sleep time per idle invocation.

Preserve those fields too.

Fixes: 71fee48f ("tick-sched: Fix idle and iowait sleeptime accounting vs CPU hotplug")
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240122233534.3094238-1-tim.c.chen@linux.intel.com
---
 kernel/time/tick-sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d2501673028d..01fb50c1b17e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1577,6 +1577,7 @@ void tick_cancel_sched_timer(int cpu)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t idle_sleeptime, iowait_sleeptime;
+	unsigned long idle_calls, idle_sleeps;
 
 # ifdef CONFIG_HIGH_RES_TIMERS
 	if (ts->sched_timer.base)
@@ -1585,9 +1586,13 @@ void tick_cancel_sched_timer(int cpu)
 
 	idle_sleeptime = ts->idle_sleeptime;
 	iowait_sleeptime = ts->iowait_sleeptime;
+	idle_calls = ts->idle_calls;
+	idle_sleeps = ts->idle_sleeps;
 	memset(ts, 0, sizeof(*ts));
 	ts->idle_sleeptime = idle_sleeptime;
 	ts->iowait_sleeptime = iowait_sleeptime;
+	ts->idle_calls = idle_calls;
+	ts->idle_sleeps = idle_sleeps;
 }
 #endif
 
-- 
cgit v1.2.3


From 4dca82d14174fe53f656a6bc32398db1bdd8f481 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 15 Jan 2024 11:07:31 +0100
Subject: uprobes: use pagesize-aligned virtual address when replacing pages

uprobes passes an unaligned page mapping address to
folio_add_new_anon_rmap(), which ends up triggering a VM_BUG_ON() we
recently extended in commit 372cbd4d5a066 ("mm: non-pmd-mappable, large
folios for folio_add_new_anon_rmap()").

Arguably, this is uprobes code doing something wrong; however, for the
time being it would have likely worked in rmap code because
__folio_set_anon() would set folio->index to the same value.

Looking at __replace_page(), we'd also pass slightly wrong values to
mmu_notifier_range_init(), page_vma_mapped_walk(), flush_cache_page(),
ptep_clear_flush() and set_pte_at_notify().  I suspect most of them are
fine, but let's just mark the introducing commit as the one needed fixing.
I don't think CC stable is warranted.

We'll add more sanity checks in rmap code separately, to make sure that we
always get properly aligned addresses.

Link: https://lkml.kernel.org/r/20240115100731.91007-1-david@redhat.com
Fixes: c517ee744b96 ("uprobes: __replace_page() should not use page_address_in_vma()")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: Jiri Olsa <jolsa@kernel.org>
Closes: https://lkml.kernel.org/r/ZaMR2EWN-HvlCfUl@krava
Tested-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/events/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 485bb0389b48..929e98c62965 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -537,7 +537,7 @@ retry:
 		}
 	}
 
-	ret = __replace_page(vma, vaddr, old_page, new_page);
+	ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
 	if (new_page)
 		put_page(new_page);
 put_old:
-- 
cgit v1.2.3


From 0958b33ef5a04ed91f61cef4760ac412080c4e08 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Fri, 26 Jan 2024 09:42:58 +0900
Subject: tracing/trigger: Fix to return error if failed to alloc snapshot

Fix register_snapshot_trigger() to return error code if it failed to
allocate a snapshot instead of 0 (success). Unless that, it will register
snapshot trigger without an error.

Link: https://lore.kernel.org/linux-trace-kernel/170622977792.270660.2789298642759362200.stgit@devnote2

Fixes: 0bbe7f719985 ("tracing: Fix the race between registering 'snapshot' event trigger and triggering 'snapshot' operation")
Cc: stable@vger.kernel.org
Cc: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_trigger.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 46439e3bcec4..b33c3861fbbb 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1470,8 +1470,10 @@ register_snapshot_trigger(char *glob,
 			  struct event_trigger_data *data,
 			  struct trace_event_file *file)
 {
-	if (tracing_alloc_snapshot_instance(file->tr) != 0)
-		return 0;
+	int ret = tracing_alloc_snapshot_instance(file->tr);
+
+	if (ret < 0)
+		return ret;
 
 	return register_trigger(glob, data, file);
 }
-- 
cgit v1.2.3


From 66bbea9ed6446b8471d365a22734dc00556c4785 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vdonnefort@google.com>
Date: Wed, 31 Jan 2024 14:09:55 +0000
Subject: ring-buffer: Clean ring_buffer_poll_wait() error return

The return type for ring_buffer_poll_wait() is __poll_t. This is behind
the scenes an unsigned where we can set event bits. In case of a
non-allocated CPU, we do return instead -EINVAL (0xffffffea). Lucky us,
this ends up setting few error bits (EPOLLERR | EPOLLHUP | EPOLLNVAL), so
user-space at least is aware something went wrong.

Nonetheless, this is an incorrect code. Replace that -EINVAL with a
proper EPOLLERR to clean that output. As this doesn't change the
behaviour, there's no need to treat this change as a bug fix.

Link: https://lore.kernel.org/linux-trace-kernel/20240131140955.3322792-1-vdonnefort@google.com

Cc: stable@vger.kernel.org
Fixes: 6721cb6002262 ("ring-buffer: Do not poll non allocated cpu buffers")
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 13aaf5e85b81..fd4bfe3ecf01 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -944,7 +944,7 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
 		full = 0;
 	} else {
 		if (!cpumask_test_cpu(cpu, buffer->cpumask))
-			return -EINVAL;
+			return EPOLLERR;
 
 		cpu_buffer = buffer->buffers[cpu];
 		work = &cpu_buffer->irq_work;
-- 
cgit v1.2.3


From 1389358bb008e7625942846e9f03554319b7fecc Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@kernel.org>
Date: Thu, 1 Feb 2024 16:13:39 +0100
Subject: tracing/timerlat: Move hrtimer_init to timerlat_fd open()

Currently, the timerlat's hrtimer is initialized at the first read of
timerlat_fd, and destroyed at close(). It works, but it causes an error
if the user program open() and close() the file without reading.

Here's an example:

 # echo NO_OSNOISE_WORKLOAD > /sys/kernel/debug/tracing/osnoise/options
 # echo timerlat > /sys/kernel/debug/tracing/current_tracer

 # cat <<EOF > ./timerlat_load.py
 # !/usr/bin/env python3

 timerlat_fd = open("/sys/kernel/tracing/osnoise/per_cpu/cpu0/timerlat_fd", 'r')
 timerlat_fd.close();
 EOF

 # ./taskset -c 0 ./timerlat_load.py
<BOOM>

 BUG: kernel NULL pointer dereference, address: 0000000000000010
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] PREEMPT SMP NOPTI
 CPU: 1 PID: 2673 Comm: python3 Not tainted 6.6.13-200.fc39.x86_64 #1
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014
 RIP: 0010:hrtimer_active+0xd/0x50
 Code: 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 48 8b 57 30 <8b> 42 10 a8 01 74 09 f3 90 8b 42 10 a8 01 75 f7 80 7f 38 00 75 1d
 RSP: 0018:ffffb031009b7e10 EFLAGS: 00010286
 RAX: 000000000002db00 RBX: ffff9118f786db08 RCX: 0000000000000000
 RDX: 0000000000000000 RSI: ffff9117a0e64400 RDI: ffff9118f786db08
 RBP: ffff9118f786db80 R08: ffff9117a0ddd420 R09: ffff9117804d4f70
 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9118f786db08
 R13: ffff91178fdd5e20 R14: ffff9117840978c0 R15: 0000000000000000
 FS:  00007f2ffbab1740(0000) GS:ffff9118f7840000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000010 CR3: 00000001b402e000 CR4: 0000000000750ee0
 PKRU: 55555554
 Call Trace:
  <TASK>
  ? __die+0x23/0x70
  ? page_fault_oops+0x171/0x4e0
  ? srso_alias_return_thunk+0x5/0x7f
  ? avc_has_extended_perms+0x237/0x520
  ? exc_page_fault+0x7f/0x180
  ? asm_exc_page_fault+0x26/0x30
  ? hrtimer_active+0xd/0x50
  hrtimer_cancel+0x15/0x40
  timerlat_fd_release+0x48/0xe0
  __fput+0xf5/0x290
  __x64_sys_close+0x3d/0x80
  do_syscall_64+0x60/0x90
  ? srso_alias_return_thunk+0x5/0x7f
  ? __x64_sys_ioctl+0x72/0xd0
  ? srso_alias_return_thunk+0x5/0x7f
  ? syscall_exit_to_user_mode+0x2b/0x40
  ? srso_alias_return_thunk+0x5/0x7f
  ? do_syscall_64+0x6c/0x90
  ? srso_alias_return_thunk+0x5/0x7f
  ? exit_to_user_mode_prepare+0x142/0x1f0
  ? srso_alias_return_thunk+0x5/0x7f
  ? syscall_exit_to_user_mode+0x2b/0x40
  ? srso_alias_return_thunk+0x5/0x7f
  ? do_syscall_64+0x6c/0x90
  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
 RIP: 0033:0x7f2ffb321594
 Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 80 3d d5 cd 0d 00 00 74 13 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3c c3 0f 1f 00 55 48 89 e5 48 83 ec 10 89 7d
 RSP: 002b:00007ffe8d8eef18 EFLAGS: 00000202 ORIG_RAX: 0000000000000003
 RAX: ffffffffffffffda RBX: 00007f2ffba4e668 RCX: 00007f2ffb321594
 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
 RBP: 00007ffe8d8eef40 R08: 0000000000000000 R09: 0000000000000000
 R10: 55c926e3167eae79 R11: 0000000000000202 R12: 0000000000000003
 R13: 00007ffe8d8ef030 R14: 0000000000000000 R15: 00007f2ffba4e668
  </TASK>
 CR2: 0000000000000010
 ---[ end trace 0000000000000000 ]---

Move hrtimer_init to timerlat_fd open() to avoid this problem.

Link: https://lore.kernel.org/linux-trace-kernel/7324dd3fc0035658c99b825204a66049389c56e3.1706798888.git.bristot@kernel.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: stable@vger.kernel.org
Fixes: e88ed227f639 ("tracing/timerlat: Add user-space interface")
Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index bd0d01d00fb9..a8e28f9b9271 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2444,6 +2444,9 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
 	tlat = this_cpu_tmr_var();
 	tlat->count = 0;
 
+	hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
+	tlat->timer.function = timerlat_irq;
+
 	migrate_enable();
 	return 0;
 };
@@ -2526,9 +2529,6 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
 		tlat->tracing_thread = false;
 		tlat->kthread = current;
 
-		hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
-		tlat->timer.function = timerlat_irq;
-
 		/* Annotate now to drift new period */
 		tlat->abs_period = hrtimer_cb_get_time(&tlat->timer);
 
-- 
cgit v1.2.3


From dad6a09f3148257ac1773cd90934d721d68ab595 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 29 Jan 2024 15:56:36 -0800
Subject: hrtimer: Report offline hrtimer enqueue

The hrtimers migration on CPU-down hotplug process has been moved
earlier, before the CPU actually goes to die. This leaves a small window
of opportunity to queue an hrtimer in a blind spot, leaving it ignored.

For example a practical case has been reported with RCU waking up a
SCHED_FIFO task right before the CPUHP_AP_IDLE_DEAD stage, queuing that
way a sched/rt timer to the local offline CPU.

Make sure such situations never go unnoticed and warn when that happens.

Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240129235646.3171983-4-boqun.feng@gmail.com
---
 include/linux/hrtimer.h | 4 +++-
 kernel/time/hrtimer.c   | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 87e3bedf8eb0..641c4567cfa7 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -157,6 +157,7 @@ enum  hrtimer_base_type {
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are
  *			 expired
+ * @online:		CPU is online from an hrtimers point of view
  * @timer_waiters:	A hrtimer_cancel() invocation waits for the timer
  *			callback to finish.
  * @expires_next:	absolute time of the next event, is required for remote
@@ -179,7 +180,8 @@ struct hrtimer_cpu_base {
 	unsigned int			hres_active		: 1,
 					in_hrtirq		: 1,
 					hang_detected		: 1,
-					softirq_activated       : 1;
+					softirq_activated       : 1,
+					online			: 1;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			nr_events;
 	unsigned short			nr_retries;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd..edb0f821dcea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1085,6 +1085,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 			   enum hrtimer_mode mode)
 {
 	debug_activate(timer, mode);
+	WARN_ON_ONCE(!base->cpu_base->online);
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
@@ -2183,6 +2184,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->softirq_next_timer = NULL;
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
+	cpu_base->online = 1;
 	hrtimer_cpu_base_init_expiry_lock(cpu_base);
 	return 0;
 }
@@ -2250,6 +2252,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
 	raw_spin_unlock(&new_base->lock);
+	old_base->online = 0;
 	raw_spin_unlock(&old_base->lock);
 
 	return 0;
-- 
cgit v1.2.3


From daa694e4137571b4ebec330f9a9b4d54aa8b8089 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 22 Jan 2024 16:50:50 +0100
Subject: getrusage: move thread_group_cputime_adjusted() outside of
 lock_task_sighand()

Patch series "getrusage: use sig->stats_lock", v2.


This patch (of 2):

thread_group_cputime() does its own locking, we can safely shift
thread_group_cputime_adjusted() which does another for_each_thread loop
outside of ->siglock protected section.

This is also preparation for the next patch which changes getrusage() to
use stats_lock instead of siglock, thread_group_cputime() takes the same
lock.  With the current implementation recursive read_seqbegin_or_lock()
is fine, thread_group_cputime() can't enter the slow mode if the caller
holds stats_lock, yet this looks more safe and better performance-wise.

Link: https://lkml.kernel.org/r/20240122155023.GA26169@redhat.com
Link: https://lkml.kernel.org/r/20240122155050.GA26205@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dylan Hatch <dylanbhatch@google.com>
Tested-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sys.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..70ad06ad852e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1785,17 +1785,19 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	u64 tgutime, tgstime, utime, stime;
-	unsigned long maxrss = 0;
+	unsigned long maxrss;
+	struct mm_struct *mm;
 	struct signal_struct *sig = p->signal;
 
-	memset((char *)r, 0, sizeof (*r));
+	memset(r, 0, sizeof(*r));
 	utime = stime = 0;
+	maxrss = 0;
 
 	if (who == RUSAGE_THREAD) {
 		task_cputime_adjusted(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = sig->maxrss;
-		goto out;
+		goto out_thread;
 	}
 
 	if (!lock_task_sighand(p, &flags))
@@ -1819,9 +1821,6 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		fallthrough;
 
 	case RUSAGE_SELF:
-		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
-		utime += tgutime;
-		stime += tgstime;
 		r->ru_nvcsw += sig->nvcsw;
 		r->ru_nivcsw += sig->nivcsw;
 		r->ru_minflt += sig->min_flt;
@@ -1839,19 +1838,24 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	}
 	unlock_task_sighand(p, &flags);
 
-out:
-	r->ru_utime = ns_to_kernel_old_timeval(utime);
-	r->ru_stime = ns_to_kernel_old_timeval(stime);
+	if (who == RUSAGE_CHILDREN)
+		goto out_children;
 
-	if (who != RUSAGE_CHILDREN) {
-		struct mm_struct *mm = get_task_mm(p);
+	thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+	utime += tgutime;
+	stime += tgstime;
 
-		if (mm) {
-			setmax_mm_hiwater_rss(&maxrss, mm);
-			mmput(mm);
-		}
+out_thread:
+	mm = get_task_mm(p);
+	if (mm) {
+		setmax_mm_hiwater_rss(&maxrss, mm);
+		mmput(mm);
 	}
+
+out_children:
 	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+	r->ru_utime = ns_to_kernel_old_timeval(utime);
+	r->ru_stime = ns_to_kernel_old_timeval(stime);
 }
 
 SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
-- 
cgit v1.2.3


From f7ec1cd5cc7ef3ad964b677ba82b8b77f1c93009 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 22 Jan 2024 16:50:53 +0100
Subject: getrusage: use sig->stats_lock rather than lock_task_sighand()

lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call
getrusage() at the same time and the process has NR_THREADS, spin_lock_irq
will spin with irqs disabled O(NR_CPUS * NR_THREADS) time.

Change getrusage() to use sig->stats_lock, it was specifically designed
for this type of use. This way it runs lockless in the likely case.

TODO:
	- Change do_task_stat() to use sig->stats_lock too, then we can
	  remove spin_lock_irq(siglock) in wait_task_zombie().

	- Turn sig->stats_lock into seqcount_rwlock_t, this way the
	  readers in the slow mode won't exclude each other. See
	  https://lore.kernel.org/all/20230913154907.GA26210@redhat.com/

	- stats_lock has to disable irqs because ->siglock can be taken
	  in irq context, it would be very nice to change __exit_signal()
	  to avoid the siglock->stats_lock dependency.

Link: https://lkml.kernel.org/r/20240122155053.GA26214@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dylan Hatch <dylanbhatch@google.com>
Tested-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sys.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 70ad06ad852e..f8e543f1e38a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,9 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 	unsigned long maxrss;
 	struct mm_struct *mm;
 	struct signal_struct *sig = p->signal;
+	unsigned int seq = 0;
 
+retry:
 	memset(r, 0, sizeof(*r));
 	utime = stime = 0;
 	maxrss = 0;
@@ -1800,8 +1802,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		goto out_thread;
 	}
 
-	if (!lock_task_sighand(p, &flags))
-		return;
+	flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 
 	switch (who) {
 	case RUSAGE_BOTH:
@@ -1829,14 +1830,23 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
 		r->ru_oublock += sig->oublock;
 		if (maxrss < sig->maxrss)
 			maxrss = sig->maxrss;
+
+		rcu_read_lock();
 		__for_each_thread(sig, t)
 			accumulate_thread_rusage(t, r);
+		rcu_read_unlock();
+
 		break;
 
 	default:
 		BUG();
 	}
-	unlock_task_sighand(p, &flags);
+
+	if (need_seqretry(&sig->stats_lock, seq)) {
+		seq = 1;
+		goto retry;
+	}
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 
 	if (who == RUSAGE_CHILDREN)
 		goto out_children;
-- 
cgit v1.2.3


From c1be35a16b2f1fe21f4f26f9de030ad6eaaf6a25 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 23 Jan 2024 16:34:00 +0100
Subject: exit: wait_task_zombie: kill the no longer necessary
 spin_lock_irq(siglock)

After the recent changes nobody use siglock to read the values protected
by stats_lock, we can kill spin_lock_irq(&current->sighand->siglock) and
update the comment.

With this patch only __exit_signal() and thread_group_start_cputime() take
stats_lock under siglock.

Link: https://lkml.kernel.org/r/20240123153359.GA21866@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Dylan Hatch <dylanbhatch@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/exit.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef..dfb963d2f862 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1127,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * and nobody can change them.
 		 *
 		 * psig->stats_lock also protects us from our sub-threads
-		 * which can reap other children at the same time. Until
-		 * we change k_getrusage()-like users to rely on this lock
-		 * we have to take ->siglock as well.
+		 * which can reap other children at the same time.
 		 *
 		 * We use thread_group_cputime_adjusted() to get times for
 		 * the thread group, which consolidates times for all threads
 		 * in the group including the group leader.
 		 */
 		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
-		spin_lock_irq(&current->sighand->siglock);
-		write_seqlock(&psig->stats_lock);
+		write_seqlock_irq(&psig->stats_lock);
 		psig->cutime += tgutime + sig->cutime;
 		psig->cstime += tgstime + sig->cstime;
 		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1160,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 			psig->cmaxrss = maxrss;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
-		write_sequnlock(&psig->stats_lock);
-		spin_unlock_irq(&current->sighand->siglock);
+		write_sequnlock_irq(&psig->stats_lock);
 	}
 
 	if (wo->wo_rusage)
-- 
cgit v1.2.3


From 8c427cc2fa73684ea140999e121b7b6c1c717632 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 24 Jan 2024 00:02:34 +0900
Subject: tracing/probes: Fix to show a parse error for bad type for $comm

Fix to show a parse error for bad type (non-string) for $comm/$COMM and
immediate-string. With this fix, error_log file shows appropriate error
message as below.

 /sys/kernel/tracing # echo 'p vfs_read $comm:u32' >> kprobe_events
sh: write error: Invalid argument
 /sys/kernel/tracing # echo 'p vfs_read \"hoge":u32' >> kprobe_events
sh: write error: Invalid argument
 /sys/kernel/tracing # cat error_log

[   30.144183] trace_kprobe: error: $comm and immediate-string only accepts string type
  Command: p vfs_read $comm:u32
                            ^
[   62.618500] trace_kprobe: error: $comm and immediate-string only accepts string type
  Command: p vfs_read \"hoge":u32
                              ^
Link: https://lore.kernel.org/all/170602215411.215583.2238016352271091852.stgit@devnote2/

Fixes: 3dd1f7f24f8c ("tracing: probeevent: Fix to make the type of $comm string")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_probe.c | 7 +++++--
 kernel/trace/trace_probe.h | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 4dc74d73fc1d..c6da5923e5b9 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1159,9 +1159,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 	if (!(ctx->flags & TPARG_FL_TEVENT) &&
 	    (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 ||
 	     strncmp(arg, "\\\"", 2) == 0)) {
-		/* The type of $comm must be "string", and not an array. */
-		if (parg->count || (t && strcmp(t, "string")))
+		/* The type of $comm must be "string", and not an array type. */
+		if (parg->count || (t && strcmp(t, "string"))) {
+			trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+					NEED_STRING_TYPE);
 			goto out;
+		}
 		parg->type = find_fetch_type("string", ctx->flags);
 	} else
 		parg->type = find_fetch_type(t, ctx->flags);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 850d9ecb6765..c1877d018269 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -515,7 +515,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(BAD_HYPHEN,		"Failed to parse single hyphen. Forgot '>'?"),	\
 	C(NO_BTF_FIELD,		"This field is not found."),	\
 	C(BAD_BTF_TID,		"Failed to get BTF type info."),\
-	C(BAD_TYPE4STR,		"This type does not fit for string."),
+	C(BAD_TYPE4STR,		"This type does not fit for string."),\
+	C(NEED_STRING_TYPE,	"$comm and immediate-string only accepts string type"),
 
 #undef C
 #define C(a, b)		TP_ERR_##a
-- 
cgit v1.2.3


From 9a571c1e275cedacd48c66a6bddd0c23f1dffdbf Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 24 Jan 2024 00:03:02 +0900
Subject: tracing/probes: Fix to set arg size and fmt after setting type from
 BTF

Since the BTF type setting updates probe_arg::type, the type size
calculation and setting print-fmt should be done after that.
Without this fix, the argument size and print-fmt can be wrong.

Link: https://lore.kernel.org/all/170602218196.215583.6417859469540955777.stgit@devnote2/

Fixes: b576e09701c7 ("tracing/probes: Support function parameters if BTF is available")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/trace_probe.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index c6da5923e5b9..34289f9c6707 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1172,18 +1172,6 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 		trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
 		goto out;
 	}
-	parg->offset = *size;
-	*size += parg->type->size * (parg->count ?: 1);
-
-	ret = -ENOMEM;
-	if (parg->count) {
-		len = strlen(parg->type->fmttype) + 6;
-		parg->fmt = kmalloc(len, GFP_KERNEL);
-		if (!parg->fmt)
-			goto out;
-		snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
-			 parg->count);
-	}
 
 	code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
 	if (!code)
@@ -1207,6 +1195,19 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 				goto fail;
 		}
 	}
+	parg->offset = *size;
+	*size += parg->type->size * (parg->count ?: 1);
+
+	if (parg->count) {
+		len = strlen(parg->type->fmttype) + 6;
+		parg->fmt = kmalloc(len, GFP_KERNEL);
+		if (!parg->fmt) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+			 parg->count);
+	}
 
 	ret = -EINVAL;
 	/* Store operation */
-- 
cgit v1.2.3


From 9efd24ec5599ed485b7c4d9aeb731141f6285167 Mon Sep 17 00:00:00 2001
From: Li zeming <zeming@nfschina.com>
Date: Tue, 19 Sep 2023 09:28:23 +0800
Subject: kprobes: Remove unnecessary initial values of variables

ri and sym is assigned first, so it does not need to initialize the
assignment.

Link: https://lore.kernel.org/all/20230919012823.7815-1-zeming@nfschina.com/

Signed-off-by: Li zeming <zeming@nfschina.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/kprobes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d5a0ee40bf66..9d9095e81792 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1993,7 +1993,7 @@ NOKPROBE_SYMBOL(__kretprobe_find_ret_addr);
 unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp,
 				      struct llist_node **cur)
 {
-	struct kretprobe_instance *ri = NULL;
+	struct kretprobe_instance *ri;
 	kprobe_opcode_t *ret;
 
 	if (WARN_ON_ONCE(!cur))
@@ -2802,7 +2802,7 @@ static int show_kprobe_addr(struct seq_file *pi, void *v)
 {
 	struct hlist_head *head;
 	struct kprobe *p, *kp;
-	const char *sym = NULL;
+	const char *sym;
 	unsigned int i = *(loff_t *) v;
 	unsigned long offset = 0;
 	char *modname, namebuf[KSYM_NAME_LEN];
-- 
cgit v1.2.3


From a8b9cf62ade1bf17261a979fc97e40c2d7842353 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 10 Jan 2024 09:13:06 +0900
Subject: ftrace: Fix DIRECT_CALLS to use SAVE_REGS by default

The commit 60c8971899f3 ("ftrace: Make DIRECT_CALLS work WITH_ARGS
and !WITH_REGS") changed DIRECT_CALLS to use SAVE_ARGS when there
are multiple ftrace_ops at the same function, but since the x86 only
support to jump to direct_call from ftrace_regs_caller, when we set
the function tracer on the same target function on x86, ftrace-direct
does not work as below (this actually works on arm64.)

At first, insmod ftrace-direct.ko to put a direct_call on
'wake_up_process()'.

 # insmod kernel/samples/ftrace/ftrace-direct.ko
 # less trace
...
          <idle>-0       [006] ..s1.   564.686958: my_direct_func: waking up rcu_preempt-17
          <idle>-0       [007] ..s1.   564.687836: my_direct_func: waking up kcompactd0-63
          <idle>-0       [006] ..s1.   564.690926: my_direct_func: waking up rcu_preempt-17
          <idle>-0       [006] ..s1.   564.696872: my_direct_func: waking up rcu_preempt-17
          <idle>-0       [007] ..s1.   565.191982: my_direct_func: waking up kcompactd0-63

Setup a function filter to the 'wake_up_process' too, and enable it.

 # cd /sys/kernel/tracing/
 # echo wake_up_process > set_ftrace_filter
 # echo function > current_tracer
 # less trace
...
          <idle>-0       [006] ..s3.   686.180972: wake_up_process <-call_timer_fn
          <idle>-0       [006] ..s3.   686.186919: wake_up_process <-call_timer_fn
          <idle>-0       [002] ..s3.   686.264049: wake_up_process <-call_timer_fn
          <idle>-0       [002] d.h6.   686.515216: wake_up_process <-kick_pool
          <idle>-0       [002] d.h6.   686.691386: wake_up_process <-kick_pool

Then, only function tracer is shown on x86.
But if you enable 'kprobe on ftrace' event (which uses SAVE_REGS flag)
on the same function, it is shown again.

 # echo 'p wake_up_process' >> dynamic_events
 # echo 1 > events/kprobes/p_wake_up_process_0/enable
 # echo > trace
 # less trace
...
          <idle>-0       [006] ..s2.  2710.345919: p_wake_up_process_0: (wake_up_process+0x4/0x20)
          <idle>-0       [006] ..s3.  2710.345923: wake_up_process <-call_timer_fn
          <idle>-0       [006] ..s1.  2710.345928: my_direct_func: waking up rcu_preempt-17
          <idle>-0       [006] ..s2.  2710.349931: p_wake_up_process_0: (wake_up_process+0x4/0x20)
          <idle>-0       [006] ..s3.  2710.349934: wake_up_process <-call_timer_fn
          <idle>-0       [006] ..s1.  2710.349937: my_direct_func: waking up rcu_preempt-17

To fix this issue, use SAVE_REGS flag for multiple ftrace_ops flag of
direct_call by default.

Link: https://lore.kernel.org/linux-trace-kernel/170484558617.178953.1590516949390270842.stgit@devnote2

Fixes: 60c8971899f3 ("ftrace: Make DIRECT_CALLS work WITH_ARGS and !WITH_REGS")
Cc: stable@vger.kernel.org
Cc: Florent Revest <revest@chromium.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com> [arm64]
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b01ae7d36021..c060d5b47910 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5325,7 +5325,17 @@ static LIST_HEAD(ftrace_direct_funcs);
 
 static int register_ftrace_function_nolock(struct ftrace_ops *ops);
 
+/*
+ * If there are multiple ftrace_ops, use SAVE_REGS by default, so that direct
+ * call will be jumped from ftrace_regs_caller. Only if the architecture does
+ * not support ftrace_regs_caller but direct_call, use SAVE_ARGS so that it
+ * jumps from ftrace_caller for multiple ftrace_ops.
+ */
+#ifndef HAVE_DYNAMIC_FTRACE_WITH_REGS
 #define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
+#else
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#endif
 
 static int check_direct_multi(struct ftrace_ops *ops)
 {
-- 
cgit v1.2.3


From 44dc5c41b5b1267d4dd037d26afc0c4d3a568acb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 9 Feb 2024 06:36:22 -0500
Subject: tracing: Fix wasted memory in saved_cmdlines logic

While looking at improving the saved_cmdlines cache I found a huge amount
of wasted memory that should be used for the cmdlines.

The tracing data saves pids during the trace. At sched switch, if a trace
occurred, it will save the comm of the task that did the trace. This is
saved in a "cache" that maps pids to comms and exposed to user space via
the /sys/kernel/tracing/saved_cmdlines file. Currently it only caches by
default 128 comms.

The structure that uses this creates an array to store the pids using
PID_MAX_DEFAULT (which is usually set to 32768). This causes the structure
to be of the size of 131104 bytes on 64 bit machines.

In hex: 131104 = 0x20020, and since the kernel allocates generic memory in
powers of two, the kernel would allocate 0x40000 or 262144 bytes to store
this structure. That leaves 131040 bytes of wasted space.

Worse, the structure points to an allocated array to store the comm names,
which is 16 bytes times the amount of names to save (currently 128), which
is 2048 bytes. Instead of allocating a separate array, make the structure
end with a variable length string and use the extra space for that.

This is similar to a recommendation that Linus had made about eventfs_inode names:

  https://lore.kernel.org/all/20240130190355.11486-5-torvalds@linux-foundation.org/

Instead of allocating a separate string array to hold the saved comms,
have the structure end with: char saved_cmdlines[]; and round up to the
next power of two over sizeof(struct saved_cmdline_buffers) + num_cmdlines * TASK_COMM_LEN
It will use this extra space for the saved_cmdline portion.

Now, instead of saving only 128 comms by default, by using this wasted
space at the end of the structure it can save over 8000 comms and even
saves space by removing the need for allocating the other array.

Link: https://lore.kernel.org/linux-trace-kernel/20240209063622.1f7b6d5f@rorschach.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Vincent Donnefort <vdonnefort@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Mete Durlu <meted@linux.ibm.com>
Fixes: 939c7a4f04fcd ("tracing: Introduce saved_cmdlines_size file")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 75 ++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2a7c6fd934e9..9ff8a439d674 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2320,7 +2320,7 @@ struct saved_cmdlines_buffer {
 	unsigned *map_cmdline_to_pid;
 	unsigned cmdline_num;
 	int cmdline_idx;
-	char *saved_cmdlines;
+	char saved_cmdlines[];
 };
 static struct saved_cmdlines_buffer *savedcmd;
 
@@ -2334,47 +2334,58 @@ static inline void set_cmdline(int idx, const char *cmdline)
 	strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
 }
 
-static int allocate_cmdlines_buffer(unsigned int val,
-				    struct saved_cmdlines_buffer *s)
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+	int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+	kfree(s->map_cmdline_to_pid);
+	free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
 {
+	struct saved_cmdlines_buffer *s;
+	struct page *page;
+	int orig_size, size;
+	int order;
+
+	/* Figure out how much is needed to hold the given number of cmdlines */
+	orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+	order = get_order(orig_size);
+	size = 1 << (order + PAGE_SHIFT);
+	page = alloc_pages(GFP_KERNEL, order);
+	if (!page)
+		return NULL;
+
+	s = page_address(page);
+	memset(s, 0, sizeof(*s));
+
+	/* Round up to actual allocation */
+	val = (size - sizeof(*s)) / TASK_COMM_LEN;
+	s->cmdline_num = val;
+
 	s->map_cmdline_to_pid = kmalloc_array(val,
 					      sizeof(*s->map_cmdline_to_pid),
 					      GFP_KERNEL);
-	if (!s->map_cmdline_to_pid)
-		return -ENOMEM;
-
-	s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
-	if (!s->saved_cmdlines) {
-		kfree(s->map_cmdline_to_pid);
-		return -ENOMEM;
+	if (!s->map_cmdline_to_pid) {
+		free_saved_cmdlines_buffer(s);
+		return NULL;
 	}
 
 	s->cmdline_idx = 0;
-	s->cmdline_num = val;
 	memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
 	       sizeof(s->map_pid_to_cmdline));
 	memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
 	       val * sizeof(*s->map_cmdline_to_pid));
 
-	return 0;
+	return s;
 }
 
 static int trace_create_savedcmd(void)
 {
-	int ret;
-
-	savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
-	if (!savedcmd)
-		return -ENOMEM;
-
-	ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
-	if (ret < 0) {
-		kfree(savedcmd);
-		savedcmd = NULL;
-		return -ENOMEM;
-	}
+	savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
 
-	return 0;
+	return savedcmd ? 0 : -ENOMEM;
 }
 
 int is_tracing_stopped(void)
@@ -6056,26 +6067,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
-	kfree(s->saved_cmdlines);
-	kfree(s->map_cmdline_to_pid);
-	kfree(s);
-}
-
 static int tracing_resize_saved_cmdlines(unsigned int val)
 {
 	struct saved_cmdlines_buffer *s, *savedcmd_temp;
 
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	s = allocate_cmdlines_buffer(val);
 	if (!s)
 		return -ENOMEM;
 
-	if (allocate_cmdlines_buffer(val, s) < 0) {
-		kfree(s);
-		return -ENOMEM;
-	}
-
 	preempt_disable();
 	arch_spin_lock(&trace_cmdline_lock);
 	savedcmd_temp = savedcmd;
-- 
cgit v1.2.3