From 6efbd5ddb6af0408301b4c15b413e6425c7650b2 Mon Sep 17 00:00:00 2001
From: Sourabh Jain
Date: Sat, 21 Sep 2024 16:07:45 +0530
Subject: kexec/crash: no crash update when kexec in progress

The following errors are observed when kexec is done with SMT=off on
powerpc.

[ 358.458385] Removing IBM Power 842 compression device
[ 374.795734] kexec_core: Starting new kernel
[ 374.795748] kexec: Waking offline cpu 1.
[ 374.875695] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[ 374.935833] kexec: Waking offline cpu 2.
[ 375.015664] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
snip..
[ 375.515823] kexec: Waking offline cpu 6.
[ 375.635667] crash hp: kexec_trylock() failed, elfcorehdr may be inaccurate
[ 375.695836] kexec: Waking offline cpu 7.

To avoid kexec kernel boot failure on PowerPC, all the present CPUs that
are offline are brought online during kexec. For more information, refer
to commit e8e5c2155b00 ("powerpc/kexec: Fix orphaned offline CPUs across
kexec"). Bringing the CPUs online triggers the crash hotplug handler,
crash_handle_hotplug_event(), to update the kdump image. Since the system
is on the kexec kernel boot path and the kexec lock is held, the
crash_handle_hotplug_event() function fails to acquire the same lock to
update the kdump image, resulting in the error messages mentioned above.

To fix this, return from crash_handle_hotplug_event() without printing the
error message if kexec is in progress. The same applies to
crash_check_hotplug_support(): return 0 if kexec is in progress, because
the kernel is not in a position to update the kdump image.

Link: https://lkml.kernel.org/r/20240921103745.560430-1-sourabhjain@linux.ibm.com
Signed-off-by: Sourabh Jain
Acked-by: Baoquan He
Reported-by: Sachin P Bappalige
Cc: Hari Bathini
Cc: Michael Ellerman
Signed-off-by: Andrew Morton
---
 kernel/crash_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index c1048893f4b6..078fe5bc5a74 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -505,7 +505,8 @@ int crash_check_hotplug_support(void)
 	crash_hotplug_lock();
 	/* Obtain lock while reading crash information */
 	if (!kexec_trylock()) {
-		pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
+		if (!kexec_in_progress)
+			pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
 		crash_hotplug_unlock();
 		return 0;
 	}
@@ -547,7 +548,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu,
 	crash_hotplug_lock();
 	/* Obtain lock while changing crash information */
 	if (!kexec_trylock()) {
-		pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
+		if (!kexec_in_progress)
+			pr_info("kexec_trylock() failed, kdump image may be inaccurate\n");
 		crash_hotplug_unlock();
 		return;
 	}
-- cgit v1.2.3
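The shape of the fix, mocked up as standalone C: a pthread mutex stands in
for the kexec lock, and the names mirror the kernel's, but this is an
illustrative sketch, not the kernel code itself.

/*
 * Standalone mock of the patched logic: suppress the warning only when
 * lock contention is expected (kexec in flight).
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t kexec_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool kexec_in_progress;          /* set on the kexec boot path */

static bool kexec_trylock(void)
{
	return pthread_mutex_trylock(&kexec_mutex) == 0;
}

static void kexec_unlock(void)
{
	pthread_mutex_unlock(&kexec_mutex);
}

/* Mirrors crash_handle_hotplug_event(): bail out quietly during kexec. */
static void crash_handle_hotplug_event(void)
{
	if (!kexec_trylock()) {
		/*
		 * Contention is expected while kexec wakes offline CPUs,
		 * so only warn when kexec is NOT in progress.
		 */
		if (!kexec_in_progress)
			fprintf(stderr, "kexec_trylock() failed, kdump image may be inaccurate\n");
		return;
	}
	/* ... update the kdump image (elfcorehdr) here ... */
	kexec_unlock();
}

int main(void)
{
	kexec_in_progress = true;
	pthread_mutex_lock(&kexec_mutex); /* simulate the held kexec lock */
	crash_handle_hotplug_event();     /* silent: no spurious warning */
	return 0;
}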
From 838010180241f5a9779a9ef9a621cdd2842f7354 Mon Sep 17 00:00:00 2001
From: Tio Zhang
Date: Fri, 6 Sep 2024 17:47:00 +0800
Subject: kernel/watchdog: always restore watchdog_softlockup(,hardlockup)_user_enabled after proc show
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Otherwise, when watchdog_enabled becomes 0, watchdog_softlockup(,hardlockup)_user_enabled
will change to 0 after a proc show.

Steps to reproduce:

step 1:
# cat /proc/sys/kernel/*watchdog
1
1
1

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 1
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 1
|----------------------------------|--------------------------

step 2:
# echo 0 > /proc/sys/kernel/watchdog

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

step 3:
# cat /proc/sys/kernel/*watchdog
0
0
0

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

step 4:
# echo 1 > /proc/sys/kernel/watchdog

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 0
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

step 5:
# cat /proc/sys/kernel/*watchdog
0
0
0

If we don't do "step 3" and do "step 4" right after "step 2", we get

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 1
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 1
|----------------------------------|--------------------------

and everything works correctly.

So this patch fixes "step 3"'s values to

| name                             | value
|----------------------------------|--------------------------
| watchdog_enabled                 | 0
|----------------------------------|--------------------------
| watchdog_hardlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_softlockup_user_enabled | 1
|----------------------------------|--------------------------
| watchdog_user_enabled            | 0
|----------------------------------|--------------------------

while still printing 0 as before.
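The logic of the fix, mocked up in userspace: save the user-requested
value before the proc read overwrites *param with the effective state,
and restore it afterwards. The helper below is a mock of
proc_watchdog_common(), not kernel code.

#include <stdio.h>

#define WATCHDOG_HARDLOCKUP_ENABLED 0x1

static unsigned long watchdog_enabled;           /* effective state */
static int watchdog_hardlockup_user_enabled = 1; /* user request    */

static int proc_watchdog_show(int which, int *param)
{
	int old = *param;               /* the fix: remember the request */

	*param = (watchdog_enabled & which) != 0; /* report real state */
	printf("%d\n", *param);
	*param = old;                   /* ...and restore the request */
	return 0;
}

int main(void)
{
	watchdog_enabled = 0;           /* "echo 0 > watchdog" happened */
	proc_watchdog_show(WATCHDOG_HARDLOCKUP_ENABLED,
			   &watchdog_hardlockup_user_enabled);
	/* prints 0, but the user request is still 1 */
	printf("user request preserved: %d\n",
	       watchdog_hardlockup_user_enabled);
	return 0;
}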
Link: https://lkml.kernel.org/r/20240906094700.GA30052@didi-ThinkCentre-M930t-N000
Signed-off-by: Tio Zhang
Reviewed-by: Douglas Anderson
Cc: Ben Segall
Cc: Daniel Bristot de Oliveira
Cc: Dietmar Eggemann
Cc: Ingo Molnar
Cc: John Ogness
Cc: Juri Lelli
Cc: Krister Johansen
Cc: Li Zhe
Cc: Luis Chamberlain
Cc: Mel Gorman
Cc: Peter Zijlstra
Cc: Steven Rostedt (Google)
Cc: Thomas Gleixner
Cc: Thomas Weißschuh
Cc: Valentin Schneider
Cc: Vincent Guittot
Signed-off-by: Andrew Morton
---
 kernel/watchdog.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 262691ba62b7..6c91b6b72f51 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -990,6 +990,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr
 
 	mutex_lock(&watchdog_mutex);
 
+	old = *param;
 	if (!write) {
 		/*
 		 * On read synchronize the userspace interface. This is a
@@ -997,8 +998,8 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr
 		 */
 		*param = (watchdog_enabled & which) != 0;
 		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+		*param = old;
 	} else {
-		old = READ_ONCE(*param);
 		err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 		if (!err && old != READ_ONCE(*param))
 			proc_watchdog_update();
-- cgit v1.2.3

From 5c1edea773c98707fbb23d1df168bcff52f61e4b Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Wed, 25 Sep 2024 18:43:34 +0300
Subject: resource: replace open coded resource_intersection()

Patch series "resource: A couple of cleanups".

A couple of ad-hoc cleanups, since there has been recent development of
the code in question. No functional changes intended.

This patch (of 2):

__region_intersects() uses an open coded resource_intersection(). Replace
it with the existing API, which also makes it clearer what we are
checking.

Link: https://lkml.kernel.org/r/20240925154355.1170859-1-andriy.shevchenko@linux.intel.com
Link: https://lkml.kernel.org/r/20240925154355.1170859-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko
Cc: Rasmus Villemoes
Signed-off-by: Andrew Morton
---
 kernel/resource.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 4101016e8b20..1c77ac239c7a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -537,17 +537,16 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
			       size_t size, unsigned long flags,
			       unsigned long desc)
 {
-	resource_size_t ostart, oend;
 	int type = 0; int other = 0;
 	struct resource *p, *dp;
+	struct resource res, o;
 	bool is_type, covered;
-	struct resource res;
 
 	res.start = start;
 	res.end = start + size - 1;
 
 	for (p = parent->child; p ; p = p->sibling) {
-		if (!resource_overlaps(p, &res))
+		if (!resource_intersection(p, &res, &o))
 			continue;
 		is_type = (p->flags & flags) == flags &&
 			(desc == IORES_DESC_NONE || desc == p->desc);
@@ -568,8 +567,6 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
 		 * |-- "System RAM" --||-- "CXL Window 0a" --|
 		 */
 		covered = false;
-		ostart = max(res.start, p->start);
-		oend = min(res.end, p->end);
 		for_each_resource(p, dp, false) {
 			if (!resource_overlaps(dp, &res))
 				continue;
@@ -578,17 +575,17 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
 			if (is_type) {
 				type++;
 				/*
-				 * Range from 'ostart' to 'dp->start'
+				 * Range from 'o.start' to 'dp->start'
 				 * isn't covered by matched resource.
				 */
-				if (dp->start > ostart)
+				if (dp->start > o.start)
					break;
-				if (dp->end >= oend) {
+				if (dp->end >= o.end) {
					covered = true;
					break;
				}
				/* Remove covered range */
-				ostart = max(ostart, dp->end + 1);
+				o.start = max(o.start, dp->end + 1);
			}
		}
		if (!covered)
-- cgit v1.2.3

From ba1eccc114ffc62c4495a5e15659190fa2c42308 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Wed, 25 Sep 2024 18:43:35 +0300
Subject: resource: introduce is_type_match() helper and use it

There are already a couple of places where we may replace a few lines of
code by calling a helper, which increases readability while deduplicating
the code. Introduce the is_type_match() helper and use it.

Link: https://lkml.kernel.org/r/20240925154355.1170859-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko
Cc: Rasmus Villemoes
Signed-off-by: Andrew Morton
---
 kernel/resource.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 1c77ac239c7a..55bc09f50e21 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -297,6 +297,11 @@ int release_resource(struct resource *old)
 
 EXPORT_SYMBOL(release_resource);
 
+static bool is_type_match(struct resource *p, unsigned long flags, unsigned long desc)
+{
+	return (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc);
+}
+
 /**
  * find_next_iomem_res - Finds the lowest iomem resource that covers part of
  *			 [@start..@end].
@@ -339,13 +344,9 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
		if (p->end < start)
			continue;
 
-		if ((p->flags & flags) != flags)
-			continue;
-		if ((desc != IORES_DESC_NONE) && (desc != p->desc))
-			continue;
-
		/* Found a match, break */
-		break;
+		if (is_type_match(p, flags, desc))
+			break;
	}
 
	if (p) {
@@ -540,7 +541,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
	int type = 0; int other = 0;
	struct resource *p, *dp;
	struct resource res, o;
-	bool is_type, covered;
+	bool covered;
 
	res.start = start;
	res.end = start + size - 1;
@@ -548,9 +549,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
	for (p = parent->child; p ; p = p->sibling) {
		if (!resource_intersection(p, &res, &o))
			continue;
-		is_type = (p->flags & flags) == flags &&
-			(desc == IORES_DESC_NONE || desc == p->desc);
-		if (is_type) {
+		if (is_type_match(p, flags, desc)) {
			type++;
			continue;
		}
@@ -570,9 +569,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start,
		for_each_resource(p, dp, false) {
			if (!resource_overlaps(dp, &res))
				continue;
-			is_type = (dp->flags & flags) == flags &&
-				(desc == IORES_DESC_NONE || desc == dp->desc);
-			if (is_type) {
+			if (is_type_match(dp, flags, desc)) {
				type++;
				/*
				 * Range from 'o.start' to 'dp->start'
-- cgit v1.2.3

From 4cc0473d7754d387680bdf0728eb29f0ec8834bf Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Mon, 7 Oct 2024 22:49:05 +0800
Subject: get rid of __get_task_comm()

Patch series "Improve the copy of task comm", v8.

Using {memcpy,strncpy,strcpy,kstrdup} to copy the task comm relies on the
length of the task comm. Changes in the task comm could result in a
destination string that overflows. Therefore, we should explicitly ensure
the destination string is always NUL-terminated, regardless of the task
comm. This approach will facilitate future extensions to the task comm.
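In caller terms, the series converges on the pattern sketched below — a
hedged userspace emulation of the reworked get_task_comm() semantics. The
real macro (shown in the include/linux/sched.h hunk further down) uses
strscpy_pad() and a BUILD_BUG_ON() size check; the emulation helper here
is invented for illustration.

#include <stdio.h>
#include <string.h>

#define TASK_COMM_LEN 16

struct task_struct {
	char comm[TASK_COMM_LEN];
};

/* Emulates strscpy_pad(): always NUL-terminates and zero-pads dest. */
static void strscpy_pad_emul(char *dest, const char *src, size_t size)
{
	memset(dest, 0, size);
	strncpy(dest, src, size - 1); /* leaves at least one NUL */
}

/* buf must be a real array so sizeof(buf) is its capacity. */
#define get_task_comm(buf, tsk) \
	(strscpy_pad_emul(buf, (tsk)->comm, sizeof(buf)), buf)

int main(void)
{
	struct task_struct tsk = { .comm = "kworker/0:1" };
	char buf[TASK_COMM_LEN];

	printf("%s\n", get_task_comm(buf, &tsk)); /* no lock taken */
	return 0;
}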
As suggested by Linus [0], we can identify all relevant code with the
following git grep commands:

  git grep 'memcpy.*->comm\>'
  git grep 'kstrdup.*->comm\>'
  git grep 'strncpy.*->comm\>'
  git grep 'strcpy.*->comm\>'

PATCH #2~#4: memcpy
PATCH #5~#6: kstrdup
PATCH #7:    strcpy

Please note that strncpy() is not included in this series as it is being
tracked by another effort. [1]

This patch (of 7):

We want to eliminate the use of __get_task_comm() for the following
reasons:

- The task_lock() is unnecessary

  Quoted from Linus [0]:
  : Since user space can randomly change their names anyway, using locking
  : was always wrong for readers (for writers it probably does make sense
  : to have some lock - although practically speaking nobody cares there
  : either, but at least for a writer some kind of race could have
  : long-term mixed results

Link: https://lkml.kernel.org/r/20241007144911.27693-1-laoar.shao@gmail.com
Link: https://lkml.kernel.org/r/20241007144911.27693-2-laoar.shao@gmail.com
Link: https://lore.kernel.org/all/CAHk-=wivfrF0_zvf+oj6==Sh=-npJooP8chLPEfaFV0oNYTTBA@mail.gmail.com [0]
Link: https://lore.kernel.org/all/CAHk-=whWtUC-AjmGJveAETKOMeMFSTwKwu99v7+b6AyHMmaDFA@mail.gmail.com/
Link: https://lore.kernel.org/all/CAHk-=wjAmmHUg6vho1KjzQi2=psR30+CogFd4aXrThr2gsiS4g@mail.gmail.com/ [0]
Link: https://github.com/KSPP/linux/issues/90 [1]
Signed-off-by: Yafang Shao
Suggested-by: Linus Torvalds
Cc: Alexander Viro
Cc: Christian Brauner
Cc: Jan Kara
Cc: Eric Biederman
Cc: Kees Cook
Cc: Alexei Starovoitov
Cc: Matus Jokay
Cc: Alejandro Colomar
Cc: "Serge E. Hallyn"
Cc: Catalin Marinas
Cc: Justin Stitt
Cc: Steven Rostedt (Google)
Cc: Tetsuo Handa
Cc: Andy Shevchenko
Cc: Daniel Vetter
Cc: David Airlie
Cc: Eric Paris
Cc: James Morris
Cc: Maarten Lankhorst
Cc: Matthew Wilcox
Cc: Maxime Ripard
Cc: Ondrej Mosnacek
Cc: Paul Moore
Cc: Quentin Monnet
Cc: Simon Horman
Cc: Stephen Smalley
Cc: Thomas Zimmermann
Signed-off-by: Andrew Morton
---
 fs/exec.c             | 10 ----------
 fs/proc/array.c       |  2 +-
 include/linux/sched.h | 28 ++++++++++++++++++++++------
 kernel/kthread.c      |  2 +-
 4 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 6c53920795c2..77364806b48d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1189,16 +1189,6 @@ static int unshare_sighand(struct task_struct *me)
	return 0;
 }
 
-char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
-{
-	task_lock(tsk);
-	/* Always NUL terminated and zero-padded */
-	strscpy_pad(buf, tsk->comm, buf_size);
-	task_unlock(tsk);
-	return buf;
-}
-EXPORT_SYMBOL_GPL(__get_task_comm);
-
 /*
  * These functions flushes out all traces of the currently running executable
  * so that a new one can be started
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 34a47fb0c57f..55ed3510d2bb 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -109,7 +109,7 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
	else if (p->flags & PF_KTHREAD)
		get_kthread_comm(tcomm, sizeof(tcomm), p);
	else
-		__get_task_comm(tcomm, sizeof(tcomm), p);
+		get_task_comm(tcomm, p);
 
	if (escape)
		seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bb343136ddd0..67718d5591dd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1121,9 +1121,12 @@ struct task_struct {
	/*
	 * executable name, excluding path.
* - * - normally initialized setup_new_exec() - * - access it with [gs]et_task_comm() - * - lock it with task_lock() + * - normally initialized begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. */ char comm[TASK_COMM_LEN]; @@ -1938,10 +1941,23 @@ static inline void set_task_comm(struct task_struct *tsk, const char *from) __set_task_comm(tsk, from, false); } -extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +/* + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. + * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. + * + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. + */ #define get_task_comm(buf, tsk) ({ \ - BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ - __get_task_comm(buf, sizeof(buf), tsk); \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ }) #ifdef CONFIG_SMP diff --git a/kernel/kthread.c b/kernel/kthread.c index 9bb36897b6c6..a5ac612b1609 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -101,7 +101,7 @@ void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { - __get_task_comm(buf, buf_size, tsk); + strscpy(buf, tsk->comm, buf_size); return; } -- cgit v1.2.3 From 286d7a54c8a2f124337a91235199585a35822d94 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 7 Oct 2024 22:49:06 +0800 Subject: auditsc: replace memcpy() with strscpy() Using strscpy() to read the task comm ensures that the name is always NUL-terminated, regardless of the source string. This approach also facilitates future extensions to the task comm. Link: https://lkml.kernel.org/r/20241007144911.27693-3-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Paul Moore Reviewed-by: Justin Stitt Cc: Eric Paris Cc: Alejandro Colomar Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andy Shevchenko Cc: Catalin Marinas Cc: Christian Brauner Cc: Daniel Vetter Cc: David Airlie Cc: Eric Biederman Cc: James Morris Cc: Jan Kara Cc: Kees Cook Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Wilcox Cc: Matus Jokay Cc: Maxime Ripard Cc: Ondrej Mosnacek Cc: Quentin Monnet Cc: "Serge E. 
Hallyn" Cc: Simon Horman Cc: Stephen Smalley Cc: Steven Rostedt (Google) Cc: Tetsuo Handa Cc: Thomas Zimmermann Signed-off-by: Andrew Morton --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cd57053b4a69..7adc67d5aafb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2730,7 +2730,7 @@ void __audit_ptrace(struct task_struct *t) context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); security_task_getsecid_obj(t, &context->target_sid); - memcpy(context->target_comm, t->comm, TASK_COMM_LEN); + strscpy(context->target_comm, t->comm); } /** @@ -2757,7 +2757,7 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); security_task_getsecid_obj(t, &ctx->target_sid); - memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); + strscpy(ctx->target_comm, t->comm); return 0; } @@ -2778,7 +2778,7 @@ int audit_signal_info_syscall(struct task_struct *t) axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]); - memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); + strscpy(axp->target_comm[axp->pid_count], t->comm); axp->pid_count++; return 0; -- cgit v1.2.3 From 834b251b1db6b88b9364955196e5e32746e5ccc7 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 9 Oct 2024 15:57:51 +0300 Subject: resource: correct reallocate_resource() documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reallocate_resource() documentation claims constraint is about "the size and alignment" but the size is provided in another parameter. Instead of size, constraint has the allowed memory range (min, max) so change the wording to reflect that. Link: https://lkml.kernel.org/r/20241009125751.8090-1-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Andrew Morton --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 55bc09f50e21..2d4208b2f62f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(find_resource_space); * @root: root resource descriptor * @old: resource descriptor desired by caller * @newsize: new size of the resource descriptor - * @constraint: the size and alignment constraints to be met. + * @constraint: the memory range and alignment constraints to be met. */ static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, -- cgit v1.2.3 From f2fa0fd4e7db8326a77618962714924b64f5f889 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 12 Oct 2024 19:52:53 +0200 Subject: reboot: move reboot_notifier_list to kernel/reboot.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All the functions related to the reboot notifier list are in kernel/reboot.c. Move the list itself, too. As there are no direct users anymore, make the declaration static. 
Link: https://lkml.kernel.org/r/20241012-reboot_notifier_list-v1-1-6093bb9455ce@weissschuh.net Signed-off-by: Thomas Weißschuh Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- include/linux/notifier.h | 2 -- kernel/notifier.c | 8 -------- kernel/reboot.c | 7 +++++++ 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 45702bdcbceb..b42e64734968 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -237,7 +237,5 @@ static inline int notifier_to_errno(int ret) #define KBD_KEYSYM 0x0004 /* Keyboard keysym */ #define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ -extern struct blocking_notifier_head reboot_notifier_list; - #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/kernel/notifier.c b/kernel/notifier.c index b3ce28f39eb6..2f9fe7c30287 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -5,18 +5,10 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include -/* - * Notifier list for kernel code which wants to be called - * at shutdown. This is used to stop any idling DMA operations - * and the like. - */ -BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); - /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. diff --git a/kernel/reboot.c b/kernel/reboot.c index f05dbde2c93f..ffdf86b717ab 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -72,6 +72,13 @@ static bool poweroff_fallback_to_halt; */ void __weak (*pm_power_off)(void); +/* + * Notifier list for kernel code which wants to be called + * at shutdown. This is used to stop any idling DMA operations + * and the like. + */ +static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); + /** * emergency_restart - reboot the system * -- cgit v1.2.3 From ad8f63f935b6785c87681d35b9408f5ecd5db967 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 24 Sep 2024 11:07:13 +0200 Subject: perf/hw_breakpoint: use ERR_PTR_PCPU(), IS_ERR_PCPU() and PTR_ERR_PCPU() macros Use ERR_PTR_PCPU() when returning error pointer in the percpu address space. Use IS_ERR_PCPU() and PTR_ERR_PCPU() when returning the error pointer from the percpu address space. These macros add intermediate cast to unsigned long when switching named address spaces. The patch will avoid future build errors due to pointer address space mismatch with enabled strict percpu address space checks. 
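The caller-side shape after the conversion, sketched with a hypothetical
percpu allocator; ERR_PTR_PCPU(), IS_ERR_PCPU() and PTR_ERR_PCPU() are
the <linux/err.h> macros this patch switches to:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/percpu.h>

/* Hypothetical allocator returning errors in the percpu address space. */
static int __percpu *demo_alloc_pcpu(void)
{
	int __percpu *p = alloc_percpu(int);

	if (!p)
		return ERR_PTR_PCPU(-ENOMEM); /* casts via unsigned long */
	return p;
}

static int demo_use(void)
{
	int __percpu *p = demo_alloc_pcpu();

	/* No "(void __force *)" casts needed on the caller side anymore. */
	if (IS_ERR_PCPU(p))
		return PTR_ERR_PCPU(p);

	free_percpu(p);
	return 0;
}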
Link: https://lkml.kernel.org/r/20240924090813.1353586-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak
Cc: Mark Rutland
Cc: Alexander Shishkin
Cc: Jiri Olsa
Cc: Ian Rogers
Cc: Adrian Hunter
Cc: "Liang, Kan"
Signed-off-by: Andrew Morton
---
 kernel/events/hw_breakpoint.c           | 4 ++--
 samples/hw_breakpoint/data_breakpoint.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 6c2cb4e4f48d..bc4a61029b6d 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -849,7 +849,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 
	cpu_events = alloc_percpu(typeof(*cpu_events));
	if (!cpu_events)
-		return (void __percpu __force *)ERR_PTR(-ENOMEM);
+		return ERR_PTR_PCPU(-ENOMEM);
 
	cpus_read_lock();
	for_each_online_cpu(cpu) {
@@ -868,7 +868,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
	return cpu_events;
 
	unregister_wide_hw_breakpoint(cpu_events);
-	return (void __percpu __force *)ERR_PTR(err);
+	return ERR_PTR_PCPU(err);
 }
 EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
 
diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c
index a2c831e89ce0..fbb03b66dcbd 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -52,8 +52,8 @@ static int __init hw_break_module_init(void)
	attr.bp_type = HW_BREAKPOINT_W;
 
	sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, NULL);
-	if (IS_ERR((void __force *)sample_hbp)) {
-		ret = PTR_ERR((void __force *)sample_hbp);
+	if (IS_ERR_PCPU(sample_hbp)) {
+		ret = PTR_ERR_PCPU(sample_hbp);
		goto fail;
	}
 
-- cgit v1.2.3

From 92a8b224b833e82d286d2100432adbac8cf8a2a1 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sun, 20 Oct 2024 12:01:51 +0800
Subject: lib/min_heap: introduce non-inline versions of min heap API functions

Patch series "Enhance min heap API with non-inline functions and optimizations", v2.

Add non-inline versions of the min heap API functions in lib/min_heap.c
and update all users outside of kernel/events/core.c to use these
non-inline versions. To mitigate the performance impact of indirect
function calls caused by the non-inline versions of the swap and compare
functions, a builtin swap has been introduced that swaps elements based on
their size. Additionally, the series micro-optimizes the efficiency of
the min heap by pre-scaling the counter, following the same approach as in
lib/sort.c. Documentation for the min heap API has also been added to the
core-api section.

This patch (of 10):

All current min heap API functions are marked with '__always_inline'.
However, as the number of users increases, inlining these functions
everywhere leads to an increase in kernel size. In performance-critical
paths, such as when perf events are enabled and min heap functions are
called on every context switch, it is important to retain the inline
versions for optimal performance. To balance this, the original inline
functions are kept, and additional non-inline versions of the functions
have been added in lib/min_heap.c.
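A sketch of what a consumer of the new out-of-line entry points looks
like, assuming CONFIG_MIN_HEAP=y and at least one input element; the int
heap type, the callbacks, and demo_min() are invented for illustration:

#include <linux/min_heap.h>

DEFINE_MIN_HEAP(int, demo_int_heap);

static bool demo_less(const void *l, const void *r, void *args)
{
	return *(const int *)l < *(const int *)r;
}

static void demo_swp(void *l, void *r, void *args)
{
	int tmp = *(int *)l;

	*(int *)l = *(int *)r;
	*(int *)r = tmp;
}

static const struct min_heap_callbacks demo_cbs = {
	.less = demo_less,
	.swp = demo_swp,
};

/* Return the smallest of vals[0..nr-1] via the out-of-line heap calls. */
static int demo_min(int *vals, int nr, int *storage, int storage_size)
{
	struct demo_int_heap heap;
	int i;

	min_heap_init(&heap, storage, storage_size); /* non-inline version */
	for (i = 0; i < nr; i++)
		min_heap_push(&heap, &vals[i], &demo_cbs, NULL);
	return *min_heap_peek(&heap); /* caller guarantees nr >= 1 */
}

Hot paths such as kernel/events/core.c keep the *_inline() variants; cold
callers like the sketch above trade a little indirect-call overhead for
smaller code.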
Link: https://lkml.kernel.org/r/20241020040200.939973-1-visitorckw@gmail.com Link: https://lore.kernel.org/20240522161048.8d8bbc7b153b4ecd92c50666@linux-foundation.org Link: https://lkml.kernel.org/r/20241020040200.939973-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Andrew Morton Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Ching-Chun (Jim) Huang Cc: Coly Li Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Jonathan Corbet Cc: Kent Overstreet Cc: Kuan-Wei Chiu Cc: "Liang, Kan" Cc: Mark Rutland Cc: Matthew Sakai Cc: Matthew Wilcox (Oracle) Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- drivers/md/bcache/Kconfig | 1 + drivers/md/dm-vdo/Kconfig | 1 + fs/bcachefs/Kconfig | 1 + include/linux/min_heap.h | 129 ++++++++++++++++++++++++++++++---------------- kernel/events/core.c | 6 +-- lib/Kconfig | 3 ++ lib/Kconfig.debug | 1 + lib/Makefile | 1 + lib/min_heap.c | 70 +++++++++++++++++++++++++ 9 files changed, 167 insertions(+), 46 deletions(-) create mode 100644 lib/min_heap.c (limited to 'kernel') diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index b2d10063d35f..d4697e79d5a3 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,6 +5,7 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES + select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. diff --git a/drivers/md/dm-vdo/Kconfig b/drivers/md/dm-vdo/Kconfig index 111ecd2c2a24..2400b2bc4bc7 100644 --- a/drivers/md/dm-vdo/Kconfig +++ b/drivers/md/dm-vdo/Kconfig @@ -7,6 +7,7 @@ config DM_VDO select DM_BUFIO select LZ4_COMPRESS select LZ4_DECOMPRESS + select MIN_HEAP help This device mapper target presents a block device with deduplication, compression and thin-provisioning. diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5bac803ea367..ab6c95b895b3 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -24,6 +24,7 @@ config BCACHEFS_FS select XXHASH select SRCU select SYMBOLIC_ERRNAME + select MIN_HEAP help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dcf15e..0abb21173979 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -40,7 +40,7 @@ struct min_heap_callbacks { /* Initialize a min-heap. */ static __always_inline -void __min_heap_init(min_heap_char *heap, void *data, int size) +void __min_heap_init_inline(min_heap_char *heap, void *data, int size) { heap->nr = 0; heap->size = size; @@ -50,33 +50,33 @@ void __min_heap_init(min_heap_char *heap, void *data, int size) heap->data = heap->preallocated; } -#define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_init_inline(_heap, _data, _size) \ + __min_heap_init_inline((min_heap_char *)_heap, _data, _size) /* Get the minimum element from the heap. */ static __always_inline -void *__min_heap_peek(struct min_heap_char *heap) +void *__min_heap_peek_inline(struct min_heap_char *heap) { return heap->nr ? heap->data : NULL; } -#define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_peek_inline(_heap) \ + (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) /* Check if the heap is full. 
*/ static __always_inline -bool __min_heap_full(min_heap_char *heap) +bool __min_heap_full_inline(min_heap_char *heap) { return heap->nr == heap->size; } -#define min_heap_full(_heap) \ - __min_heap_full((min_heap_char *)_heap) +#define min_heap_full_inline(_heap) \ + __min_heap_full_inline((min_heap_char *)_heap) /* Sift the element at pos down the heap. */ static __always_inline -void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *left, *right; void *data = heap->data; @@ -108,13 +108,14 @@ void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, } } -#define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ + __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ + _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline -void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; size_t parent; @@ -128,27 +129,28 @@ void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, } } -#define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ + __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline -void __min_heapify_all(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { int i; for (i = heap->nr / 2 - 1; i >= 0; i--) - __min_heap_sift_down(heap, i, elem_size, func, args); + __min_heap_sift_down_inline(heap, i, elem_size, func, args); } -#define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heapify_all_inline(_heap, _func, _args) \ + __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_pop(min_heap_char *heap, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -158,13 +160,13 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, /* Place last element at the root (position 0) and then sift down. 
*/ heap->nr--; memcpy(data, data + (heap->nr * elem_size), elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); return true; } -#define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_inline(_heap, _func, _args) \ + __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -172,22 +174,21 @@ bool __min_heap_pop(min_heap_char *heap, size_t elem_size, * efficient than a pop followed by a push that does 2. */ static __always_inline -void __min_heap_pop_push(min_heap_char *heap, - const void *element, size_t elem_size, - const struct min_heap_callbacks *func, - void *args) +void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { memcpy(heap->data, element, elem_size); - __min_heap_sift_down(heap, 0, elem_size, func, args); + __min_heap_sift_down_inline(heap, 0, elem_size, func, args); } -#define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push_inline(_heap, _element, _func, _args) \ + __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline -bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; int pos; @@ -201,18 +202,19 @@ bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, heap->nr++; /* Sift child at pos up. */ - __min_heap_sift_up(heap, elem_size, pos, func, args); + __min_heap_sift_up_inline(heap, elem_size, pos, func, args); return true; } -#define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) +#define min_heap_push_inline(_heap, _element, _func, _args) \ + __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) /* Remove ith element from the heap, O(log2(nr)). 
*/ static __always_inline -bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, - const struct min_heap_callbacks *func, void *args) +bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) { void *data = heap->data; @@ -224,12 +226,53 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, if (idx == heap->nr) return true; func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args); - __min_heap_sift_up(heap, elem_size, idx, func, args); - __min_heap_sift_down(heap, idx, elem_size, func, args); + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); + __min_heap_sift_down_inline(heap, idx, elem_size, func, args); return true; } +#define min_heap_del_inline(_heap, _idx, _func, _args) \ + __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ + _func, _args) + +void __min_heap_init(min_heap_char *heap, void *data, int size); +void *__min_heap_peek(struct min_heap_char *heap); +bool __min_heap_full(min_heap_char *heap); +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args); +bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args); + +#define min_heap_init(_heap, _data, _size) \ + __min_heap_init((min_heap_char *)_heap, _data, _size) +#define min_heap_peek(_heap) \ + (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) +#define min_heap_full(_heap) \ + __min_heap_full((min_heap_char *)_heap) +#define min_heap_sift_down(_heap, _pos, _func, _args) \ + __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) +#define min_heap_sift_up(_heap, _idx, _func, _args) \ + __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) +#define min_heapify_all(_heap, _func, _args) \ + __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop(_heap, _func, _args) \ + __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) +#define min_heap_pop_push(_heap, _element, _func, _args) \ + __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ + _func, _args) +#define min_heap_push(_heap, _element, _func, _args) \ + __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) diff --git a/kernel/events/core.c b/kernel/events/core.c index df27d08a7232..1b3c1198b2af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3870,7 +3870,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } 
- min_heapify_all(&event_heap, &perf_min_heap, NULL); + min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); @@ -3879,9 +3879,9 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, *evt = perf_event_groups_next(*evt, pmu); if (*evt) - min_heap_sift_down(&event_heap, 0, &perf_min_heap, NULL); + min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else - min_heap_pop(&event_heap, &perf_min_heap, NULL); + min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; diff --git a/lib/Kconfig b/lib/Kconfig index cf303bd91dda..f5a2781669ea 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -780,3 +780,6 @@ config FIRMWARE_TABLE config UNION_FIND bool + +config MIN_HEAP + bool diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index eda319e9d569..2549b64b2280 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2279,6 +2279,7 @@ config TEST_LIST_SORT config TEST_MIN_HEAP tristate "Min heap test" depends on DEBUG_KERNEL || m + select MIN_HEAP help Enable this to turn on min heap function tests. This test is executed only once during system boot (so affects only boot time), diff --git a/lib/Makefile b/lib/Makefile index feebed74fc7a..1eb89962daef 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,6 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ lib-$(CONFIG_UNION_FIND) += union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o +lib-$(CONFIG_MIN_HEAP) += min_heap.o lib-y += kobject.o klist.o obj-y += lockref.o diff --git a/lib/min_heap.c b/lib/min_heap.c new file mode 100644 index 000000000000..4485372ff3b1 --- /dev/null +++ b/lib/min_heap.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +void __min_heap_init(min_heap_char *heap, void *data, int size) +{ + __min_heap_init_inline(heap, data, size); +} +EXPORT_SYMBOL(__min_heap_init); + +void *__min_heap_peek(struct min_heap_char *heap) +{ + return __min_heap_peek_inline(heap); +} +EXPORT_SYMBOL(__min_heap_peek); + +bool __min_heap_full(min_heap_char *heap) +{ + return __min_heap_full_inline(heap); +} +EXPORT_SYMBOL(__min_heap_full); + +void __min_heap_sift_down(min_heap_char *heap, int pos, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_sift_down_inline(heap, pos, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_sift_down); + +void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_sift_up_inline(heap, elem_size, idx, func, args); +} +EXPORT_SYMBOL(__min_heap_sift_up); + +void __min_heapify_all(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heapify_all_inline(heap, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heapify_all); + +bool __min_heap_pop(min_heap_char *heap, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_pop_inline(heap, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_pop); + +void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + __min_heap_pop_push_inline(heap, element, elem_size, func, args); +} +EXPORT_SYMBOL(__min_heap_pop_push); + +bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size, + const struct min_heap_callbacks *func, void *args) +{ + return __min_heap_push_inline(heap, element, elem_size, func, args); +} 
+EXPORT_SYMBOL(__min_heap_push);
+
+bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
+		    const struct min_heap_callbacks *func, void *args)
+{
+	return __min_heap_del_inline(heap, elem_size, idx, func, args);
+}
+EXPORT_SYMBOL(__min_heap_del);
-- cgit v1.2.3

From 083ad2871a8bbaf404b97eaa5e713e427e229f6b Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu
Date: Sun, 20 Oct 2024 12:01:55 +0800
Subject: perf/core: update min_heap_callbacks to use default builtin swap

After introducing the default builtin swap implementation, update the
min_heap_callbacks to replace the swp function pointer with NULL. This
change allows the min heap to directly utilize the builtin swap,
simplifying the code.

Link: https://lkml.kernel.org/r/20241020040200.939973-6-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu
Cc: Adrian Hunter
Cc: Arnaldo Carvalho de Melo
Cc: Ching-Chun (Jim) Huang
Cc: Coly Li
Cc: Ian Rogers
Cc: Ingo Molnar
Cc: Jiri Olsa
Cc: Jonathan Corbet
Cc: Kent Overstreet
Cc: "Liang, Kan"
Cc: Mark Rutland
Cc: Matthew Sakai
Cc: Matthew Wilcox (Oracle)
Cc: Namhyung Kim
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
---
 kernel/events/core.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b3c1198b2af..c2b4d7ee6296 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3778,18 +3778,11 @@ static bool perf_less_group_idx(const void *l, const void *r, void __always_unus
	return le->group_index < re->group_index;
 }
 
-static void swap_ptr(void *l, void *r, void __always_unused *args)
-{
-	void **lp = l, **rp = r;
-
-	swap(*lp, *rp);
-}
-
 DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);
 
 static const struct min_heap_callbacks perf_min_heap = {
	.less = perf_less_group_idx,
-	.swp = swap_ptr,
+	.swp = NULL,
 };
 
 static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
-- cgit v1.2.3

From d7ce9c73da54a096311edbf4688b78b179dd79bc Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Tue, 29 Oct 2024 20:27:35 +0800
Subject: resource: avoid unnecessary resource tree walking in __region_intersects()

Currently, if __region_intersects() finds any overlapped but unmatched
resource, it walks the descendant resource tree to check for overlapped
and matched descendant resources using for_each_resource(). However, in
the current kernel, for_each_resource() iterates not only the descendant
tree, but also subsequent sibling trees in certain scenarios. While this
doesn't introduce bugs, it makes the code hard to understand and
potentially inefficient. So, the patch revises next_resource() and
for_each_resource() and makes for_each_resource() traverse only the
subtree under the specified subtree root. Testing shows that this avoids
unnecessary resource tree walking in __region_intersects().

For the example resource tree below,

	X
	|
	A----D----E
	|
	B--C

if 'A' is the overlapped but unmatched resource, the original kernel
iterates 'B', 'C', 'D', 'E' when it walks the descendant tree, while the
patched kernel iterates only 'B' and 'C'. Thanks to David Hildenbrand for
providing a good resource tree example.
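The traversal change is easy to model outside the kernel. A standalone
sketch of the fixed next_resource() walking only A's subtree from the
tree above — the struct here is a toy model, not the kernel's struct
resource:

#include <stdbool.h>
#include <stdio.h>

struct resource {
	const char *name;
	struct resource *parent, *sibling, *child;
};

static struct resource *next_resource(struct resource *p, bool skip_children,
				      struct resource *subtree_root)
{
	if (!skip_children && p->child)
		return p->child;
	while (!p->sibling && p->parent) {
		p = p->parent;
		if (p == subtree_root) /* the fix: stay inside the subtree */
			return NULL;
	}
	return p->sibling;
}

int main(void)
{
	/* X's children: A, D, E; A's children: B, C (the tree above). */
	struct resource X = { "X" }, A = { "A" }, B = { "B" }, C = { "C" },
			D = { "D" }, E = { "E" };
	struct resource *p;

	X.child = &A;
	A.parent = &X; A.sibling = &D; A.child = &B;
	D.parent = &X; D.sibling = &E;
	E.parent = &X;
	B.parent = &A; B.sibling = &C;
	C.parent = &A;

	/* Walk only A's subtree: prints "B C", never D or E. */
	for (p = A.child; p; p = next_resource(p, false, &A))
		printf("%s ", p->name);
	printf("\n");
	return 0;
}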
Link: https://lkml.kernel.org/r/20241029122735.79164-1-ying.huang@intel.com
Signed-off-by: "Huang, Ying"
Acked-by: Dan Williams
Cc: David Hildenbrand
Cc: Davidlohr Bueso
Cc: Jonathan Cameron
Cc: Alistair Popple
Cc: Andy Shevchenko
Cc: Bjorn Helgaas
Cc: Baoquan He
Cc: Dave Jiang
Cc: Alison Schofield
Signed-off-by: Andrew Morton
---
 kernel/resource.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 2d4208b2f62f..59c6e608f1d1 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -50,17 +50,35 @@ EXPORT_SYMBOL(iomem_resource);
 
 static DEFINE_RWLOCK(resource_lock);
 
-static struct resource *next_resource(struct resource *p, bool skip_children)
+/*
+ * Return the next node of @p in pre-order tree traversal. If
+ * @skip_children is true, skip the descendant nodes of @p in
+ * traversal. If @p is a descendant of @subtree_root, only traverse
+ * the subtree under @subtree_root.
+ */
+static struct resource *next_resource(struct resource *p, bool skip_children,
+				      struct resource *subtree_root)
 {
	if (!skip_children && p->child)
		return p->child;
-	while (!p->sibling && p->parent)
+	while (!p->sibling && p->parent) {
		p = p->parent;
+		if (p == subtree_root)
+			return NULL;
+	}
	return p->sibling;
 }
 
+/*
+ * Traverse the resource subtree under @_root in pre-order, excluding
+ * @_root itself.
+ *
+ * NOTE: '__p' is introduced to avoid shadowing '_p' outside of loop.
+ * And it is referenced to avoid unused variable warning.
+ */
 #define for_each_resource(_root, _p, _skip_children)		\
-	for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
+	for (typeof(_root) __root = (_root), __p = _p = __root->child; \
+	     __p && _p; _p = next_resource(_p, _skip_children, __root))
 
 #ifdef CONFIG_PROC_FS
 
@@ -88,7 +106,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 
	(*pos)++;
 
-	return (void *)next_resource(p, false);
+	return (void *)next_resource(p, false, NULL);
 }
 
 static void r_stop(struct seq_file *m, void *v)
-- cgit v1.2.3

From 03ecb24db20e78c478b9b7c0ec767bfdc053ecd4 Mon Sep 17 00:00:00 2001
From: Lance Yang
Date: Sun, 27 Oct 2024 20:07:46 +0800
Subject: hung_task: add detect count for hung tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "add detect count for hung tasks", v2.

This patchset adds a counter, hung_task_detect_count, to track the number
of times hung tasks are detected.

IMHO, hung tasks are a critical metric. Currently, we detect them by
periodically parsing dmesg. However, this method isn't as user-friendly
as using a counter. Sometimes, a short-lived issue with a NIC or hard
drive can quickly decrease hung_task_warnings to zero. Without warnings,
we must directly access the node to ensure that there are no more hung
tasks and that the system has recovered. After all, load average alone
cannot provide a clear picture.

Once this counter is in place, in a high-density deployment pattern, we
plan to set hung_task_timeout_secs to a lower number to improve stability,
even though this might result in false positives. We can then set a
time-based threshold: if hung tasks last beyond this duration, we will
automatically migrate containers to other nodes. Based on past
experience, this approach could help avoid many production disruptions.
Moreover, just like other important events such as OOM that already have
counters, having a dedicated counter for hung tasks makes sense ;)

This patch (of 2):

This commit adds the counter, hung_task_detect_count, described above.

[ioworker0@gmail.com: proc_doulongvec_minmax instead of proc_dointvec]
Link: https://lkml.kernel.org/r/20241101114833.8377-1-ioworker0@gmail.com
Link: https://lkml.kernel.org/r/20241027120747.42833-1-ioworker0@gmail.com
Link: https://lkml.kernel.org/r/20241027120747.42833-2-ioworker0@gmail.com
Signed-off-by: Mingzhe Yang
Signed-off-by: Lance Yang
Cc: Bang Li
Cc: Baolin Wang
Cc: David Hildenbrand
Cc: Huang Cun
Cc: Joel Granados
Cc: John Siddle
Cc: Kent Overstreet
Cc: Ryan Roberts
Cc: Thomas Weißschuh
Cc: Yongliang Gao
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 kernel/hung_task.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 959d99583d1c..c18717189f32 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -30,6 +30,11 @@
  */
 static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
 
+/*
+ * Total number of tasks detected as hung since boot:
+ */
+static unsigned long __read_mostly sysctl_hung_task_detect_count;
+
 /*
  * Limit number of tasks checked in a batch.
  *
@@ -115,6 +120,12 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
		return;
 
+	/*
+	 * This counter tracks the total number of tasks detected as hung
+	 * since boot.
+ */ + sysctl_hung_task_detect_count++; + trace_sched_process_hang(t); if (sysctl_hung_task_panic) { @@ -314,6 +325,13 @@ static struct ctl_table hung_task_sysctls[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, }, + { + .procname = "hung_task_detect_count", + .data = &sysctl_hung_task_detect_count, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, }; static void __init hung_task_sysctl_init(void) -- cgit v1.2.3 From 45dac1959bbdc498a2abb89919221455225789dc Mon Sep 17 00:00:00 2001 From: zhangguopeng Date: Tue, 5 Nov 2024 17:49:41 +0800 Subject: kernel/reboot: replace sprintf() with sysfs_emit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Documentation/filesystems/sysfs.rst suggested, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. No functional change intended. Link: https://lkml.kernel.org/r/20241105094941.33739-1-zhangguopeng@kylinos.cn Signed-off-by: zhangguopeng Cc: Daniel Lezcano Cc: Fabio Estevam Cc: Joel Granados Cc: Thomas Weißschuh Signed-off-by: Andrew Morton --- kernel/reboot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index ffdf86b717ab..a701000bab34 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1137,7 +1137,7 @@ static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char val = REBOOT_UNDEFINED_STR; } - return sprintf(buf, "%s\n", val); + return sysfs_emit(buf, "%s\n", val); } static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1167,7 +1167,7 @@ static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode); #ifdef CONFIG_X86 static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", reboot_force); + return sysfs_emit(buf, "%d\n", reboot_force); } static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1214,7 +1214,7 @@ static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char val = REBOOT_UNDEFINED_STR; } - return sprintf(buf, "%s\n", val); + return sysfs_emit(buf, "%s\n", val); } static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1247,7 +1247,7 @@ static struct kobj_attribute reboot_type_attr = __ATTR_RW(type); #ifdef CONFIG_SMP static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", reboot_cpu); + return sysfs_emit(buf, "%d\n", reboot_cpu); } static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) -- cgit v1.2.3
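For reference, the same sysfs_emit() pattern applied to a new attribute —
a hedged sketch with a hypothetical demo_value attribute; registration of
the attribute (e.g. via sysfs_create_file()) is omitted:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static int demo_value;

static ssize_t demo_value_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	/*
	 * sysfs_emit() verifies that buf is a page-aligned, PAGE_SIZE
	 * sysfs buffer and clamps the output, which sprintf() never did.
	 */
	return sysfs_emit(buf, "%d\n", demo_value);
}

/* __ATTR_RO() wires the attribute to demo_value_show() by name. */
static struct kobj_attribute demo_value_attr = __ATTR_RO(demo_value);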