author     Alexander Graf <graf@amazon.com>  2025-06-10 08:53:27 +0000
committer  Andrew Morton <akpm@linux-foundation.org>  2025-08-02 12:01:38 -0700
commit     07d24902977e4704fab8472981e73a0ad6dfa1fd (patch)
tree       d5b37b1b5b2e62f8c4a7f6ab998b1aeb8f8eecab /kernel/kexec_file.c
parent     stackdepot: make max number of pools boot-time configurable (diff)
download   linux-07d24902977e4704fab8472981e73a0ad6dfa1fd.tar.gz
           linux-07d24902977e4704fab8472981e73a0ad6dfa1fd.zip
kexec: enable CMA based contiguous allocation
When booting a new kernel with kexec_file, the kernel picks a target location that the new kernel should live at, then allocates random pages and checks whether any of those pages happens to coincide with the target address range; if so, it uses them for that range. For every page allocated this way, it then creates a page-list entry that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - uses to copy the page to its final memory location. We cannot put the pages there beforehand, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. The copying happens on a single CPU at RAM speed and takes around 4-50 ms per 100 MiB.

All of this is inefficient and error prone. To kexec successfully, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month-long journey to root-cause failing kexecs and eventually find the memory corruption, because the new kernel was corrupted severely enough that it could not even emit output to tell us that it was corrupted.

By allocating memory for the next kernel from a memory range that is guaranteed to be free of such scribbling, we can boot the next kernel up to a point where it is at least able to detect corruption, and maybe even stop it before it becomes severe. This increases the chance of a successful kexec.

Since kexec was introduced, Linux has gained the CMA framework, which can provide physically contiguous memory allocations while keeping that memory available to movable allocations whenever it is not needed for contiguous ones. The default CMA allocator is the one used for DMA allocations.

This patch adds logic to the kexec file loader that attempts to place the target payload at a location allocated from CMA. If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag with which user space can force-disable CMA allocations.

Using CMA allocations has two advantages:

1) Speed. We save the 4-50 ms per 100 MiB, because there is no more need to copy in the hot phase.

2) Robustness. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access, because it resides in a memory region that the old kernel considers allocated, and the new kernel has a chance to reinitialize the offending component.

Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com
Signed-off-by: Alexander Graf <graf@amazon.com>
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Zhongkun He <hezhongkun.hzk@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
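The opt-out path is driven entirely by the new flag. As a minimal user-space sketch (not part of this patch), the program below loads a kernel with kexec_file_load(2) while forcing the legacy page-list path. KEXEC_FILE_NO_CMA is the flag this series adds to the uapi header; its numeric value and the kernel/initrd paths below are assumptions for illustration. Requires CAP_SYS_BOOT.

/* Minimal sketch: disable CMA placement for a kexec_file load.
 * KEXEC_FILE_NO_CMA comes from this series' uapi change; the fallback
 * value below is an assumption for older headers. Run as root. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kexec.h>

#ifndef KEXEC_FILE_NO_CMA
#define KEXEC_FILE_NO_CMA 0x00000010	/* assumed value */
#endif

int main(void)
{
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);	/* illustrative path */
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);	/* illustrative path */
	static const char cmdline[] = "console=ttyS0";

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* glibc has no wrapper for kexec_file_load(); go through syscall(2).
	 * cmdline_len must count the trailing NUL, hence sizeof(). */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    sizeof(cmdline), cmdline, KEXEC_FILE_NO_CMA) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}

With the flag set, kexec_alloc_contig() in the diff below bails out with -EPERM and the loader falls back to the old copy-at-boot behavior.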
Diffstat (limited to 'kernel/kexec_file.c')
-rw-r--r--  kernel/kexec_file.c | 51
1 file changed, 50 insertions(+), 1 deletion(-)
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 69fe76fd9233..41271eee0f99 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,6 +26,7 @@
#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
#include "kexec_internal.h"
#ifdef CONFIG_KEXEC_SIG
@@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = 0;
}
+ image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
if (IS_ERR(image->cmdline_buf)) {
@@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
i, ksegment->buf, ksegment->bufsz, ksegment->mem,
ksegment->memsz);
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
@@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+ size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+ unsigned long mem;
+ struct page *p;
+
+ /* User space disabled CMA allocations, bail out. */
+ if (kbuf->image->no_cma)
+ return -EPERM;
+
+ /* Skip CMA logic for crash kernel */
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return -EPERM;
+
+ p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+ if (!p)
+ return -ENOMEM;
+
+ pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+ mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+ if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+ /* Our region is already in use by a statically defined one. Bail out. */
+ pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+ dma_release_from_contiguous(NULL, p, nr_pages);
+ return -EBUSY;
+ }
+
+ kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+ kbuf->cma = p;
+
+ arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+ return 0;
+}
+
/**
* kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
* @kbuf: Parameters for the memory search.
@@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (ret <= 0)
return ret;
+ /*
+ * Try to find a free physically contiguous block of memory first. With that, we
+ * can avoid any copying at kexec time.
+ */
+ if (!kexec_alloc_contig(kbuf))
+ return 0;
+
if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
@@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Ensure minimum alignment needed for segments. */
kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+ kbuf->cma = NULL;
/* Walk the RAM ranges and allocate a suitable range for the buffer */
ret = arch_kexec_locate_mem_hole(kbuf);
@@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
+ kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
kbuf->image->nr_segments++;
return 0;
}
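
The diff above references three fields that are declared in the companion include/linux/kexec.h change, which falls outside this file's diffstat: kbuf->cma, image->no_cma and image->segment_cma[]. Below is a compilable stand-alone sketch of what those declarations amount to; the field names are taken from this diff, but their exact types and placement in the real structs are assumed, and the _sketch suffix marks these as illustrations rather than the actual header change.

/* Stand-ins for the <linux/kexec.h> additions this diff relies on. */
#include <stdbool.h>

#define KEXEC_SEGMENT_MAX 16	/* matches include/linux/kexec.h */

struct page;			/* opaque here */

struct kexec_buf_sketch {
	/* ...existing kexec_buf members elided... */
	struct page *cma;	/* CMA pages backing this buffer, NULL if none */
};

struct kimage_sketch {
	/* ...existing kimage members elided... */
	unsigned long nr_segments;
	struct page *segment_cma[KEXEC_SEGMENT_MAX];	/* recorded in kexec_add_buffer() */
	bool no_cma;		/* set from KEXEC_FILE_NO_CMA in the load flags */
};

A segment whose segment_cma entry is non-NULL already sits at its destination, so no copy instructions need to be created for it during the hot phase.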