aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/proc.rst5
-rw-r--r--fs/proc/array.c2
-rw-r--r--include/linux/huge_mm.h20
-rw-r--r--include/linux/mm_types.h13
-rw-r--r--include/uapi/linux/prctl.h10
-rw-r--r--kernel/sys.c59
-rw-r--r--mm/khugepaged.c2
7 files changed, 82 insertions, 29 deletions
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 2971551b7235..915a3e44bc12 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -291,8 +291,9 @@ It's slow but very precise.
HugetlbPages size of hugetlb memory portions
CoreDumping process's memory is currently being dumped
(killing the process may lead to a corrupted core)
- THP_enabled process is allowed to use THP (returns 0 when
- PR_SET_THP_DISABLE is set on the process
+ THP_enabled process is allowed to use THP (returns 0 when
+ PR_SET_THP_DISABLE is set on the process to disable
+ THP completely, not just partially)
Threads number of threads
SigQ number of signals queued/max. number for queue
SigPnd bitmap of pending signals for the thread
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c286dc12325e..d84b291dd1ed 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE);
if (thp_enabled)
- thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm);
+ thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 84b7eebe0d68..22b8b067b295 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -318,16 +318,26 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+/*
+ * Check whether THPs are explicitly disabled for this VMA, for example,
+ * through madvise or prctl.
+ */
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
+ /* Are THPs disabled for this VMA? */
+ if (vm_flags & VM_NOHUGEPAGE)
+ return true;
+ /* Are THPs disabled for all VMAs in the whole process? */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm))
+ return true;
/*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
+ * Are THPs disabled only for VMAs where we didn't get an explicit
+ * advise to use them?
*/
- return (vm_flags & VM_NOHUGEPAGE) ||
- mm_flags_test(MMF_DISABLE_THP, vma->vm_mm);
+ if (vm_flags & VM_HUGEPAGE)
+ return false;
+ return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm);
}
static inline bool thp_disabled_by_hw(void)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 05475b5fd516..d247da2fdb52 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1792,19 +1792,16 @@ enum {
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */
-/*
- * This one-shot flag is dropped due to necessity of changing exe once again
- * on NFS restore
- */
-//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
+#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */
-#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
-#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP)
+#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */
+#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \
+ BIT(MMF_DISABLE_THP_EXCEPT_ADVISED))
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index ed3aed264aeb..150b6deebfb1 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -177,7 +177,17 @@ struct prctl_mm_map {
#define PR_GET_TID_ADDRESS 40
+/*
+ * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0
+ * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively
+ * return "1" when no flags were specified for PR_SET_THP_DISABLE.
+ */
#define PR_SET_THP_DISABLE 41
+/*
+ * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE /
+ * VM_HUGEPAGE).
+ */
+# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
#define PR_GET_THP_DISABLE 42
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index 605f7fe9a143..a46d9b75880b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len)
return sizeof(mm->saved_auxv);
}
+static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ /* If disabled, we return "1 | flags", otherwise 0. */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+ return 1;
+ else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm))
+ return 1 | PR_THP_DISABLE_EXCEPT_ADVISED;
+ return 0;
+}
+
+static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg4 || arg5)
+ return -EINVAL;
+
+ /* Flags are only allowed when disabling. */
+ if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
+ return -EINVAL;
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+ if (thp_disable) {
+ if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ } else {
+ mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ } else {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ mmap_write_unlock(current->mm);
+ return 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = !!mm_flags_test(MMF_DISABLE_THP, me->mm);
+ error = prctl_get_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_SET_THP_DISABLE:
- if (arg3 || arg4 || arg5)
- return -EINVAL;
- if (mmap_write_lock_killable(me->mm))
- return -EINTR;
- if (arg2)
- mm_flags_set(MMF_DISABLE_THP, me->mm);
- else
- mm_flags_clear(MMF_DISABLE_THP, me->mm);
- mmap_write_unlock(me->mm);
+ error = prctl_set_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 550eb00116c5..1a416b865997 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- mm_flags_test(MMF_DISABLE_THP, mm);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
static bool hugepage_pmd_enabled(void)