author     Kairui Song <kasong@tencent.com>  2025-09-17 00:00:59 +0800
committer  Andrew Morton <akpm@linux-foundation.org>  2025-09-21 14:22:25 -0700
commit     07adc4cf1ecd316e7b6f4a142e5f5e96ce697e65 (patch)
tree       18120dfe539c450fc9387fd32023f172323bc25a /mm/swap_table.h
parent     mm, swap: remove contention workaround for swap cache (diff)
mm, swap: implement dynamic allocation of swap table
Now the swap table is cluster based, which means a free cluster can free its table, since no one should be modifying it. There could still be speculative readers, such as swap cache lookups; protect them by making the tables RCU protected. Every swap table must be filled with null entries before it is freed, so such readers will see either a NULL pointer or a null-filled table that is being lazily freed.

On allocation, allocate the table when a cluster is used by any order. This way, we can reduce the memory usage of a large swap device significantly.

The idea of dynamically releasing unused swap cluster data was initially suggested by Chris Li while proposing the cluster swap allocator, and it suits the swap table idea very well.

Link: https://lkml.kernel.org/r/20250916160100.31545-15-ryncsn@gmail.com
Co-developed-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
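To illustrate the lifecycle described above, here is a minimal sketch of an allocate-on-first-use path and an RCU-deferred free path. This is not the code added by this patch (those paths live outside mm/swap_table.h and are not part of this diff); the helper names, the kzalloc()/kfree_rcu_mightsleep() choices, and the calling context are assumptions for illustration only.

/*
 * Sketch only: allocate a cluster's table on first use and fill it with
 * null entries before publishing it, so RCU readers never observe
 * uninitialized slots. Helper names and GFP/free choices are assumed.
 */
static struct swap_table *swap_cluster_alloc_table_sketch(gfp_t gfp)
{
	struct swap_table *table;
	unsigned int i;

	table = kzalloc(sizeof(*table), gfp);
	if (!table)
		return NULL;
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		atomic_long_set(&table->entries[i], null_to_swp_tb());
	return table;
}

/*
 * Sketch only: detach a free cluster's table and defer the actual free
 * until concurrent RCU readers (e.g. lockless swap cache lookups) are
 * done. The table is expected to contain only null entries by now, so a
 * late reader sees either a NULL pointer or a null-filled table.
 */
static void swap_cluster_free_table_sketch(struct swap_cluster_info *ci)
{
	atomic_long_t *tb = rcu_dereference_protected(ci->table, true);
	struct swap_table *table;

	if (!tb)
		return;
	/* entries[] is the first member, so recover the containing struct. */
	table = container_of(tb, struct swap_table, entries[0]);
	rcu_assign_pointer(ci->table, NULL);
	/* Assumed deferred free; may sleep, so only valid in sleepable context. */
	kfree_rcu_mightsleep(table);
}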
Diffstat (limited to 'mm/swap_table.h')
-rw-r--r--  mm/swap_table.h | 37
1 file changed, 34 insertions(+), 3 deletions(-)
diff --git a/mm/swap_table.h b/mm/swap_table.h
index e1f7cc009701..52254e455304 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -2,8 +2,15 @@
#ifndef _MM_SWAP_TABLE_H
#define _MM_SWAP_TABLE_H
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
#include "swap.h"
+/* A typical flat array in each cluster as swap table */
+struct swap_table {
+ atomic_long_t entries[SWAPFILE_CLUSTER];
+};
+
/*
* A swap table entry represents the status of a swap slot on a swap
* (physical or virtual) device. The swap table in each cluster is a
@@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb)
static inline void __swap_table_set(struct swap_cluster_info *ci,
unsigned int off, unsigned long swp_tb)
{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
- atomic_long_set(&ci->table[off], swp_tb);
+ atomic_long_set(&table[off], swp_tb);
}
static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
unsigned int off, unsigned long swp_tb)
{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
/* Ordering is guaranteed by cluster lock, relax */
- return atomic_long_xchg_relaxed(&ci->table[off], swp_tb);
+ return atomic_long_xchg_relaxed(&table[off], swp_tb);
}
static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
unsigned int off)
{
+ atomic_long_t *table;
+
VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
- return atomic_long_read(&ci->table[off]);
+ table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
+
+ return atomic_long_read(&table[off]);
+}
+
+static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ atomic_long_t *table;
+ unsigned long swp_tb;
+
+ rcu_read_lock();
+ table = rcu_dereference(ci->table);
+ swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
+ rcu_read_unlock();
+
+ return swp_tb;
+}
#endif
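For a sense of how these accessors are meant to be used (not part of the patch), here is a hedged usage sketch: writers update the table only while holding ci->lock, while a lockless reader goes through swap_table_get(), which tolerates a cluster whose table has already been freed. The function names below are illustrative, and taking ci->lock directly with spin_lock() is an assumption about the caller's locking convention.

/* Sketch only: clear one slot; writers must hold the cluster lock. */
static unsigned long sketch_clear_slot(struct swap_cluster_info *ci,
				       unsigned int off)
{
	unsigned long old;

	spin_lock(&ci->lock);
	old = __swap_table_xchg(ci, off, null_to_swp_tb());
	spin_unlock(&ci->lock);
	return old;
}

/*
 * Sketch only: lockless check of one slot. If the cluster's table has
 * been freed, swap_table_get() reports a null entry instead of
 * dereferencing a stale pointer.
 */
static bool sketch_slot_is_empty(struct swap_cluster_info *ci,
				 unsigned int off)
{
	return swap_table_get(ci, off) == null_to_swp_tb();
}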