diff options
| author | Kairui Song <kasong@tencent.com> | 2025-09-17 00:00:59 +0800 |
|---|---|---|
| committer | Andrew Morton <akpm@linux-foundation.org> | 2025-09-21 14:22:25 -0700 |
| commit | 07adc4cf1ecd316e7b6f4a142e5f5e96ce697e65 (patch) | |
| tree | 18120dfe539c450fc9387fd32023f172323bc25a /mm/swap_table.h | |
| parent | mm, swap: remove contention workaround for swap cache (diff) | |
| download | linux-07adc4cf1ecd316e7b6f4a142e5f5e96ce697e65.tar.gz linux-07adc4cf1ecd316e7b6f4a142e5f5e96ce697e65.zip | |
mm, swap: implement dynamic allocation of swap table
Now swap table is cluster based, which means free clusters can free its
table since no one should modify it.
There could be speculative readers, like swap cache look up, protect them
by making them RCU protected. All swap table should be filled with null
entries before free, so such readers will either see a NULL pointer or a
null filled table being lazy freed.
On allocation, allocate the table when a cluster is used by any order.
This way, we can reduce the memory usage of large swap device
significantly.
This idea to dynamically release unused swap cluster data was initially
suggested by Chris Li while proposing the cluster swap allocator and it
suits the swap table idea very well.
Link: https://lkml.kernel.org/r/20250916160100.31545-15-ryncsn@gmail.com
Co-developed-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/swap_table.h')
| -rw-r--r-- | mm/swap_table.h | 37 |
1 files changed, 34 insertions, 3 deletions
diff --git a/mm/swap_table.h b/mm/swap_table.h index e1f7cc009701..52254e455304 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -2,8 +2,15 @@ #ifndef _MM_SWAP_TABLE_H #define _MM_SWAP_TABLE_H +#include <linux/rcupdate.h> +#include <linux/atomic.h> #include "swap.h" +/* A typical flat array in each cluster as swap table */ +struct swap_table { + atomic_long_t entries[SWAPFILE_CLUSTER]; +}; + /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. The swap table in each cluster is a @@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb) static inline void __swap_table_set(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); - atomic_long_set(&ci->table[off], swp_tb); + atomic_long_set(&table[off], swp_tb); } static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); /* Ordering is guaranteed by cluster lock, relax */ - return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); + return atomic_long_xchg_relaxed(&table[off], swp_tb); } static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, unsigned int off) { + atomic_long_t *table; + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); - return atomic_long_read(&ci->table[off]); + table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); + + return atomic_long_read(&table[off]); +} + +static inline unsigned long swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + atomic_long_t *table; + unsigned long swp_tb; + + rcu_read_lock(); + table = rcu_dereference(ci->table); + swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); + rcu_read_unlock(); + + return swp_tb; } #endif |
