From 7cf2b0f9611b9971d663e1fc3206eeda3b902922 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Jun 2022 07:44:31 -0700 Subject: xfs: bound maximum wait time for inodegc work Currently inodegc work can sit queued on the per-cpu queue until the workqueue is either flushed of the queue reaches a depth that triggers work queuing (and later throttling). This means that we could queue work that waits for a long time for some other event to trigger flushing. Hence instead of just queueing work at a specific depth, use a delayed work that queues the work at a bound time. We can still schedule the work immediately at a given depth, but we no long need to worry about leaving a number of items on the list that won't get processed until external events prevail. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'fs/xfs/xfs_icache.c') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5269354b1b69..786702273621 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -440,7 +440,7 @@ xfs_inodegc_queue_all( for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); } } @@ -1841,8 +1841,8 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, - work); + struct xfs_inodegc *gc = container_of(to_delayed_work(work), + struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; @@ -2014,6 +2014,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2025,19 +2026,26 @@ xfs_inodegc_queue( items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); - put_cpu_ptr(gc); - if (!xfs_is_inodegc_enabled(mp)) + /* + * We queue the work while holding the current CPU so that the work + * is scheduled to run on this CPU. + */ + if (!xfs_is_inodegc_enabled(mp)) { + put_cpu_ptr(gc); return; - - if (xfs_inodegc_want_queue_work(ip, items)) { - trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); } + if (xfs_inodegc_want_queue_work(ip, items)) + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + put_cpu_ptr(gc); + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); - flush_work(&gc->work); + flush_delayed_work(&gc->work); } } @@ -2054,7 +2062,7 @@ xfs_inodegc_cpu_dead( unsigned int count = 0; dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_work_sync(&dead_gc->work); + cancel_delayed_work_sync(&dead_gc->work); if (llist_empty(&dead_gc->list)) return; @@ -2073,12 +2081,12 @@ xfs_inodegc_cpu_dead( llist_add_batch(first, last, &gc->list); count += READ_ONCE(gc->items); WRITE_ONCE(gc->items, count); - put_cpu_ptr(gc); if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); } + put_cpu_ptr(gc); } /* @@ -2173,7 +2181,7 @@ xfs_inodegc_shrinker_scan( unsigned int h = READ_ONCE(gc->shrinker_hits); WRITE_ONCE(gc->shrinker_hits, h + 1); - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); no_items = false; } } -- cgit v1.2.3 From 5e672cd69f0a534a445df4372141fd0d1d00901d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 16 Jun 2022 07:44:32 -0700 Subject: xfs: introduce xfs_inodegc_push() The current blocking mechanism for pushing the inodegc queue out to disk can result in systems becoming unusable when there is a long running inodegc operation. This is because the statfs() implementation currently issues a blocking flush of the inodegc queue and a significant number of common system utilities will call statfs() to discover something about the underlying filesystem. This can result in userspace operations getting stuck on inodegc progress, and when trying to remove a heavily reflinked file on slow storage with a full journal, this can result in delays measuring in hours. Avoid this problem by adding "push" function that expedites the flushing of the inodegc queue, but doesn't wait for it to complete. Convert xfs_fs_statfs() and xfs_qm_scall_getquota() to use this mechanism so they don't block but still ensure that queued operations are expedited. Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues") Reported-by: Chris Dunlop Signed-off-by: Dave Chinner [djwong: fix _getquota_next to use _inodegc_push too] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 20 +++++++++++++++----- fs/xfs/xfs_icache.h | 1 + fs/xfs/xfs_qm_syscalls.c | 9 ++++++--- fs/xfs/xfs_super.c | 7 +++++-- fs/xfs/xfs_trace.h | 1 + 5 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs/xfs/xfs_icache.c') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 786702273621..2609825d53ee 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1862,19 +1862,29 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately and - * wait for the work to finish. + * Expedite all pending inodegc work to run immediately. This does not wait for + * completion of the work. */ void -xfs_inodegc_flush( +xfs_inodegc_push( struct xfs_mount *mp) { if (!xfs_is_inodegc_enabled(mp)) return; + trace_xfs_inodegc_push(mp, __return_address); + xfs_inodegc_queue_all(mp); +} +/* + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - - xfs_inodegc_queue_all(mp); flush_workqueue(mp->m_inodegc_wq); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2e4cfddf8b8e..6cd180721659 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_push(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 74ac9ca9e119..392cb39cc10c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -454,9 +454,12 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; - /* Flush inodegc work at the start of a quota reporting scan. */ + /* + * Expedite pending inodegc work at the start of a quota reporting + * scan but don't block waiting for it to complete. + */ if (id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); /* * Try to get the dquot. We don't want it allocated on disk, so don't @@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next( /* Flush inodegc work at the start of a quota reporting scan. */ if (*id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 90d9c419ecc5..aa977c7ea370 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -797,8 +797,11 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - /* Wait for whatever inactivations are in progress. */ - xfs_inodegc_flush(mp); + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d32026585c1b..0fa1b7a2918c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); -- cgit v1.2.3